diff --git "a/checkpoint-10830/trainer_state.json" "b/checkpoint-10830/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10830/trainer_state.json" @@ -0,0 +1,75843 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 10830, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002770083102493075, + "grad_norm": 1.493026852607727, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.0538, + "step": 1 + }, + { + "epoch": 0.000554016620498615, + "grad_norm": 1.3199710845947266, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9806, + "step": 2 + }, + { + "epoch": 0.0008310249307479224, + "grad_norm": 1.3620953559875488, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0318, + "step": 3 + }, + { + "epoch": 0.00110803324099723, + "grad_norm": 1.4497380256652832, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.0472, + "step": 4 + }, + { + "epoch": 0.0013850415512465374, + "grad_norm": 1.3085776567459106, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9745, + "step": 5 + }, + { + "epoch": 0.0016620498614958448, + "grad_norm": 1.4101017713546753, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.0561, + "step": 6 + }, + { + "epoch": 0.0019390581717451524, + "grad_norm": 1.4515964984893799, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.0348, + "step": 7 + }, + { + "epoch": 0.00221606648199446, + "grad_norm": 1.2317475080490112, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.9423, + "step": 8 + }, + { + "epoch": 0.002493074792243767, + "grad_norm": 1.3780522346496582, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.0005, + "step": 9 + }, + { + "epoch": 0.002770083102493075, + "grad_norm": 1.424789309501648, + "learning_rate": 5.000000000000001e-07, + "loss": 1.0221, + "step": 10 + }, + { + "epoch": 0.0030470914127423824, + "grad_norm": 1.352807641029358, + "learning_rate": 5.5e-07, + "loss": 1.0366, + "step": 11 + }, + { + "epoch": 0.0033240997229916896, + "grad_norm": 1.4198808670043945, + "learning_rate": 6.000000000000001e-07, + "loss": 0.9874, + "step": 12 + }, + { + "epoch": 0.003601108033240997, + "grad_norm": 1.37282133102417, + "learning_rate": 6.5e-07, + "loss": 0.9976, + "step": 13 + }, + { + "epoch": 0.003878116343490305, + "grad_norm": 1.3420826196670532, + "learning_rate": 7.000000000000001e-07, + "loss": 1.0269, + "step": 14 + }, + { + "epoch": 0.004155124653739612, + "grad_norm": 1.3467051982879639, + "learning_rate": 7.5e-07, + "loss": 1.0208, + "step": 15 + }, + { + "epoch": 0.00443213296398892, + "grad_norm": 1.336922526359558, + "learning_rate": 8.000000000000001e-07, + "loss": 1.0465, + "step": 16 + }, + { + "epoch": 0.004709141274238227, + "grad_norm": 1.2495678663253784, + "learning_rate": 8.500000000000001e-07, + "loss": 0.981, + "step": 17 + }, + { + "epoch": 0.004986149584487534, + "grad_norm": 1.3265974521636963, + "learning_rate": 9.000000000000001e-07, + "loss": 0.949, + "step": 18 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 1.204640507698059, + "learning_rate": 9.500000000000001e-07, + "loss": 0.9917, + "step": 19 + }, + { + "epoch": 0.00554016620498615, + "grad_norm": 1.1768105030059814, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0069, + "step": 20 + }, + { + "epoch": 0.005817174515235457, + "grad_norm": 1.1372712850570679, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.9638, + "step": 21 + }, + { + "epoch": 0.006094182825484765, + "grad_norm": 1.0769680738449097, + "learning_rate": 1.1e-06, + "loss": 0.9549, + "step": 22 + }, + { + "epoch": 0.006371191135734072, + "grad_norm": 1.0383652448654175, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.9953, + "step": 23 + }, + { + "epoch": 0.006648199445983379, + "grad_norm": 0.8858652114868164, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9546, + "step": 24 + }, + { + "epoch": 0.006925207756232687, + "grad_norm": 0.7955514192581177, + "learning_rate": 1.25e-06, + "loss": 0.9369, + "step": 25 + }, + { + "epoch": 0.007202216066481994, + "grad_norm": 0.7989655137062073, + "learning_rate": 1.3e-06, + "loss": 0.9543, + "step": 26 + }, + { + "epoch": 0.007479224376731302, + "grad_norm": 0.7056751251220703, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.9063, + "step": 27 + }, + { + "epoch": 0.00775623268698061, + "grad_norm": 0.6925091743469238, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.9554, + "step": 28 + }, + { + "epoch": 0.008033240997229917, + "grad_norm": 0.7046273350715637, + "learning_rate": 1.45e-06, + "loss": 0.954, + "step": 29 + }, + { + "epoch": 0.008310249307479225, + "grad_norm": 0.7233896255493164, + "learning_rate": 1.5e-06, + "loss": 1.0106, + "step": 30 + }, + { + "epoch": 0.008587257617728532, + "grad_norm": 0.6680523157119751, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.9682, + "step": 31 + }, + { + "epoch": 0.00886426592797784, + "grad_norm": 0.624336302280426, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.9258, + "step": 32 + }, + { + "epoch": 0.009141274238227148, + "grad_norm": 0.6681525111198425, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.9115, + "step": 33 + }, + { + "epoch": 0.009418282548476454, + "grad_norm": 0.641791820526123, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.9337, + "step": 34 + }, + { + "epoch": 0.009695290858725761, + "grad_norm": 0.6529171466827393, + "learning_rate": 1.75e-06, + "loss": 0.9515, + "step": 35 + }, + { + "epoch": 0.009972299168975069, + "grad_norm": 0.5708187222480774, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.9184, + "step": 36 + }, + { + "epoch": 0.010249307479224376, + "grad_norm": 0.5597304105758667, + "learning_rate": 1.85e-06, + "loss": 0.9142, + "step": 37 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 0.5888805389404297, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.9374, + "step": 38 + }, + { + "epoch": 0.010803324099722992, + "grad_norm": 0.5895810723304749, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.968, + "step": 39 + }, + { + "epoch": 0.0110803324099723, + "grad_norm": 0.5828868746757507, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8874, + "step": 40 + }, + { + "epoch": 0.011357340720221607, + "grad_norm": 0.5907834768295288, + "learning_rate": 2.05e-06, + "loss": 0.9543, + "step": 41 + }, + { + "epoch": 0.011634349030470914, + "grad_norm": 0.5174127221107483, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.95, + "step": 42 + }, + { + "epoch": 0.011911357340720222, + "grad_norm": 0.5172901749610901, + "learning_rate": 2.15e-06, + "loss": 0.9246, + "step": 43 + }, + { + "epoch": 0.01218836565096953, + "grad_norm": 0.5019023418426514, + "learning_rate": 2.2e-06, + "loss": 0.8369, + "step": 44 + }, + { + "epoch": 0.012465373961218837, + "grad_norm": 0.5965269207954407, + "learning_rate": 2.25e-06, + "loss": 0.9166, + "step": 45 + }, + { + "epoch": 0.012742382271468145, + "grad_norm": 0.5445163249969482, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.8671, + "step": 46 + }, + { + "epoch": 0.01301939058171745, + "grad_norm": 0.5398634076118469, + "learning_rate": 2.35e-06, + "loss": 0.9209, + "step": 47 + }, + { + "epoch": 0.013296398891966758, + "grad_norm": 0.49044713377952576, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.8799, + "step": 48 + }, + { + "epoch": 0.013573407202216066, + "grad_norm": 0.5420831441879272, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.8706, + "step": 49 + }, + { + "epoch": 0.013850415512465374, + "grad_norm": 0.49241095781326294, + "learning_rate": 2.5e-06, + "loss": 0.8907, + "step": 50 + }, + { + "epoch": 0.014127423822714681, + "grad_norm": 0.4719524383544922, + "learning_rate": 2.55e-06, + "loss": 0.8677, + "step": 51 + }, + { + "epoch": 0.014404432132963989, + "grad_norm": 0.5064590573310852, + "learning_rate": 2.6e-06, + "loss": 0.9435, + "step": 52 + }, + { + "epoch": 0.014681440443213296, + "grad_norm": 0.46574175357818604, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.8723, + "step": 53 + }, + { + "epoch": 0.014958448753462604, + "grad_norm": 0.4822365641593933, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.8311, + "step": 54 + }, + { + "epoch": 0.015235457063711912, + "grad_norm": 0.4746630787849426, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.8785, + "step": 55 + }, + { + "epoch": 0.01551246537396122, + "grad_norm": 0.49930208921432495, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9016, + "step": 56 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 0.4789316952228546, + "learning_rate": 2.85e-06, + "loss": 0.8373, + "step": 57 + }, + { + "epoch": 0.016066481994459834, + "grad_norm": 0.4359954297542572, + "learning_rate": 2.9e-06, + "loss": 0.8478, + "step": 58 + }, + { + "epoch": 0.016343490304709142, + "grad_norm": 0.4391110837459564, + "learning_rate": 2.95e-06, + "loss": 0.8584, + "step": 59 + }, + { + "epoch": 0.01662049861495845, + "grad_norm": 0.42015522718429565, + "learning_rate": 3e-06, + "loss": 0.8128, + "step": 60 + }, + { + "epoch": 0.016897506925207757, + "grad_norm": 0.4458240866661072, + "learning_rate": 3.05e-06, + "loss": 0.9213, + "step": 61 + }, + { + "epoch": 0.017174515235457065, + "grad_norm": 0.43013569712638855, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.8891, + "step": 62 + }, + { + "epoch": 0.017451523545706372, + "grad_norm": 0.4295575022697449, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.8786, + "step": 63 + }, + { + "epoch": 0.01772853185595568, + "grad_norm": 0.4423756003379822, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8812, + "step": 64 + }, + { + "epoch": 0.018005540166204988, + "grad_norm": 0.4237767457962036, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.8272, + "step": 65 + }, + { + "epoch": 0.018282548476454295, + "grad_norm": 0.41085582971572876, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.8848, + "step": 66 + }, + { + "epoch": 0.0185595567867036, + "grad_norm": 0.4319153130054474, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.874, + "step": 67 + }, + { + "epoch": 0.018836565096952907, + "grad_norm": 0.44288167357444763, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.8799, + "step": 68 + }, + { + "epoch": 0.019113573407202215, + "grad_norm": 0.4345931112766266, + "learning_rate": 3.45e-06, + "loss": 0.9089, + "step": 69 + }, + { + "epoch": 0.019390581717451522, + "grad_norm": 0.41736096143722534, + "learning_rate": 3.5e-06, + "loss": 0.8675, + "step": 70 + }, + { + "epoch": 0.01966759002770083, + "grad_norm": 0.4338845908641815, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8776, + "step": 71 + }, + { + "epoch": 0.019944598337950138, + "grad_norm": 0.4050421118736267, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8871, + "step": 72 + }, + { + "epoch": 0.020221606648199445, + "grad_norm": 0.40696731209754944, + "learning_rate": 3.65e-06, + "loss": 0.8587, + "step": 73 + }, + { + "epoch": 0.020498614958448753, + "grad_norm": 0.40157297253608704, + "learning_rate": 3.7e-06, + "loss": 0.832, + "step": 74 + }, + { + "epoch": 0.02077562326869806, + "grad_norm": 0.42174622416496277, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9153, + "step": 75 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 0.44821983575820923, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8115, + "step": 76 + }, + { + "epoch": 0.021329639889196676, + "grad_norm": 0.4175020158290863, + "learning_rate": 3.85e-06, + "loss": 0.8775, + "step": 77 + }, + { + "epoch": 0.021606648199445983, + "grad_norm": 0.3963990807533264, + "learning_rate": 3.900000000000001e-06, + "loss": 0.823, + "step": 78 + }, + { + "epoch": 0.02188365650969529, + "grad_norm": 0.4085932672023773, + "learning_rate": 3.95e-06, + "loss": 0.8255, + "step": 79 + }, + { + "epoch": 0.0221606648199446, + "grad_norm": 0.39907416701316833, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8611, + "step": 80 + }, + { + "epoch": 0.022437673130193906, + "grad_norm": 0.41605669260025024, + "learning_rate": 4.05e-06, + "loss": 0.8628, + "step": 81 + }, + { + "epoch": 0.022714681440443214, + "grad_norm": 0.41773635149002075, + "learning_rate": 4.1e-06, + "loss": 0.8363, + "step": 82 + }, + { + "epoch": 0.02299168975069252, + "grad_norm": 0.4385448694229126, + "learning_rate": 4.15e-06, + "loss": 0.8849, + "step": 83 + }, + { + "epoch": 0.02326869806094183, + "grad_norm": 0.4114312529563904, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8031, + "step": 84 + }, + { + "epoch": 0.023545706371191136, + "grad_norm": 0.41531598567962646, + "learning_rate": 4.25e-06, + "loss": 0.8371, + "step": 85 + }, + { + "epoch": 0.023822714681440444, + "grad_norm": 0.4031124413013458, + "learning_rate": 4.3e-06, + "loss": 0.8606, + "step": 86 + }, + { + "epoch": 0.02409972299168975, + "grad_norm": 0.40101325511932373, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8568, + "step": 87 + }, + { + "epoch": 0.02437673130193906, + "grad_norm": 0.4076603949069977, + "learning_rate": 4.4e-06, + "loss": 0.877, + "step": 88 + }, + { + "epoch": 0.024653739612188367, + "grad_norm": 0.4312387704849243, + "learning_rate": 4.450000000000001e-06, + "loss": 0.8396, + "step": 89 + }, + { + "epoch": 0.024930747922437674, + "grad_norm": 0.38493359088897705, + "learning_rate": 4.5e-06, + "loss": 0.7502, + "step": 90 + }, + { + "epoch": 0.025207756232686982, + "grad_norm": 0.4111658036708832, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7986, + "step": 91 + }, + { + "epoch": 0.02548476454293629, + "grad_norm": 0.40969961881637573, + "learning_rate": 4.600000000000001e-06, + "loss": 0.8424, + "step": 92 + }, + { + "epoch": 0.025761772853185594, + "grad_norm": 0.3882950246334076, + "learning_rate": 4.65e-06, + "loss": 0.7825, + "step": 93 + }, + { + "epoch": 0.0260387811634349, + "grad_norm": 0.4332575798034668, + "learning_rate": 4.7e-06, + "loss": 0.824, + "step": 94 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 0.4235541820526123, + "learning_rate": 4.75e-06, + "loss": 0.8532, + "step": 95 + }, + { + "epoch": 0.026592797783933517, + "grad_norm": 0.408812940120697, + "learning_rate": 4.800000000000001e-06, + "loss": 0.8803, + "step": 96 + }, + { + "epoch": 0.026869806094182824, + "grad_norm": 0.40831273794174194, + "learning_rate": 4.85e-06, + "loss": 0.8272, + "step": 97 + }, + { + "epoch": 0.027146814404432132, + "grad_norm": 0.4236239492893219, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.841, + "step": 98 + }, + { + "epoch": 0.02742382271468144, + "grad_norm": 0.40624263882637024, + "learning_rate": 4.95e-06, + "loss": 0.8278, + "step": 99 + }, + { + "epoch": 0.027700831024930747, + "grad_norm": 0.41792380809783936, + "learning_rate": 5e-06, + "loss": 0.842, + "step": 100 + }, + { + "epoch": 0.027977839335180055, + "grad_norm": 0.4306834936141968, + "learning_rate": 4.999999973459309e-06, + "loss": 0.8569, + "step": 101 + }, + { + "epoch": 0.028254847645429362, + "grad_norm": 0.40494856238365173, + "learning_rate": 4.999999893837233e-06, + "loss": 0.818, + "step": 102 + }, + { + "epoch": 0.02853185595567867, + "grad_norm": 0.3964189291000366, + "learning_rate": 4.999999761133774e-06, + "loss": 0.8353, + "step": 103 + }, + { + "epoch": 0.028808864265927978, + "grad_norm": 0.4005384147167206, + "learning_rate": 4.999999575348937e-06, + "loss": 0.8338, + "step": 104 + }, + { + "epoch": 0.029085872576177285, + "grad_norm": 0.41835519671440125, + "learning_rate": 4.999999336482725e-06, + "loss": 0.8211, + "step": 105 + }, + { + "epoch": 0.029362880886426593, + "grad_norm": 0.4398205280303955, + "learning_rate": 4.999999044535142e-06, + "loss": 0.8708, + "step": 106 + }, + { + "epoch": 0.0296398891966759, + "grad_norm": 0.4256204664707184, + "learning_rate": 4.999998699506195e-06, + "loss": 0.8362, + "step": 107 + }, + { + "epoch": 0.029916897506925208, + "grad_norm": 0.4092758893966675, + "learning_rate": 4.999998301395892e-06, + "loss": 0.793, + "step": 108 + }, + { + "epoch": 0.030193905817174516, + "grad_norm": 0.4022393822669983, + "learning_rate": 4.99999785020424e-06, + "loss": 0.855, + "step": 109 + }, + { + "epoch": 0.030470914127423823, + "grad_norm": 0.39198121428489685, + "learning_rate": 4.9999973459312505e-06, + "loss": 0.781, + "step": 110 + }, + { + "epoch": 0.03074792243767313, + "grad_norm": 0.40529149770736694, + "learning_rate": 4.999996788576932e-06, + "loss": 0.8363, + "step": 111 + }, + { + "epoch": 0.03102493074792244, + "grad_norm": 0.4149932563304901, + "learning_rate": 4.999996178141298e-06, + "loss": 0.8145, + "step": 112 + }, + { + "epoch": 0.031301939058171746, + "grad_norm": 0.4344826340675354, + "learning_rate": 4.99999551462436e-06, + "loss": 0.8545, + "step": 113 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 0.41700538992881775, + "learning_rate": 4.999994798026133e-06, + "loss": 0.8152, + "step": 114 + }, + { + "epoch": 0.03185595567867036, + "grad_norm": 0.4254540205001831, + "learning_rate": 4.999994028346633e-06, + "loss": 0.8452, + "step": 115 + }, + { + "epoch": 0.03213296398891967, + "grad_norm": 0.4131767451763153, + "learning_rate": 4.999993205585875e-06, + "loss": 0.8019, + "step": 116 + }, + { + "epoch": 0.032409972299168976, + "grad_norm": 0.4246065616607666, + "learning_rate": 4.999992329743877e-06, + "loss": 0.8517, + "step": 117 + }, + { + "epoch": 0.032686980609418284, + "grad_norm": 0.3999440670013428, + "learning_rate": 4.999991400820658e-06, + "loss": 0.7985, + "step": 118 + }, + { + "epoch": 0.03296398891966759, + "grad_norm": 0.41130325198173523, + "learning_rate": 4.999990418816236e-06, + "loss": 0.8455, + "step": 119 + }, + { + "epoch": 0.0332409972299169, + "grad_norm": 0.40182363986968994, + "learning_rate": 4.999989383730634e-06, + "loss": 0.8081, + "step": 120 + }, + { + "epoch": 0.03351800554016621, + "grad_norm": 0.4479239583015442, + "learning_rate": 4.999988295563873e-06, + "loss": 0.8775, + "step": 121 + }, + { + "epoch": 0.033795013850415515, + "grad_norm": 0.41858404874801636, + "learning_rate": 4.999987154315977e-06, + "loss": 0.7986, + "step": 122 + }, + { + "epoch": 0.03407202216066482, + "grad_norm": 0.4002572000026703, + "learning_rate": 4.999985959986968e-06, + "loss": 0.7207, + "step": 123 + }, + { + "epoch": 0.03434903047091413, + "grad_norm": 0.40468156337738037, + "learning_rate": 4.999984712576874e-06, + "loss": 0.7843, + "step": 124 + }, + { + "epoch": 0.03462603878116344, + "grad_norm": 0.42578715085983276, + "learning_rate": 4.99998341208572e-06, + "loss": 0.761, + "step": 125 + }, + { + "epoch": 0.034903047091412745, + "grad_norm": 0.4391866624355316, + "learning_rate": 4.9999820585135335e-06, + "loss": 0.8479, + "step": 126 + }, + { + "epoch": 0.03518005540166205, + "grad_norm": 0.4086853861808777, + "learning_rate": 4.999980651860345e-06, + "loss": 0.7987, + "step": 127 + }, + { + "epoch": 0.03545706371191136, + "grad_norm": 0.4147675335407257, + "learning_rate": 4.999979192126181e-06, + "loss": 0.8087, + "step": 128 + }, + { + "epoch": 0.03573407202216067, + "grad_norm": 0.40620920062065125, + "learning_rate": 4.999977679311076e-06, + "loss": 0.8191, + "step": 129 + }, + { + "epoch": 0.036011080332409975, + "grad_norm": 0.41266223788261414, + "learning_rate": 4.999976113415059e-06, + "loss": 0.8403, + "step": 130 + }, + { + "epoch": 0.03628808864265928, + "grad_norm": 0.4243127405643463, + "learning_rate": 4.999974494438166e-06, + "loss": 0.8207, + "step": 131 + }, + { + "epoch": 0.03656509695290859, + "grad_norm": 0.4049611985683441, + "learning_rate": 4.99997282238043e-06, + "loss": 0.7541, + "step": 132 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 0.43927672505378723, + "learning_rate": 4.999971097241887e-06, + "loss": 0.8645, + "step": 133 + }, + { + "epoch": 0.0371191135734072, + "grad_norm": 0.4410925805568695, + "learning_rate": 4.999969319022573e-06, + "loss": 0.8064, + "step": 134 + }, + { + "epoch": 0.037396121883656507, + "grad_norm": 0.42407283186912537, + "learning_rate": 4.999967487722527e-06, + "loss": 0.7815, + "step": 135 + }, + { + "epoch": 0.037673130193905814, + "grad_norm": 0.4339797794818878, + "learning_rate": 4.9999656033417865e-06, + "loss": 0.7834, + "step": 136 + }, + { + "epoch": 0.03795013850415512, + "grad_norm": 0.4262239634990692, + "learning_rate": 4.999963665880392e-06, + "loss": 0.8049, + "step": 137 + }, + { + "epoch": 0.03822714681440443, + "grad_norm": 0.41159674525260925, + "learning_rate": 4.999961675338384e-06, + "loss": 0.8195, + "step": 138 + }, + { + "epoch": 0.03850415512465374, + "grad_norm": 0.39753422141075134, + "learning_rate": 4.999959631715806e-06, + "loss": 0.7023, + "step": 139 + }, + { + "epoch": 0.038781163434903045, + "grad_norm": 0.4160650968551636, + "learning_rate": 4.999957535012701e-06, + "loss": 0.7488, + "step": 140 + }, + { + "epoch": 0.03905817174515235, + "grad_norm": 0.4177490174770355, + "learning_rate": 4.999955385229113e-06, + "loss": 0.8098, + "step": 141 + }, + { + "epoch": 0.03933518005540166, + "grad_norm": 0.441594660282135, + "learning_rate": 4.999953182365088e-06, + "loss": 0.825, + "step": 142 + }, + { + "epoch": 0.03961218836565097, + "grad_norm": 0.44137829542160034, + "learning_rate": 4.999950926420673e-06, + "loss": 0.7645, + "step": 143 + }, + { + "epoch": 0.039889196675900275, + "grad_norm": 0.44875359535217285, + "learning_rate": 4.999948617395916e-06, + "loss": 0.8157, + "step": 144 + }, + { + "epoch": 0.04016620498614958, + "grad_norm": 0.4129585325717926, + "learning_rate": 4.999946255290865e-06, + "loss": 0.8188, + "step": 145 + }, + { + "epoch": 0.04044321329639889, + "grad_norm": 0.4289536476135254, + "learning_rate": 4.999943840105571e-06, + "loss": 0.7128, + "step": 146 + }, + { + "epoch": 0.0407202216066482, + "grad_norm": 0.4285975992679596, + "learning_rate": 4.999941371840084e-06, + "loss": 0.7924, + "step": 147 + }, + { + "epoch": 0.040997229916897505, + "grad_norm": 0.4204922318458557, + "learning_rate": 4.99993885049446e-06, + "loss": 0.7936, + "step": 148 + }, + { + "epoch": 0.04127423822714681, + "grad_norm": 0.43389734625816345, + "learning_rate": 4.999936276068749e-06, + "loss": 0.8296, + "step": 149 + }, + { + "epoch": 0.04155124653739612, + "grad_norm": 0.45425310730934143, + "learning_rate": 4.999933648563005e-06, + "loss": 0.8428, + "step": 150 + }, + { + "epoch": 0.04182825484764543, + "grad_norm": 0.44408005475997925, + "learning_rate": 4.9999309679772875e-06, + "loss": 0.8612, + "step": 151 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 0.44268134236335754, + "learning_rate": 4.999928234311651e-06, + "loss": 0.8082, + "step": 152 + }, + { + "epoch": 0.042382271468144044, + "grad_norm": 0.43825775384902954, + "learning_rate": 4.999925447566154e-06, + "loss": 0.8004, + "step": 153 + }, + { + "epoch": 0.04265927977839335, + "grad_norm": 0.42596134543418884, + "learning_rate": 4.999922607740855e-06, + "loss": 0.833, + "step": 154 + }, + { + "epoch": 0.04293628808864266, + "grad_norm": 0.42418310046195984, + "learning_rate": 4.999919714835816e-06, + "loss": 0.8072, + "step": 155 + }, + { + "epoch": 0.043213296398891966, + "grad_norm": 0.4111458361148834, + "learning_rate": 4.999916768851096e-06, + "loss": 0.7352, + "step": 156 + }, + { + "epoch": 0.043490304709141274, + "grad_norm": 0.4288739860057831, + "learning_rate": 4.99991376978676e-06, + "loss": 0.8314, + "step": 157 + }, + { + "epoch": 0.04376731301939058, + "grad_norm": 0.4291544556617737, + "learning_rate": 4.999910717642871e-06, + "loss": 0.8112, + "step": 158 + }, + { + "epoch": 0.04404432132963989, + "grad_norm": 0.42698702216148376, + "learning_rate": 4.9999076124194925e-06, + "loss": 0.7727, + "step": 159 + }, + { + "epoch": 0.0443213296398892, + "grad_norm": 0.3970215320587158, + "learning_rate": 4.999904454116691e-06, + "loss": 0.753, + "step": 160 + }, + { + "epoch": 0.044598337950138504, + "grad_norm": 0.4505182206630707, + "learning_rate": 4.9999012427345345e-06, + "loss": 0.8088, + "step": 161 + }, + { + "epoch": 0.04487534626038781, + "grad_norm": 0.4358536899089813, + "learning_rate": 4.9998979782730905e-06, + "loss": 0.8246, + "step": 162 + }, + { + "epoch": 0.04515235457063712, + "grad_norm": 0.4245203733444214, + "learning_rate": 4.999894660732428e-06, + "loss": 0.7818, + "step": 163 + }, + { + "epoch": 0.04542936288088643, + "grad_norm": 0.4643189609050751, + "learning_rate": 4.999891290112619e-06, + "loss": 0.8118, + "step": 164 + }, + { + "epoch": 0.045706371191135735, + "grad_norm": 0.4445042312145233, + "learning_rate": 4.999887866413732e-06, + "loss": 0.8134, + "step": 165 + }, + { + "epoch": 0.04598337950138504, + "grad_norm": 0.47488442063331604, + "learning_rate": 4.999884389635843e-06, + "loss": 0.8216, + "step": 166 + }, + { + "epoch": 0.04626038781163435, + "grad_norm": 0.44003528356552124, + "learning_rate": 4.999880859779024e-06, + "loss": 0.8301, + "step": 167 + }, + { + "epoch": 0.04653739612188366, + "grad_norm": 0.45998671650886536, + "learning_rate": 4.99987727684335e-06, + "loss": 0.7673, + "step": 168 + }, + { + "epoch": 0.046814404432132965, + "grad_norm": 0.43222859501838684, + "learning_rate": 4.999873640828897e-06, + "loss": 0.8208, + "step": 169 + }, + { + "epoch": 0.04709141274238227, + "grad_norm": 0.45207300782203674, + "learning_rate": 4.999869951735744e-06, + "loss": 0.8097, + "step": 170 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 0.4396827220916748, + "learning_rate": 4.999866209563967e-06, + "loss": 0.8436, + "step": 171 + }, + { + "epoch": 0.04764542936288089, + "grad_norm": 0.4444088935852051, + "learning_rate": 4.999862414313647e-06, + "loss": 0.822, + "step": 172 + }, + { + "epoch": 0.047922437673130196, + "grad_norm": 0.4522409439086914, + "learning_rate": 4.999858565984864e-06, + "loss": 0.8165, + "step": 173 + }, + { + "epoch": 0.0481994459833795, + "grad_norm": 0.42866262793540955, + "learning_rate": 4.999854664577699e-06, + "loss": 0.7353, + "step": 174 + }, + { + "epoch": 0.04847645429362881, + "grad_norm": 0.43448740243911743, + "learning_rate": 4.999850710092236e-06, + "loss": 0.7896, + "step": 175 + }, + { + "epoch": 0.04875346260387812, + "grad_norm": 0.42707017064094543, + "learning_rate": 4.999846702528559e-06, + "loss": 0.7476, + "step": 176 + }, + { + "epoch": 0.049030470914127426, + "grad_norm": 0.4463941156864166, + "learning_rate": 4.999842641886752e-06, + "loss": 0.88, + "step": 177 + }, + { + "epoch": 0.049307479224376734, + "grad_norm": 0.44734442234039307, + "learning_rate": 4.999838528166902e-06, + "loss": 0.7961, + "step": 178 + }, + { + "epoch": 0.04958448753462604, + "grad_norm": 0.4721640944480896, + "learning_rate": 4.999834361369096e-06, + "loss": 0.8119, + "step": 179 + }, + { + "epoch": 0.04986149584487535, + "grad_norm": 0.4124251902103424, + "learning_rate": 4.999830141493423e-06, + "loss": 0.7843, + "step": 180 + }, + { + "epoch": 0.05013850415512466, + "grad_norm": 0.4647778868675232, + "learning_rate": 4.999825868539971e-06, + "loss": 0.7736, + "step": 181 + }, + { + "epoch": 0.050415512465373964, + "grad_norm": 0.4526465833187103, + "learning_rate": 4.999821542508833e-06, + "loss": 0.7312, + "step": 182 + }, + { + "epoch": 0.05069252077562327, + "grad_norm": 0.4479784369468689, + "learning_rate": 4.9998171634001e-06, + "loss": 0.768, + "step": 183 + }, + { + "epoch": 0.05096952908587258, + "grad_norm": 0.4574298560619354, + "learning_rate": 4.9998127312138645e-06, + "loss": 0.8163, + "step": 184 + }, + { + "epoch": 0.05124653739612189, + "grad_norm": 0.470766544342041, + "learning_rate": 4.99980824595022e-06, + "loss": 0.7886, + "step": 185 + }, + { + "epoch": 0.05152354570637119, + "grad_norm": 0.46268603205680847, + "learning_rate": 4.999803707609264e-06, + "loss": 0.7692, + "step": 186 + }, + { + "epoch": 0.051800554016620495, + "grad_norm": 0.45423924922943115, + "learning_rate": 4.999799116191091e-06, + "loss": 0.8334, + "step": 187 + }, + { + "epoch": 0.0520775623268698, + "grad_norm": 0.4589080512523651, + "learning_rate": 4.9997944716957985e-06, + "loss": 0.7835, + "step": 188 + }, + { + "epoch": 0.05235457063711911, + "grad_norm": 0.4934840202331543, + "learning_rate": 4.999789774123486e-06, + "loss": 0.7404, + "step": 189 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.4710850119590759, + "learning_rate": 4.999785023474253e-06, + "loss": 0.8659, + "step": 190 + }, + { + "epoch": 0.052908587257617726, + "grad_norm": 0.44491663575172424, + "learning_rate": 4.9997802197482e-06, + "loss": 0.8241, + "step": 191 + }, + { + "epoch": 0.05318559556786703, + "grad_norm": 0.43371081352233887, + "learning_rate": 4.9997753629454295e-06, + "loss": 0.8145, + "step": 192 + }, + { + "epoch": 0.05346260387811634, + "grad_norm": 0.4385470151901245, + "learning_rate": 4.999770453066043e-06, + "loss": 0.7779, + "step": 193 + }, + { + "epoch": 0.05373961218836565, + "grad_norm": 0.46287888288497925, + "learning_rate": 4.999765490110148e-06, + "loss": 0.7731, + "step": 194 + }, + { + "epoch": 0.054016620498614956, + "grad_norm": 0.4724902808666229, + "learning_rate": 4.999760474077847e-06, + "loss": 0.8538, + "step": 195 + }, + { + "epoch": 0.054293628808864264, + "grad_norm": 0.4405905604362488, + "learning_rate": 4.999755404969247e-06, + "loss": 0.7764, + "step": 196 + }, + { + "epoch": 0.05457063711911357, + "grad_norm": 0.45440441370010376, + "learning_rate": 4.9997502827844565e-06, + "loss": 0.7488, + "step": 197 + }, + { + "epoch": 0.05484764542936288, + "grad_norm": 0.4569130539894104, + "learning_rate": 4.999745107523583e-06, + "loss": 0.7604, + "step": 198 + }, + { + "epoch": 0.05512465373961219, + "grad_norm": 0.44485560059547424, + "learning_rate": 4.999739879186739e-06, + "loss": 0.7815, + "step": 199 + }, + { + "epoch": 0.055401662049861494, + "grad_norm": 0.46380650997161865, + "learning_rate": 4.999734597774033e-06, + "loss": 0.8247, + "step": 200 + }, + { + "epoch": 0.0556786703601108, + "grad_norm": 0.43232858180999756, + "learning_rate": 4.999729263285578e-06, + "loss": 0.7638, + "step": 201 + }, + { + "epoch": 0.05595567867036011, + "grad_norm": 0.4762643277645111, + "learning_rate": 4.999723875721486e-06, + "loss": 0.8308, + "step": 202 + }, + { + "epoch": 0.05623268698060942, + "grad_norm": 0.4557073414325714, + "learning_rate": 4.999718435081873e-06, + "loss": 0.7677, + "step": 203 + }, + { + "epoch": 0.056509695290858725, + "grad_norm": 0.4623831808567047, + "learning_rate": 4.999712941366854e-06, + "loss": 0.7769, + "step": 204 + }, + { + "epoch": 0.05678670360110803, + "grad_norm": 0.4392584264278412, + "learning_rate": 4.999707394576547e-06, + "loss": 0.7662, + "step": 205 + }, + { + "epoch": 0.05706371191135734, + "grad_norm": 0.46204671263694763, + "learning_rate": 4.999701794711067e-06, + "loss": 0.7529, + "step": 206 + }, + { + "epoch": 0.05734072022160665, + "grad_norm": 0.4556841254234314, + "learning_rate": 4.999696141770535e-06, + "loss": 0.7985, + "step": 207 + }, + { + "epoch": 0.057617728531855955, + "grad_norm": 0.4767592251300812, + "learning_rate": 4.9996904357550705e-06, + "loss": 0.7712, + "step": 208 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 0.4875626266002655, + "learning_rate": 4.9996846766647935e-06, + "loss": 0.8396, + "step": 209 + }, + { + "epoch": 0.05817174515235457, + "grad_norm": 0.48093894124031067, + "learning_rate": 4.999678864499828e-06, + "loss": 0.7904, + "step": 210 + }, + { + "epoch": 0.05844875346260388, + "grad_norm": 0.4370277225971222, + "learning_rate": 4.999672999260297e-06, + "loss": 0.7666, + "step": 211 + }, + { + "epoch": 0.058725761772853186, + "grad_norm": 0.500248372554779, + "learning_rate": 4.9996670809463245e-06, + "loss": 0.8361, + "step": 212 + }, + { + "epoch": 0.05900277008310249, + "grad_norm": 0.47237977385520935, + "learning_rate": 4.999661109558036e-06, + "loss": 0.8191, + "step": 213 + }, + { + "epoch": 0.0592797783933518, + "grad_norm": 0.4805334210395813, + "learning_rate": 4.99965508509556e-06, + "loss": 0.8315, + "step": 214 + }, + { + "epoch": 0.05955678670360111, + "grad_norm": 0.4623453617095947, + "learning_rate": 4.999649007559022e-06, + "loss": 0.8125, + "step": 215 + }, + { + "epoch": 0.059833795013850416, + "grad_norm": 0.4652257561683655, + "learning_rate": 4.999642876948553e-06, + "loss": 0.7935, + "step": 216 + }, + { + "epoch": 0.060110803324099724, + "grad_norm": 0.47447672486305237, + "learning_rate": 4.9996366932642814e-06, + "loss": 0.8014, + "step": 217 + }, + { + "epoch": 0.06038781163434903, + "grad_norm": 0.45881807804107666, + "learning_rate": 4.99963045650634e-06, + "loss": 0.7345, + "step": 218 + }, + { + "epoch": 0.06066481994459834, + "grad_norm": 0.4540233612060547, + "learning_rate": 4.99962416667486e-06, + "loss": 0.8386, + "step": 219 + }, + { + "epoch": 0.060941828254847646, + "grad_norm": 0.461611807346344, + "learning_rate": 4.999617823769977e-06, + "loss": 0.7832, + "step": 220 + }, + { + "epoch": 0.061218836565096954, + "grad_norm": 0.4760967791080475, + "learning_rate": 4.999611427791823e-06, + "loss": 0.8167, + "step": 221 + }, + { + "epoch": 0.06149584487534626, + "grad_norm": 0.476432740688324, + "learning_rate": 4.999604978740535e-06, + "loss": 0.8037, + "step": 222 + }, + { + "epoch": 0.06177285318559557, + "grad_norm": 0.4600237309932709, + "learning_rate": 4.999598476616251e-06, + "loss": 0.7826, + "step": 223 + }, + { + "epoch": 0.06204986149584488, + "grad_norm": 0.45525020360946655, + "learning_rate": 4.999591921419107e-06, + "loss": 0.7856, + "step": 224 + }, + { + "epoch": 0.062326869806094184, + "grad_norm": 0.47345778346061707, + "learning_rate": 4.999585313149244e-06, + "loss": 0.755, + "step": 225 + }, + { + "epoch": 0.06260387811634349, + "grad_norm": 0.46238973736763, + "learning_rate": 4.999578651806801e-06, + "loss": 0.8153, + "step": 226 + }, + { + "epoch": 0.0628808864265928, + "grad_norm": 0.48064154386520386, + "learning_rate": 4.99957193739192e-06, + "loss": 0.7965, + "step": 227 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 0.5076362490653992, + "learning_rate": 4.9995651699047436e-06, + "loss": 0.8075, + "step": 228 + }, + { + "epoch": 0.06343490304709141, + "grad_norm": 0.4526555836200714, + "learning_rate": 4.999558349345416e-06, + "loss": 0.8074, + "step": 229 + }, + { + "epoch": 0.06371191135734072, + "grad_norm": 0.4602035880088806, + "learning_rate": 4.999551475714081e-06, + "loss": 0.7598, + "step": 230 + }, + { + "epoch": 0.06398891966759003, + "grad_norm": 0.46630504727363586, + "learning_rate": 4.999544549010885e-06, + "loss": 0.7549, + "step": 231 + }, + { + "epoch": 0.06426592797783934, + "grad_norm": 0.46300217509269714, + "learning_rate": 4.999537569235975e-06, + "loss": 0.7216, + "step": 232 + }, + { + "epoch": 0.06454293628808865, + "grad_norm": 0.46852821111679077, + "learning_rate": 4.9995305363895e-06, + "loss": 0.7966, + "step": 233 + }, + { + "epoch": 0.06481994459833795, + "grad_norm": 0.49596261978149414, + "learning_rate": 4.999523450471609e-06, + "loss": 0.8022, + "step": 234 + }, + { + "epoch": 0.06509695290858726, + "grad_norm": 0.48228102922439575, + "learning_rate": 4.99951631148245e-06, + "loss": 0.7834, + "step": 235 + }, + { + "epoch": 0.06537396121883657, + "grad_norm": 0.46538177132606506, + "learning_rate": 4.999509119422178e-06, + "loss": 0.7692, + "step": 236 + }, + { + "epoch": 0.06565096952908588, + "grad_norm": 0.4842844009399414, + "learning_rate": 4.999501874290944e-06, + "loss": 0.7869, + "step": 237 + }, + { + "epoch": 0.06592797783933518, + "grad_norm": 0.4862155616283417, + "learning_rate": 4.999494576088903e-06, + "loss": 0.8026, + "step": 238 + }, + { + "epoch": 0.06620498614958449, + "grad_norm": 0.4713565409183502, + "learning_rate": 4.999487224816208e-06, + "loss": 0.7665, + "step": 239 + }, + { + "epoch": 0.0664819944598338, + "grad_norm": 0.4564686715602875, + "learning_rate": 4.9994798204730165e-06, + "loss": 0.791, + "step": 240 + }, + { + "epoch": 0.0667590027700831, + "grad_norm": 0.4710893929004669, + "learning_rate": 4.999472363059485e-06, + "loss": 0.7406, + "step": 241 + }, + { + "epoch": 0.06703601108033241, + "grad_norm": 0.4848516583442688, + "learning_rate": 4.999464852575774e-06, + "loss": 0.8032, + "step": 242 + }, + { + "epoch": 0.06731301939058172, + "grad_norm": 0.4644072651863098, + "learning_rate": 4.9994572890220395e-06, + "loss": 0.7975, + "step": 243 + }, + { + "epoch": 0.06759002770083103, + "grad_norm": 0.46789732575416565, + "learning_rate": 4.999449672398445e-06, + "loss": 0.7652, + "step": 244 + }, + { + "epoch": 0.06786703601108034, + "grad_norm": 0.4857868552207947, + "learning_rate": 4.99944200270515e-06, + "loss": 0.7853, + "step": 245 + }, + { + "epoch": 0.06814404432132964, + "grad_norm": 0.4426226019859314, + "learning_rate": 4.999434279942319e-06, + "loss": 0.6664, + "step": 246 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 0.4759196937084198, + "learning_rate": 4.9994265041101155e-06, + "loss": 0.8032, + "step": 247 + }, + { + "epoch": 0.06869806094182826, + "grad_norm": 0.4856152832508087, + "learning_rate": 4.999418675208704e-06, + "loss": 0.7578, + "step": 248 + }, + { + "epoch": 0.06897506925207757, + "grad_norm": 0.4679270088672638, + "learning_rate": 4.999410793238252e-06, + "loss": 0.7618, + "step": 249 + }, + { + "epoch": 0.06925207756232687, + "grad_norm": 0.48782986402511597, + "learning_rate": 4.999402858198925e-06, + "loss": 0.7189, + "step": 250 + }, + { + "epoch": 0.06952908587257618, + "grad_norm": 0.44610273838043213, + "learning_rate": 4.999394870090894e-06, + "loss": 0.784, + "step": 251 + }, + { + "epoch": 0.06980609418282549, + "grad_norm": 0.4906792938709259, + "learning_rate": 4.9993868289143255e-06, + "loss": 0.7445, + "step": 252 + }, + { + "epoch": 0.0700831024930748, + "grad_norm": 0.4803594946861267, + "learning_rate": 4.9993787346693924e-06, + "loss": 0.7581, + "step": 253 + }, + { + "epoch": 0.0703601108033241, + "grad_norm": 0.47042086720466614, + "learning_rate": 4.999370587356267e-06, + "loss": 0.7409, + "step": 254 + }, + { + "epoch": 0.07063711911357341, + "grad_norm": 0.5199850797653198, + "learning_rate": 4.999362386975121e-06, + "loss": 0.7776, + "step": 255 + }, + { + "epoch": 0.07091412742382272, + "grad_norm": 0.5027449131011963, + "learning_rate": 4.999354133526127e-06, + "loss": 0.808, + "step": 256 + }, + { + "epoch": 0.07119113573407203, + "grad_norm": 0.5312409996986389, + "learning_rate": 4.999345827009464e-06, + "loss": 0.8147, + "step": 257 + }, + { + "epoch": 0.07146814404432134, + "grad_norm": 0.4875726103782654, + "learning_rate": 4.9993374674253065e-06, + "loss": 0.7487, + "step": 258 + }, + { + "epoch": 0.07174515235457064, + "grad_norm": 0.46339771151542664, + "learning_rate": 4.999329054773832e-06, + "loss": 0.6929, + "step": 259 + }, + { + "epoch": 0.07202216066481995, + "grad_norm": 0.4794660210609436, + "learning_rate": 4.999320589055218e-06, + "loss": 0.7913, + "step": 260 + }, + { + "epoch": 0.07229916897506926, + "grad_norm": 0.481033056974411, + "learning_rate": 4.999312070269647e-06, + "loss": 0.7211, + "step": 261 + }, + { + "epoch": 0.07257617728531857, + "grad_norm": 0.49464985728263855, + "learning_rate": 4.999303498417296e-06, + "loss": 0.7695, + "step": 262 + }, + { + "epoch": 0.07285318559556787, + "grad_norm": 0.5082013607025146, + "learning_rate": 4.99929487349835e-06, + "loss": 0.7431, + "step": 263 + }, + { + "epoch": 0.07313019390581718, + "grad_norm": 0.45051777362823486, + "learning_rate": 4.999286195512992e-06, + "loss": 0.7167, + "step": 264 + }, + { + "epoch": 0.07340720221606649, + "grad_norm": 0.47732335329055786, + "learning_rate": 4.999277464461405e-06, + "loss": 0.7561, + "step": 265 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 0.4845159649848938, + "learning_rate": 4.999268680343775e-06, + "loss": 0.7316, + "step": 266 + }, + { + "epoch": 0.07396121883656509, + "grad_norm": 0.522322952747345, + "learning_rate": 4.999259843160289e-06, + "loss": 0.7979, + "step": 267 + }, + { + "epoch": 0.0742382271468144, + "grad_norm": 0.4906967282295227, + "learning_rate": 4.9992509529111335e-06, + "loss": 0.7278, + "step": 268 + }, + { + "epoch": 0.0745152354570637, + "grad_norm": 0.4843719005584717, + "learning_rate": 4.999242009596498e-06, + "loss": 0.7645, + "step": 269 + }, + { + "epoch": 0.07479224376731301, + "grad_norm": 0.4734237790107727, + "learning_rate": 4.9992330132165725e-06, + "loss": 0.7667, + "step": 270 + }, + { + "epoch": 0.07506925207756232, + "grad_norm": 0.49938181042671204, + "learning_rate": 4.999223963771548e-06, + "loss": 0.825, + "step": 271 + }, + { + "epoch": 0.07534626038781163, + "grad_norm": 0.5260105729103088, + "learning_rate": 4.999214861261616e-06, + "loss": 0.776, + "step": 272 + }, + { + "epoch": 0.07562326869806094, + "grad_norm": 0.4987190365791321, + "learning_rate": 4.99920570568697e-06, + "loss": 0.7815, + "step": 273 + }, + { + "epoch": 0.07590027700831024, + "grad_norm": 0.5055657625198364, + "learning_rate": 4.9991964970478055e-06, + "loss": 0.7818, + "step": 274 + }, + { + "epoch": 0.07617728531855955, + "grad_norm": 0.49878621101379395, + "learning_rate": 4.999187235344316e-06, + "loss": 0.7492, + "step": 275 + }, + { + "epoch": 0.07645429362880886, + "grad_norm": 0.5403164625167847, + "learning_rate": 4.9991779205767e-06, + "loss": 0.8327, + "step": 276 + }, + { + "epoch": 0.07673130193905817, + "grad_norm": 0.518882155418396, + "learning_rate": 4.999168552745154e-06, + "loss": 0.7789, + "step": 277 + }, + { + "epoch": 0.07700831024930747, + "grad_norm": 0.5157930254936218, + "learning_rate": 4.999159131849878e-06, + "loss": 0.7889, + "step": 278 + }, + { + "epoch": 0.07728531855955678, + "grad_norm": 0.4988757371902466, + "learning_rate": 4.999149657891071e-06, + "loss": 0.7961, + "step": 279 + }, + { + "epoch": 0.07756232686980609, + "grad_norm": 0.5131459832191467, + "learning_rate": 4.999140130868935e-06, + "loss": 0.7952, + "step": 280 + }, + { + "epoch": 0.0778393351800554, + "grad_norm": 0.5224674344062805, + "learning_rate": 4.999130550783672e-06, + "loss": 0.8077, + "step": 281 + }, + { + "epoch": 0.0781163434903047, + "grad_norm": 0.5174897313117981, + "learning_rate": 4.999120917635485e-06, + "loss": 0.8272, + "step": 282 + }, + { + "epoch": 0.07839335180055401, + "grad_norm": 0.4849293529987335, + "learning_rate": 4.999111231424579e-06, + "loss": 0.7451, + "step": 283 + }, + { + "epoch": 0.07867036011080332, + "grad_norm": 0.4857077896595001, + "learning_rate": 4.999101492151159e-06, + "loss": 0.7637, + "step": 284 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 0.4597320854663849, + "learning_rate": 4.999091699815434e-06, + "loss": 0.7693, + "step": 285 + }, + { + "epoch": 0.07922437673130193, + "grad_norm": 0.4780363440513611, + "learning_rate": 4.999081854417608e-06, + "loss": 0.7371, + "step": 286 + }, + { + "epoch": 0.07950138504155124, + "grad_norm": 0.4885082244873047, + "learning_rate": 4.999071955957894e-06, + "loss": 0.8039, + "step": 287 + }, + { + "epoch": 0.07977839335180055, + "grad_norm": 0.5139564871788025, + "learning_rate": 4.9990620044365e-06, + "loss": 0.7936, + "step": 288 + }, + { + "epoch": 0.08005540166204986, + "grad_norm": 0.4996238350868225, + "learning_rate": 4.999051999853637e-06, + "loss": 0.7985, + "step": 289 + }, + { + "epoch": 0.08033240997229917, + "grad_norm": 0.4952002763748169, + "learning_rate": 4.9990419422095185e-06, + "loss": 0.8066, + "step": 290 + }, + { + "epoch": 0.08060941828254847, + "grad_norm": 0.5024906992912292, + "learning_rate": 4.999031831504358e-06, + "loss": 0.7374, + "step": 291 + }, + { + "epoch": 0.08088642659279778, + "grad_norm": 0.4709744453430176, + "learning_rate": 4.9990216677383695e-06, + "loss": 0.7183, + "step": 292 + }, + { + "epoch": 0.08116343490304709, + "grad_norm": 0.509341835975647, + "learning_rate": 4.99901145091177e-06, + "loss": 0.7757, + "step": 293 + }, + { + "epoch": 0.0814404432132964, + "grad_norm": 0.49265819787979126, + "learning_rate": 4.999001181024775e-06, + "loss": 0.7329, + "step": 294 + }, + { + "epoch": 0.0817174515235457, + "grad_norm": 0.5245534181594849, + "learning_rate": 4.998990858077602e-06, + "loss": 0.7452, + "step": 295 + }, + { + "epoch": 0.08199445983379501, + "grad_norm": 0.519623339176178, + "learning_rate": 4.998980482070473e-06, + "loss": 0.7812, + "step": 296 + }, + { + "epoch": 0.08227146814404432, + "grad_norm": 0.5161734819412231, + "learning_rate": 4.998970053003607e-06, + "loss": 0.7834, + "step": 297 + }, + { + "epoch": 0.08254847645429363, + "grad_norm": 0.48719218373298645, + "learning_rate": 4.998959570877224e-06, + "loss": 0.7662, + "step": 298 + }, + { + "epoch": 0.08282548476454293, + "grad_norm": 0.5277923941612244, + "learning_rate": 4.998949035691548e-06, + "loss": 0.7422, + "step": 299 + }, + { + "epoch": 0.08310249307479224, + "grad_norm": 0.49356892704963684, + "learning_rate": 4.998938447446803e-06, + "loss": 0.7372, + "step": 300 + }, + { + "epoch": 0.08337950138504155, + "grad_norm": 0.5243819952011108, + "learning_rate": 4.998927806143212e-06, + "loss": 0.7854, + "step": 301 + }, + { + "epoch": 0.08365650969529086, + "grad_norm": 0.5223947763442993, + "learning_rate": 4.998917111781003e-06, + "loss": 0.7182, + "step": 302 + }, + { + "epoch": 0.08393351800554016, + "grad_norm": 0.49004465341567993, + "learning_rate": 4.998906364360401e-06, + "loss": 0.7404, + "step": 303 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 0.502047598361969, + "learning_rate": 4.998895563881637e-06, + "loss": 0.7783, + "step": 304 + }, + { + "epoch": 0.08448753462603878, + "grad_norm": 0.4903964400291443, + "learning_rate": 4.9988847103449376e-06, + "loss": 0.7503, + "step": 305 + }, + { + "epoch": 0.08476454293628809, + "grad_norm": 0.5018570423126221, + "learning_rate": 4.998873803750534e-06, + "loss": 0.7641, + "step": 306 + }, + { + "epoch": 0.0850415512465374, + "grad_norm": 0.5024906396865845, + "learning_rate": 4.998862844098659e-06, + "loss": 0.8005, + "step": 307 + }, + { + "epoch": 0.0853185595567867, + "grad_norm": 0.5039277076721191, + "learning_rate": 4.998851831389544e-06, + "loss": 0.7907, + "step": 308 + }, + { + "epoch": 0.08559556786703601, + "grad_norm": 0.5008938312530518, + "learning_rate": 4.998840765623424e-06, + "loss": 0.774, + "step": 309 + }, + { + "epoch": 0.08587257617728532, + "grad_norm": 0.4881752133369446, + "learning_rate": 4.998829646800533e-06, + "loss": 0.6848, + "step": 310 + }, + { + "epoch": 0.08614958448753463, + "grad_norm": 0.5107519030570984, + "learning_rate": 4.998818474921107e-06, + "loss": 0.7868, + "step": 311 + }, + { + "epoch": 0.08642659279778393, + "grad_norm": 0.5051405429840088, + "learning_rate": 4.998807249985382e-06, + "loss": 0.7591, + "step": 312 + }, + { + "epoch": 0.08670360110803324, + "grad_norm": 0.5206391215324402, + "learning_rate": 4.9987959719935995e-06, + "loss": 0.7905, + "step": 313 + }, + { + "epoch": 0.08698060941828255, + "grad_norm": 0.5223249793052673, + "learning_rate": 4.998784640945997e-06, + "loss": 0.7769, + "step": 314 + }, + { + "epoch": 0.08725761772853186, + "grad_norm": 0.5234482884407043, + "learning_rate": 4.998773256842816e-06, + "loss": 0.7348, + "step": 315 + }, + { + "epoch": 0.08753462603878116, + "grad_norm": 0.5284384489059448, + "learning_rate": 4.998761819684297e-06, + "loss": 0.7819, + "step": 316 + }, + { + "epoch": 0.08781163434903047, + "grad_norm": 0.5017664432525635, + "learning_rate": 4.998750329470683e-06, + "loss": 0.7495, + "step": 317 + }, + { + "epoch": 0.08808864265927978, + "grad_norm": 0.5360183119773865, + "learning_rate": 4.998738786202219e-06, + "loss": 0.8078, + "step": 318 + }, + { + "epoch": 0.08836565096952909, + "grad_norm": 0.4968814253807068, + "learning_rate": 4.9987271898791486e-06, + "loss": 0.7568, + "step": 319 + }, + { + "epoch": 0.0886426592797784, + "grad_norm": 0.5032696723937988, + "learning_rate": 4.99871554050172e-06, + "loss": 0.7403, + "step": 320 + }, + { + "epoch": 0.0889196675900277, + "grad_norm": 0.5070774555206299, + "learning_rate": 4.998703838070179e-06, + "loss": 0.7752, + "step": 321 + }, + { + "epoch": 0.08919667590027701, + "grad_norm": 0.5313223600387573, + "learning_rate": 4.9986920825847746e-06, + "loss": 0.7908, + "step": 322 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 0.5166332125663757, + "learning_rate": 4.998680274045756e-06, + "loss": 0.7316, + "step": 323 + }, + { + "epoch": 0.08975069252077562, + "grad_norm": 0.5434812307357788, + "learning_rate": 4.998668412453375e-06, + "loss": 0.7771, + "step": 324 + }, + { + "epoch": 0.09002770083102493, + "grad_norm": 0.5094904899597168, + "learning_rate": 4.998656497807882e-06, + "loss": 0.7731, + "step": 325 + }, + { + "epoch": 0.09030470914127424, + "grad_norm": 0.5478783249855042, + "learning_rate": 4.998644530109531e-06, + "loss": 0.8333, + "step": 326 + }, + { + "epoch": 0.09058171745152355, + "grad_norm": 0.5198971629142761, + "learning_rate": 4.998632509358575e-06, + "loss": 0.7141, + "step": 327 + }, + { + "epoch": 0.09085872576177285, + "grad_norm": 0.5152432322502136, + "learning_rate": 4.998620435555271e-06, + "loss": 0.8345, + "step": 328 + }, + { + "epoch": 0.09113573407202216, + "grad_norm": 0.4936472475528717, + "learning_rate": 4.9986083086998745e-06, + "loss": 0.7765, + "step": 329 + }, + { + "epoch": 0.09141274238227147, + "grad_norm": 0.5072058439254761, + "learning_rate": 4.998596128792642e-06, + "loss": 0.7071, + "step": 330 + }, + { + "epoch": 0.09168975069252078, + "grad_norm": 0.5457636713981628, + "learning_rate": 4.998583895833834e-06, + "loss": 0.7662, + "step": 331 + }, + { + "epoch": 0.09196675900277008, + "grad_norm": 0.51166170835495, + "learning_rate": 4.998571609823708e-06, + "loss": 0.8131, + "step": 332 + }, + { + "epoch": 0.09224376731301939, + "grad_norm": 0.5171675682067871, + "learning_rate": 4.998559270762527e-06, + "loss": 0.7898, + "step": 333 + }, + { + "epoch": 0.0925207756232687, + "grad_norm": 0.5349574685096741, + "learning_rate": 4.9985468786505515e-06, + "loss": 0.7768, + "step": 334 + }, + { + "epoch": 0.09279778393351801, + "grad_norm": 0.5129192471504211, + "learning_rate": 4.9985344334880455e-06, + "loss": 0.7694, + "step": 335 + }, + { + "epoch": 0.09307479224376732, + "grad_norm": 0.5073570609092712, + "learning_rate": 4.998521935275272e-06, + "loss": 0.7798, + "step": 336 + }, + { + "epoch": 0.09335180055401662, + "grad_norm": 0.5064870119094849, + "learning_rate": 4.998509384012499e-06, + "loss": 0.7825, + "step": 337 + }, + { + "epoch": 0.09362880886426593, + "grad_norm": 0.5043659806251526, + "learning_rate": 4.99849677969999e-06, + "loss": 0.7532, + "step": 338 + }, + { + "epoch": 0.09390581717451524, + "grad_norm": 0.5378217101097107, + "learning_rate": 4.998484122338014e-06, + "loss": 0.7881, + "step": 339 + }, + { + "epoch": 0.09418282548476455, + "grad_norm": 0.48902422189712524, + "learning_rate": 4.998471411926841e-06, + "loss": 0.7362, + "step": 340 + }, + { + "epoch": 0.09445983379501385, + "grad_norm": 0.49571239948272705, + "learning_rate": 4.998458648466738e-06, + "loss": 0.7496, + "step": 341 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 0.47406813502311707, + "learning_rate": 4.9984458319579775e-06, + "loss": 0.7248, + "step": 342 + }, + { + "epoch": 0.09501385041551247, + "grad_norm": 0.5102288722991943, + "learning_rate": 4.998432962400832e-06, + "loss": 0.773, + "step": 343 + }, + { + "epoch": 0.09529085872576178, + "grad_norm": 0.5285248160362244, + "learning_rate": 4.9984200397955755e-06, + "loss": 0.7537, + "step": 344 + }, + { + "epoch": 0.09556786703601108, + "grad_norm": 0.5389485359191895, + "learning_rate": 4.99840706414248e-06, + "loss": 0.7941, + "step": 345 + }, + { + "epoch": 0.09584487534626039, + "grad_norm": 0.49833494424819946, + "learning_rate": 4.9983940354418225e-06, + "loss": 0.782, + "step": 346 + }, + { + "epoch": 0.0961218836565097, + "grad_norm": 0.5146127939224243, + "learning_rate": 4.998380953693879e-06, + "loss": 0.7467, + "step": 347 + }, + { + "epoch": 0.096398891966759, + "grad_norm": 0.5298901200294495, + "learning_rate": 4.998367818898928e-06, + "loss": 0.7928, + "step": 348 + }, + { + "epoch": 0.09667590027700831, + "grad_norm": 0.49955227971076965, + "learning_rate": 4.998354631057248e-06, + "loss": 0.7364, + "step": 349 + }, + { + "epoch": 0.09695290858725762, + "grad_norm": 0.510553777217865, + "learning_rate": 4.99834139016912e-06, + "loss": 0.7328, + "step": 350 + }, + { + "epoch": 0.09722991689750693, + "grad_norm": 0.530810534954071, + "learning_rate": 4.998328096234823e-06, + "loss": 0.7356, + "step": 351 + }, + { + "epoch": 0.09750692520775624, + "grad_norm": 0.5010079741477966, + "learning_rate": 4.998314749254641e-06, + "loss": 0.7729, + "step": 352 + }, + { + "epoch": 0.09778393351800554, + "grad_norm": 0.5428636074066162, + "learning_rate": 4.998301349228858e-06, + "loss": 0.7817, + "step": 353 + }, + { + "epoch": 0.09806094182825485, + "grad_norm": 0.530292809009552, + "learning_rate": 4.998287896157757e-06, + "loss": 0.7581, + "step": 354 + }, + { + "epoch": 0.09833795013850416, + "grad_norm": 0.4924919903278351, + "learning_rate": 4.998274390041622e-06, + "loss": 0.749, + "step": 355 + }, + { + "epoch": 0.09861495844875347, + "grad_norm": 0.5401570796966553, + "learning_rate": 4.998260830880744e-06, + "loss": 0.8132, + "step": 356 + }, + { + "epoch": 0.09889196675900278, + "grad_norm": 0.5332610011100769, + "learning_rate": 4.998247218675407e-06, + "loss": 0.7535, + "step": 357 + }, + { + "epoch": 0.09916897506925208, + "grad_norm": 0.5515699982643127, + "learning_rate": 4.998233553425903e-06, + "loss": 0.7602, + "step": 358 + }, + { + "epoch": 0.09944598337950139, + "grad_norm": 0.5187147259712219, + "learning_rate": 4.998219835132521e-06, + "loss": 0.8135, + "step": 359 + }, + { + "epoch": 0.0997229916897507, + "grad_norm": 0.537226676940918, + "learning_rate": 4.998206063795551e-06, + "loss": 0.7067, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 0.5453082919120789, + "learning_rate": 4.998192239415287e-06, + "loss": 0.7277, + "step": 361 + }, + { + "epoch": 0.10027700831024931, + "grad_norm": 0.5305824875831604, + "learning_rate": 4.998178361992022e-06, + "loss": 0.7555, + "step": 362 + }, + { + "epoch": 0.10055401662049862, + "grad_norm": 0.5143697261810303, + "learning_rate": 4.998164431526052e-06, + "loss": 0.7336, + "step": 363 + }, + { + "epoch": 0.10083102493074793, + "grad_norm": 0.48405060172080994, + "learning_rate": 4.99815044801767e-06, + "loss": 0.73, + "step": 364 + }, + { + "epoch": 0.10110803324099724, + "grad_norm": 0.537443995475769, + "learning_rate": 4.998136411467174e-06, + "loss": 0.7398, + "step": 365 + }, + { + "epoch": 0.10138504155124654, + "grad_norm": 0.563195526599884, + "learning_rate": 4.9981223218748635e-06, + "loss": 0.7933, + "step": 366 + }, + { + "epoch": 0.10166204986149585, + "grad_norm": 0.532027006149292, + "learning_rate": 4.9981081792410365e-06, + "loss": 0.7176, + "step": 367 + }, + { + "epoch": 0.10193905817174516, + "grad_norm": 0.533951997756958, + "learning_rate": 4.998093983565994e-06, + "loss": 0.7919, + "step": 368 + }, + { + "epoch": 0.10221606648199447, + "grad_norm": 0.5181298851966858, + "learning_rate": 4.998079734850036e-06, + "loss": 0.7518, + "step": 369 + }, + { + "epoch": 0.10249307479224377, + "grad_norm": 0.5276827812194824, + "learning_rate": 4.998065433093465e-06, + "loss": 0.7567, + "step": 370 + }, + { + "epoch": 0.10277008310249308, + "grad_norm": 0.5201783776283264, + "learning_rate": 4.998051078296586e-06, + "loss": 0.7873, + "step": 371 + }, + { + "epoch": 0.10304709141274238, + "grad_norm": 0.5155224204063416, + "learning_rate": 4.998036670459704e-06, + "loss": 0.7725, + "step": 372 + }, + { + "epoch": 0.10332409972299168, + "grad_norm": 0.5198693871498108, + "learning_rate": 4.998022209583124e-06, + "loss": 0.7613, + "step": 373 + }, + { + "epoch": 0.10360110803324099, + "grad_norm": 0.5042403936386108, + "learning_rate": 4.998007695667153e-06, + "loss": 0.6974, + "step": 374 + }, + { + "epoch": 0.1038781163434903, + "grad_norm": 0.5788574814796448, + "learning_rate": 4.9979931287121e-06, + "loss": 0.7692, + "step": 375 + }, + { + "epoch": 0.1041551246537396, + "grad_norm": 0.51310133934021, + "learning_rate": 4.997978508718273e-06, + "loss": 0.7307, + "step": 376 + }, + { + "epoch": 0.10443213296398891, + "grad_norm": 0.5450568199157715, + "learning_rate": 4.997963835685983e-06, + "loss": 0.7569, + "step": 377 + }, + { + "epoch": 0.10470914127423822, + "grad_norm": 0.49824953079223633, + "learning_rate": 4.997949109615542e-06, + "loss": 0.7723, + "step": 378 + }, + { + "epoch": 0.10498614958448753, + "grad_norm": 0.5454652905464172, + "learning_rate": 4.9979343305072625e-06, + "loss": 0.756, + "step": 379 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.5060559511184692, + "learning_rate": 4.997919498361458e-06, + "loss": 0.7974, + "step": 380 + }, + { + "epoch": 0.10554016620498614, + "grad_norm": 0.5230852365493774, + "learning_rate": 4.997904613178443e-06, + "loss": 0.776, + "step": 381 + }, + { + "epoch": 0.10581717451523545, + "grad_norm": 0.5657520890235901, + "learning_rate": 4.997889674958535e-06, + "loss": 0.7845, + "step": 382 + }, + { + "epoch": 0.10609418282548476, + "grad_norm": 0.5440294146537781, + "learning_rate": 4.9978746837020495e-06, + "loss": 0.7658, + "step": 383 + }, + { + "epoch": 0.10637119113573407, + "grad_norm": 0.5455194115638733, + "learning_rate": 4.997859639409306e-06, + "loss": 0.7923, + "step": 384 + }, + { + "epoch": 0.10664819944598337, + "grad_norm": 0.5205777287483215, + "learning_rate": 4.997844542080623e-06, + "loss": 0.7595, + "step": 385 + }, + { + "epoch": 0.10692520775623268, + "grad_norm": 0.5009412169456482, + "learning_rate": 4.9978293917163225e-06, + "loss": 0.7421, + "step": 386 + }, + { + "epoch": 0.10720221606648199, + "grad_norm": 0.5138939619064331, + "learning_rate": 4.997814188316725e-06, + "loss": 0.7442, + "step": 387 + }, + { + "epoch": 0.1074792243767313, + "grad_norm": 0.5283079147338867, + "learning_rate": 4.997798931882153e-06, + "loss": 0.7864, + "step": 388 + }, + { + "epoch": 0.1077562326869806, + "grad_norm": 0.522305965423584, + "learning_rate": 4.997783622412932e-06, + "loss": 0.6957, + "step": 389 + }, + { + "epoch": 0.10803324099722991, + "grad_norm": 0.5620875954627991, + "learning_rate": 4.997768259909385e-06, + "loss": 0.7834, + "step": 390 + }, + { + "epoch": 0.10831024930747922, + "grad_norm": 0.5754278898239136, + "learning_rate": 4.997752844371839e-06, + "loss": 0.7981, + "step": 391 + }, + { + "epoch": 0.10858725761772853, + "grad_norm": 0.5440781712532043, + "learning_rate": 4.997737375800623e-06, + "loss": 0.7658, + "step": 392 + }, + { + "epoch": 0.10886426592797784, + "grad_norm": 0.5162774920463562, + "learning_rate": 4.997721854196063e-06, + "loss": 0.7367, + "step": 393 + }, + { + "epoch": 0.10914127423822714, + "grad_norm": 0.5624562501907349, + "learning_rate": 4.9977062795584895e-06, + "loss": 0.7703, + "step": 394 + }, + { + "epoch": 0.10941828254847645, + "grad_norm": 0.5338923931121826, + "learning_rate": 4.997690651888234e-06, + "loss": 0.8014, + "step": 395 + }, + { + "epoch": 0.10969529085872576, + "grad_norm": 0.5490716695785522, + "learning_rate": 4.997674971185627e-06, + "loss": 0.8036, + "step": 396 + }, + { + "epoch": 0.10997229916897507, + "grad_norm": 0.4992327094078064, + "learning_rate": 4.997659237451002e-06, + "loss": 0.7433, + "step": 397 + }, + { + "epoch": 0.11024930747922437, + "grad_norm": 0.5591122508049011, + "learning_rate": 4.997643450684692e-06, + "loss": 0.756, + "step": 398 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 0.5397447347640991, + "learning_rate": 4.997627610887035e-06, + "loss": 0.7525, + "step": 399 + }, + { + "epoch": 0.11080332409972299, + "grad_norm": 0.5162704586982727, + "learning_rate": 4.997611718058365e-06, + "loss": 0.7558, + "step": 400 + }, + { + "epoch": 0.1110803324099723, + "grad_norm": 0.5250886678695679, + "learning_rate": 4.997595772199019e-06, + "loss": 0.7266, + "step": 401 + }, + { + "epoch": 0.1113573407202216, + "grad_norm": 0.5203819274902344, + "learning_rate": 4.9975797733093386e-06, + "loss": 0.6951, + "step": 402 + }, + { + "epoch": 0.11163434903047091, + "grad_norm": 0.5280457139015198, + "learning_rate": 4.997563721389661e-06, + "loss": 0.701, + "step": 403 + }, + { + "epoch": 0.11191135734072022, + "grad_norm": 0.5666073560714722, + "learning_rate": 4.997547616440327e-06, + "loss": 0.7444, + "step": 404 + }, + { + "epoch": 0.11218836565096953, + "grad_norm": 0.5710530877113342, + "learning_rate": 4.997531458461678e-06, + "loss": 0.8137, + "step": 405 + }, + { + "epoch": 0.11246537396121883, + "grad_norm": 0.5098825097084045, + "learning_rate": 4.99751524745406e-06, + "loss": 0.7014, + "step": 406 + }, + { + "epoch": 0.11274238227146814, + "grad_norm": 0.5209449529647827, + "learning_rate": 4.997498983417814e-06, + "loss": 0.768, + "step": 407 + }, + { + "epoch": 0.11301939058171745, + "grad_norm": 0.5273720622062683, + "learning_rate": 4.997482666353287e-06, + "loss": 0.7811, + "step": 408 + }, + { + "epoch": 0.11329639889196676, + "grad_norm": 0.5411760807037354, + "learning_rate": 4.997466296260826e-06, + "loss": 0.7593, + "step": 409 + }, + { + "epoch": 0.11357340720221606, + "grad_norm": 0.5214300155639648, + "learning_rate": 4.997449873140776e-06, + "loss": 0.7749, + "step": 410 + }, + { + "epoch": 0.11385041551246537, + "grad_norm": 0.5203116536140442, + "learning_rate": 4.9974333969934886e-06, + "loss": 0.7535, + "step": 411 + }, + { + "epoch": 0.11412742382271468, + "grad_norm": 0.5743964910507202, + "learning_rate": 4.997416867819312e-06, + "loss": 0.7537, + "step": 412 + }, + { + "epoch": 0.11440443213296399, + "grad_norm": 0.5519621968269348, + "learning_rate": 4.997400285618597e-06, + "loss": 0.8026, + "step": 413 + }, + { + "epoch": 0.1146814404432133, + "grad_norm": 0.5224972367286682, + "learning_rate": 4.997383650391698e-06, + "loss": 0.7503, + "step": 414 + }, + { + "epoch": 0.1149584487534626, + "grad_norm": 0.561909556388855, + "learning_rate": 4.997366962138965e-06, + "loss": 0.7419, + "step": 415 + }, + { + "epoch": 0.11523545706371191, + "grad_norm": 0.5283569097518921, + "learning_rate": 4.997350220860755e-06, + "loss": 0.6599, + "step": 416 + }, + { + "epoch": 0.11551246537396122, + "grad_norm": 0.5303184390068054, + "learning_rate": 4.997333426557422e-06, + "loss": 0.7193, + "step": 417 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 0.5405933856964111, + "learning_rate": 4.997316579229322e-06, + "loss": 0.7526, + "step": 418 + }, + { + "epoch": 0.11606648199445983, + "grad_norm": 0.5390035510063171, + "learning_rate": 4.997299678876813e-06, + "loss": 0.7556, + "step": 419 + }, + { + "epoch": 0.11634349030470914, + "grad_norm": 0.5486871600151062, + "learning_rate": 4.997282725500256e-06, + "loss": 0.7899, + "step": 420 + }, + { + "epoch": 0.11662049861495845, + "grad_norm": 0.5667344927787781, + "learning_rate": 4.997265719100009e-06, + "loss": 0.7683, + "step": 421 + }, + { + "epoch": 0.11689750692520776, + "grad_norm": 0.5894953608512878, + "learning_rate": 4.9972486596764335e-06, + "loss": 0.8083, + "step": 422 + }, + { + "epoch": 0.11717451523545706, + "grad_norm": 0.5495768785476685, + "learning_rate": 4.997231547229892e-06, + "loss": 0.7519, + "step": 423 + }, + { + "epoch": 0.11745152354570637, + "grad_norm": 0.5191949009895325, + "learning_rate": 4.997214381760746e-06, + "loss": 0.7611, + "step": 424 + }, + { + "epoch": 0.11772853185595568, + "grad_norm": 0.5151809453964233, + "learning_rate": 4.997197163269363e-06, + "loss": 0.7357, + "step": 425 + }, + { + "epoch": 0.11800554016620499, + "grad_norm": 0.5261116623878479, + "learning_rate": 4.997179891756107e-06, + "loss": 0.701, + "step": 426 + }, + { + "epoch": 0.1182825484764543, + "grad_norm": 0.5604850649833679, + "learning_rate": 4.997162567221344e-06, + "loss": 0.7442, + "step": 427 + }, + { + "epoch": 0.1185595567867036, + "grad_norm": 0.553929328918457, + "learning_rate": 4.997145189665443e-06, + "loss": 0.7543, + "step": 428 + }, + { + "epoch": 0.11883656509695291, + "grad_norm": 0.547197163105011, + "learning_rate": 4.997127759088772e-06, + "loss": 0.6987, + "step": 429 + }, + { + "epoch": 0.11911357340720222, + "grad_norm": 0.546122133731842, + "learning_rate": 4.997110275491702e-06, + "loss": 0.7578, + "step": 430 + }, + { + "epoch": 0.11939058171745152, + "grad_norm": 0.5426105260848999, + "learning_rate": 4.997092738874605e-06, + "loss": 0.7829, + "step": 431 + }, + { + "epoch": 0.11966759002770083, + "grad_norm": 0.5304821133613586, + "learning_rate": 4.9970751492378514e-06, + "loss": 0.7703, + "step": 432 + }, + { + "epoch": 0.11994459833795014, + "grad_norm": 0.5384340286254883, + "learning_rate": 4.997057506581816e-06, + "loss": 0.7285, + "step": 433 + }, + { + "epoch": 0.12022160664819945, + "grad_norm": 0.5669154524803162, + "learning_rate": 4.9970398109068716e-06, + "loss": 0.7767, + "step": 434 + }, + { + "epoch": 0.12049861495844875, + "grad_norm": 0.5595332384109497, + "learning_rate": 4.997022062213397e-06, + "loss": 0.7556, + "step": 435 + }, + { + "epoch": 0.12077562326869806, + "grad_norm": 0.5652584433555603, + "learning_rate": 4.997004260501767e-06, + "loss": 0.7491, + "step": 436 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 0.5273657441139221, + "learning_rate": 4.996986405772359e-06, + "loss": 0.7251, + "step": 437 + }, + { + "epoch": 0.12132963988919668, + "grad_norm": 0.536659836769104, + "learning_rate": 4.996968498025553e-06, + "loss": 0.712, + "step": 438 + }, + { + "epoch": 0.12160664819944599, + "grad_norm": 0.5609496235847473, + "learning_rate": 4.99695053726173e-06, + "loss": 0.7873, + "step": 439 + }, + { + "epoch": 0.12188365650969529, + "grad_norm": 0.5493842363357544, + "learning_rate": 4.9969325234812705e-06, + "loss": 0.7245, + "step": 440 + }, + { + "epoch": 0.1221606648199446, + "grad_norm": 0.5588358640670776, + "learning_rate": 4.996914456684556e-06, + "loss": 0.7185, + "step": 441 + }, + { + "epoch": 0.12243767313019391, + "grad_norm": 0.5604621767997742, + "learning_rate": 4.996896336871973e-06, + "loss": 0.7619, + "step": 442 + }, + { + "epoch": 0.12271468144044322, + "grad_norm": 0.5636209845542908, + "learning_rate": 4.996878164043903e-06, + "loss": 0.7588, + "step": 443 + }, + { + "epoch": 0.12299168975069252, + "grad_norm": 0.5372119545936584, + "learning_rate": 4.996859938200733e-06, + "loss": 0.7461, + "step": 444 + }, + { + "epoch": 0.12326869806094183, + "grad_norm": 0.5723452568054199, + "learning_rate": 4.996841659342852e-06, + "loss": 0.7263, + "step": 445 + }, + { + "epoch": 0.12354570637119114, + "grad_norm": 0.5590176582336426, + "learning_rate": 4.996823327470644e-06, + "loss": 0.7499, + "step": 446 + }, + { + "epoch": 0.12382271468144045, + "grad_norm": 0.5287463665008545, + "learning_rate": 4.996804942584502e-06, + "loss": 0.7078, + "step": 447 + }, + { + "epoch": 0.12409972299168975, + "grad_norm": 0.5544666647911072, + "learning_rate": 4.9967865046848155e-06, + "loss": 0.7296, + "step": 448 + }, + { + "epoch": 0.12437673130193906, + "grad_norm": 0.6168888211250305, + "learning_rate": 4.996768013771974e-06, + "loss": 0.7747, + "step": 449 + }, + { + "epoch": 0.12465373961218837, + "grad_norm": 0.550980269908905, + "learning_rate": 4.996749469846372e-06, + "loss": 0.7419, + "step": 450 + }, + { + "epoch": 0.12493074792243768, + "grad_norm": 0.5525485277175903, + "learning_rate": 4.996730872908403e-06, + "loss": 0.7427, + "step": 451 + }, + { + "epoch": 0.12520775623268698, + "grad_norm": 0.5857051014900208, + "learning_rate": 4.9967122229584614e-06, + "loss": 0.7936, + "step": 452 + }, + { + "epoch": 0.12548476454293628, + "grad_norm": 0.5720162391662598, + "learning_rate": 4.996693519996943e-06, + "loss": 0.8068, + "step": 453 + }, + { + "epoch": 0.1257617728531856, + "grad_norm": 0.539708137512207, + "learning_rate": 4.996674764024247e-06, + "loss": 0.7568, + "step": 454 + }, + { + "epoch": 0.1260387811634349, + "grad_norm": 0.5840601921081543, + "learning_rate": 4.996655955040768e-06, + "loss": 0.7781, + "step": 455 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 0.5401661396026611, + "learning_rate": 4.996637093046907e-06, + "loss": 0.7179, + "step": 456 + }, + { + "epoch": 0.1265927977839335, + "grad_norm": 0.5300602912902832, + "learning_rate": 4.996618178043067e-06, + "loss": 0.7329, + "step": 457 + }, + { + "epoch": 0.12686980609418283, + "grad_norm": 0.5564039349555969, + "learning_rate": 4.996599210029645e-06, + "loss": 0.7799, + "step": 458 + }, + { + "epoch": 0.12714681440443212, + "grad_norm": 0.59776371717453, + "learning_rate": 4.996580189007049e-06, + "loss": 0.7401, + "step": 459 + }, + { + "epoch": 0.12742382271468145, + "grad_norm": 0.5383924245834351, + "learning_rate": 4.996561114975678e-06, + "loss": 0.754, + "step": 460 + }, + { + "epoch": 0.12770083102493074, + "grad_norm": 0.5681918859481812, + "learning_rate": 4.996541987935939e-06, + "loss": 0.706, + "step": 461 + }, + { + "epoch": 0.12797783933518006, + "grad_norm": 0.5702113509178162, + "learning_rate": 4.996522807888239e-06, + "loss": 0.754, + "step": 462 + }, + { + "epoch": 0.12825484764542935, + "grad_norm": 0.5508577227592468, + "learning_rate": 4.996503574832984e-06, + "loss": 0.7765, + "step": 463 + }, + { + "epoch": 0.12853185595567868, + "grad_norm": 0.5382973551750183, + "learning_rate": 4.996484288770582e-06, + "loss": 0.7074, + "step": 464 + }, + { + "epoch": 0.12880886426592797, + "grad_norm": 0.5670721530914307, + "learning_rate": 4.996464949701444e-06, + "loss": 0.7513, + "step": 465 + }, + { + "epoch": 0.1290858725761773, + "grad_norm": 0.5926334261894226, + "learning_rate": 4.9964455576259795e-06, + "loss": 0.8037, + "step": 466 + }, + { + "epoch": 0.12936288088642658, + "grad_norm": 0.5309609174728394, + "learning_rate": 4.9964261125446015e-06, + "loss": 0.7443, + "step": 467 + }, + { + "epoch": 0.1296398891966759, + "grad_norm": 0.5415372848510742, + "learning_rate": 4.99640661445772e-06, + "loss": 0.7404, + "step": 468 + }, + { + "epoch": 0.1299168975069252, + "grad_norm": 0.5469106435775757, + "learning_rate": 4.996387063365753e-06, + "loss": 0.7584, + "step": 469 + }, + { + "epoch": 0.13019390581717452, + "grad_norm": 0.5746037364006042, + "learning_rate": 4.9963674592691125e-06, + "loss": 0.7732, + "step": 470 + }, + { + "epoch": 0.13047091412742381, + "grad_norm": 0.5430501699447632, + "learning_rate": 4.996347802168216e-06, + "loss": 0.6859, + "step": 471 + }, + { + "epoch": 0.13074792243767314, + "grad_norm": 0.5458232164382935, + "learning_rate": 4.996328092063481e-06, + "loss": 0.7435, + "step": 472 + }, + { + "epoch": 0.13102493074792243, + "grad_norm": 0.5637723207473755, + "learning_rate": 4.996308328955325e-06, + "loss": 0.7916, + "step": 473 + }, + { + "epoch": 0.13130193905817175, + "grad_norm": 0.603699803352356, + "learning_rate": 4.996288512844169e-06, + "loss": 0.7256, + "step": 474 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.539997935295105, + "learning_rate": 4.996268643730433e-06, + "loss": 0.7704, + "step": 475 + }, + { + "epoch": 0.13185595567867037, + "grad_norm": 0.5679884552955627, + "learning_rate": 4.996248721614538e-06, + "loss": 0.7806, + "step": 476 + }, + { + "epoch": 0.13213296398891966, + "grad_norm": 0.52909255027771, + "learning_rate": 4.996228746496909e-06, + "loss": 0.7311, + "step": 477 + }, + { + "epoch": 0.13240997229916898, + "grad_norm": 0.5588310956954956, + "learning_rate": 4.996208718377968e-06, + "loss": 0.7754, + "step": 478 + }, + { + "epoch": 0.13268698060941828, + "grad_norm": 0.5584151744842529, + "learning_rate": 4.9961886372581425e-06, + "loss": 0.7034, + "step": 479 + }, + { + "epoch": 0.1329639889196676, + "grad_norm": 0.5524438619613647, + "learning_rate": 4.9961685031378575e-06, + "loss": 0.7404, + "step": 480 + }, + { + "epoch": 0.1332409972299169, + "grad_norm": 0.5850812196731567, + "learning_rate": 4.99614831601754e-06, + "loss": 0.7978, + "step": 481 + }, + { + "epoch": 0.1335180055401662, + "grad_norm": 0.5962361693382263, + "learning_rate": 4.996128075897619e-06, + "loss": 0.7731, + "step": 482 + }, + { + "epoch": 0.1337950138504155, + "grad_norm": 0.5720769166946411, + "learning_rate": 4.996107782778526e-06, + "loss": 0.7761, + "step": 483 + }, + { + "epoch": 0.13407202216066483, + "grad_norm": 0.5690904855728149, + "learning_rate": 4.996087436660689e-06, + "loss": 0.7837, + "step": 484 + }, + { + "epoch": 0.13434903047091412, + "grad_norm": 0.5276780128479004, + "learning_rate": 4.996067037544542e-06, + "loss": 0.7025, + "step": 485 + }, + { + "epoch": 0.13462603878116344, + "grad_norm": 0.5334899425506592, + "learning_rate": 4.996046585430517e-06, + "loss": 0.705, + "step": 486 + }, + { + "epoch": 0.13490304709141274, + "grad_norm": 0.5671269297599792, + "learning_rate": 4.99602608031905e-06, + "loss": 0.712, + "step": 487 + }, + { + "epoch": 0.13518005540166206, + "grad_norm": 0.5702267289161682, + "learning_rate": 4.9960055222105745e-06, + "loss": 0.7627, + "step": 488 + }, + { + "epoch": 0.13545706371191135, + "grad_norm": 0.5774746537208557, + "learning_rate": 4.995984911105527e-06, + "loss": 0.7708, + "step": 489 + }, + { + "epoch": 0.13573407202216067, + "grad_norm": 0.626246452331543, + "learning_rate": 4.995964247004346e-06, + "loss": 0.7916, + "step": 490 + }, + { + "epoch": 0.13601108033240997, + "grad_norm": 0.5762872099876404, + "learning_rate": 4.99594352990747e-06, + "loss": 0.7308, + "step": 491 + }, + { + "epoch": 0.1362880886426593, + "grad_norm": 0.5570892691612244, + "learning_rate": 4.9959227598153395e-06, + "loss": 0.7184, + "step": 492 + }, + { + "epoch": 0.13656509695290858, + "grad_norm": 0.5843603610992432, + "learning_rate": 4.995901936728394e-06, + "loss": 0.7467, + "step": 493 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 0.5812234878540039, + "learning_rate": 4.9958810606470775e-06, + "loss": 0.7576, + "step": 494 + }, + { + "epoch": 0.1371191135734072, + "grad_norm": 0.5652825832366943, + "learning_rate": 4.995860131571832e-06, + "loss": 0.7052, + "step": 495 + }, + { + "epoch": 0.13739612188365652, + "grad_norm": 0.5675123929977417, + "learning_rate": 4.995839149503103e-06, + "loss": 0.7658, + "step": 496 + }, + { + "epoch": 0.1376731301939058, + "grad_norm": 0.565321147441864, + "learning_rate": 4.995818114441335e-06, + "loss": 0.7434, + "step": 497 + }, + { + "epoch": 0.13795013850415513, + "grad_norm": 0.5747477412223816, + "learning_rate": 4.9957970263869735e-06, + "loss": 0.7441, + "step": 498 + }, + { + "epoch": 0.13822714681440443, + "grad_norm": 0.5597822070121765, + "learning_rate": 4.9957758853404685e-06, + "loss": 0.7484, + "step": 499 + }, + { + "epoch": 0.13850415512465375, + "grad_norm": 0.5810073018074036, + "learning_rate": 4.995754691302267e-06, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.13878116343490304, + "grad_norm": 0.5751741528511047, + "learning_rate": 4.995733444272821e-06, + "loss": 0.8118, + "step": 501 + }, + { + "epoch": 0.13905817174515236, + "grad_norm": 0.5670412182807922, + "learning_rate": 4.99571214425258e-06, + "loss": 0.7356, + "step": 502 + }, + { + "epoch": 0.13933518005540166, + "grad_norm": 0.5851835608482361, + "learning_rate": 4.995690791241997e-06, + "loss": 0.794, + "step": 503 + }, + { + "epoch": 0.13961218836565098, + "grad_norm": 0.5708001255989075, + "learning_rate": 4.995669385241525e-06, + "loss": 0.7258, + "step": 504 + }, + { + "epoch": 0.13988919667590027, + "grad_norm": 0.5651164054870605, + "learning_rate": 4.995647926251619e-06, + "loss": 0.7173, + "step": 505 + }, + { + "epoch": 0.1401662049861496, + "grad_norm": 0.580865204334259, + "learning_rate": 4.995626414272734e-06, + "loss": 0.8015, + "step": 506 + }, + { + "epoch": 0.1404432132963989, + "grad_norm": 0.5580934882164001, + "learning_rate": 4.995604849305328e-06, + "loss": 0.7708, + "step": 507 + }, + { + "epoch": 0.1407202216066482, + "grad_norm": 0.5962908267974854, + "learning_rate": 4.995583231349857e-06, + "loss": 0.7617, + "step": 508 + }, + { + "epoch": 0.1409972299168975, + "grad_norm": 0.5531542897224426, + "learning_rate": 4.995561560406781e-06, + "loss": 0.7422, + "step": 509 + }, + { + "epoch": 0.14127423822714683, + "grad_norm": 0.5768089294433594, + "learning_rate": 4.995539836476561e-06, + "loss": 0.8083, + "step": 510 + }, + { + "epoch": 0.14155124653739612, + "grad_norm": 0.5697327256202698, + "learning_rate": 4.995518059559656e-06, + "loss": 0.7677, + "step": 511 + }, + { + "epoch": 0.14182825484764544, + "grad_norm": 0.5646145939826965, + "learning_rate": 4.995496229656531e-06, + "loss": 0.7015, + "step": 512 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 0.5464889407157898, + "learning_rate": 4.995474346767647e-06, + "loss": 0.7104, + "step": 513 + }, + { + "epoch": 0.14238227146814406, + "grad_norm": 0.6004087924957275, + "learning_rate": 4.995452410893471e-06, + "loss": 0.8018, + "step": 514 + }, + { + "epoch": 0.14265927977839335, + "grad_norm": 0.6363440155982971, + "learning_rate": 4.9954304220344676e-06, + "loss": 0.7857, + "step": 515 + }, + { + "epoch": 0.14293628808864267, + "grad_norm": 0.5825870633125305, + "learning_rate": 4.9954083801911025e-06, + "loss": 0.7574, + "step": 516 + }, + { + "epoch": 0.14321329639889196, + "grad_norm": 0.5635371804237366, + "learning_rate": 4.995386285363846e-06, + "loss": 0.7117, + "step": 517 + }, + { + "epoch": 0.1434903047091413, + "grad_norm": 0.5889778137207031, + "learning_rate": 4.995364137553166e-06, + "loss": 0.7625, + "step": 518 + }, + { + "epoch": 0.14376731301939058, + "grad_norm": 0.5800310969352722, + "learning_rate": 4.995341936759533e-06, + "loss": 0.7419, + "step": 519 + }, + { + "epoch": 0.1440443213296399, + "grad_norm": 0.5781015157699585, + "learning_rate": 4.995319682983417e-06, + "loss": 0.738, + "step": 520 + }, + { + "epoch": 0.1443213296398892, + "grad_norm": 0.5652474761009216, + "learning_rate": 4.995297376225293e-06, + "loss": 0.7867, + "step": 521 + }, + { + "epoch": 0.14459833795013852, + "grad_norm": 0.5604120492935181, + "learning_rate": 4.995275016485633e-06, + "loss": 0.7324, + "step": 522 + }, + { + "epoch": 0.1448753462603878, + "grad_norm": 0.5692135095596313, + "learning_rate": 4.995252603764913e-06, + "loss": 0.7839, + "step": 523 + }, + { + "epoch": 0.14515235457063713, + "grad_norm": 0.5682529211044312, + "learning_rate": 4.995230138063607e-06, + "loss": 0.7833, + "step": 524 + }, + { + "epoch": 0.14542936288088643, + "grad_norm": 0.6351110339164734, + "learning_rate": 4.995207619382194e-06, + "loss": 0.7737, + "step": 525 + }, + { + "epoch": 0.14570637119113575, + "grad_norm": 0.5999937057495117, + "learning_rate": 4.995185047721151e-06, + "loss": 0.7681, + "step": 526 + }, + { + "epoch": 0.14598337950138504, + "grad_norm": 0.592374861240387, + "learning_rate": 4.995162423080957e-06, + "loss": 0.7981, + "step": 527 + }, + { + "epoch": 0.14626038781163436, + "grad_norm": 0.5485160946846008, + "learning_rate": 4.995139745462092e-06, + "loss": 0.7414, + "step": 528 + }, + { + "epoch": 0.14653739612188366, + "grad_norm": 0.5628599524497986, + "learning_rate": 4.99511701486504e-06, + "loss": 0.7589, + "step": 529 + }, + { + "epoch": 0.14681440443213298, + "grad_norm": 0.5511693954467773, + "learning_rate": 4.9950942312902805e-06, + "loss": 0.6909, + "step": 530 + }, + { + "epoch": 0.14709141274238227, + "grad_norm": 0.5663548111915588, + "learning_rate": 4.995071394738299e-06, + "loss": 0.8017, + "step": 531 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 0.5832469463348389, + "learning_rate": 4.995048505209581e-06, + "loss": 0.7032, + "step": 532 + }, + { + "epoch": 0.1476454293628809, + "grad_norm": 0.5971035361289978, + "learning_rate": 4.995025562704611e-06, + "loss": 0.7705, + "step": 533 + }, + { + "epoch": 0.14792243767313018, + "grad_norm": 0.5898804068565369, + "learning_rate": 4.995002567223877e-06, + "loss": 0.7446, + "step": 534 + }, + { + "epoch": 0.1481994459833795, + "grad_norm": 0.5439696311950684, + "learning_rate": 4.994979518767866e-06, + "loss": 0.7333, + "step": 535 + }, + { + "epoch": 0.1484764542936288, + "grad_norm": 0.5540503263473511, + "learning_rate": 4.994956417337069e-06, + "loss": 0.7951, + "step": 536 + }, + { + "epoch": 0.14875346260387812, + "grad_norm": 0.5638033747673035, + "learning_rate": 4.994933262931976e-06, + "loss": 0.7685, + "step": 537 + }, + { + "epoch": 0.1490304709141274, + "grad_norm": 0.5664800405502319, + "learning_rate": 4.994910055553079e-06, + "loss": 0.7569, + "step": 538 + }, + { + "epoch": 0.14930747922437673, + "grad_norm": 0.6010585427284241, + "learning_rate": 4.994886795200869e-06, + "loss": 0.756, + "step": 539 + }, + { + "epoch": 0.14958448753462603, + "grad_norm": 0.5838546752929688, + "learning_rate": 4.994863481875842e-06, + "loss": 0.7701, + "step": 540 + }, + { + "epoch": 0.14986149584487535, + "grad_norm": 0.6126722097396851, + "learning_rate": 4.994840115578491e-06, + "loss": 0.7468, + "step": 541 + }, + { + "epoch": 0.15013850415512464, + "grad_norm": 0.5752817392349243, + "learning_rate": 4.994816696309314e-06, + "loss": 0.7532, + "step": 542 + }, + { + "epoch": 0.15041551246537396, + "grad_norm": 0.5906909704208374, + "learning_rate": 4.994793224068807e-06, + "loss": 0.7719, + "step": 543 + }, + { + "epoch": 0.15069252077562326, + "grad_norm": 0.606719434261322, + "learning_rate": 4.994769698857469e-06, + "loss": 0.7709, + "step": 544 + }, + { + "epoch": 0.15096952908587258, + "grad_norm": 0.5538843870162964, + "learning_rate": 4.9947461206757995e-06, + "loss": 0.7749, + "step": 545 + }, + { + "epoch": 0.15124653739612187, + "grad_norm": 0.5870587825775146, + "learning_rate": 4.994722489524299e-06, + "loss": 0.6922, + "step": 546 + }, + { + "epoch": 0.1515235457063712, + "grad_norm": 0.5842876434326172, + "learning_rate": 4.994698805403469e-06, + "loss": 0.7525, + "step": 547 + }, + { + "epoch": 0.1518005540166205, + "grad_norm": 0.5563871264457703, + "learning_rate": 4.994675068313813e-06, + "loss": 0.7227, + "step": 548 + }, + { + "epoch": 0.1520775623268698, + "grad_norm": 0.5730799436569214, + "learning_rate": 4.994651278255835e-06, + "loss": 0.754, + "step": 549 + }, + { + "epoch": 0.1523545706371191, + "grad_norm": 0.6045185327529907, + "learning_rate": 4.994627435230039e-06, + "loss": 0.7342, + "step": 550 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 0.5501013398170471, + "learning_rate": 4.994603539236932e-06, + "loss": 0.659, + "step": 551 + }, + { + "epoch": 0.15290858725761772, + "grad_norm": 0.5863350033760071, + "learning_rate": 4.994579590277021e-06, + "loss": 0.7128, + "step": 552 + }, + { + "epoch": 0.15318559556786704, + "grad_norm": 0.5850765109062195, + "learning_rate": 4.9945555883508155e-06, + "loss": 0.7499, + "step": 553 + }, + { + "epoch": 0.15346260387811633, + "grad_norm": 0.61758953332901, + "learning_rate": 4.994531533458824e-06, + "loss": 0.7482, + "step": 554 + }, + { + "epoch": 0.15373961218836565, + "grad_norm": 0.5901806354522705, + "learning_rate": 4.994507425601559e-06, + "loss": 0.7402, + "step": 555 + }, + { + "epoch": 0.15401662049861495, + "grad_norm": 0.5781405568122864, + "learning_rate": 4.994483264779529e-06, + "loss": 0.7703, + "step": 556 + }, + { + "epoch": 0.15429362880886427, + "grad_norm": 0.5744783282279968, + "learning_rate": 4.994459050993251e-06, + "loss": 0.7451, + "step": 557 + }, + { + "epoch": 0.15457063711911356, + "grad_norm": 0.5938632488250732, + "learning_rate": 4.9944347842432365e-06, + "loss": 0.7549, + "step": 558 + }, + { + "epoch": 0.15484764542936288, + "grad_norm": 0.6398091316223145, + "learning_rate": 4.994410464530001e-06, + "loss": 0.757, + "step": 559 + }, + { + "epoch": 0.15512465373961218, + "grad_norm": 0.5992589592933655, + "learning_rate": 4.994386091854061e-06, + "loss": 0.7152, + "step": 560 + }, + { + "epoch": 0.1554016620498615, + "grad_norm": 0.6135343313217163, + "learning_rate": 4.994361666215935e-06, + "loss": 0.6954, + "step": 561 + }, + { + "epoch": 0.1556786703601108, + "grad_norm": 0.6189139485359192, + "learning_rate": 4.99433718761614e-06, + "loss": 0.7436, + "step": 562 + }, + { + "epoch": 0.15595567867036012, + "grad_norm": 0.5707741379737854, + "learning_rate": 4.994312656055198e-06, + "loss": 0.6516, + "step": 563 + }, + { + "epoch": 0.1562326869806094, + "grad_norm": 0.5933526754379272, + "learning_rate": 4.9942880715336284e-06, + "loss": 0.7263, + "step": 564 + }, + { + "epoch": 0.15650969529085873, + "grad_norm": 0.598362922668457, + "learning_rate": 4.994263434051953e-06, + "loss": 0.7285, + "step": 565 + }, + { + "epoch": 0.15678670360110802, + "grad_norm": 0.5745391249656677, + "learning_rate": 4.994238743610695e-06, + "loss": 0.7279, + "step": 566 + }, + { + "epoch": 0.15706371191135735, + "grad_norm": 0.5936552286148071, + "learning_rate": 4.994214000210379e-06, + "loss": 0.7818, + "step": 567 + }, + { + "epoch": 0.15734072022160664, + "grad_norm": 0.6095513701438904, + "learning_rate": 4.99418920385153e-06, + "loss": 0.7492, + "step": 568 + }, + { + "epoch": 0.15761772853185596, + "grad_norm": 0.6010913252830505, + "learning_rate": 4.994164354534675e-06, + "loss": 0.7537, + "step": 569 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.6135689616203308, + "learning_rate": 4.994139452260341e-06, + "loss": 0.772, + "step": 570 + }, + { + "epoch": 0.15817174515235458, + "grad_norm": 0.6023951172828674, + "learning_rate": 4.994114497029058e-06, + "loss": 0.7703, + "step": 571 + }, + { + "epoch": 0.15844875346260387, + "grad_norm": 0.5701375603675842, + "learning_rate": 4.994089488841354e-06, + "loss": 0.7329, + "step": 572 + }, + { + "epoch": 0.1587257617728532, + "grad_norm": 0.5752888917922974, + "learning_rate": 4.994064427697761e-06, + "loss": 0.7459, + "step": 573 + }, + { + "epoch": 0.15900277008310248, + "grad_norm": 0.6101413369178772, + "learning_rate": 4.994039313598812e-06, + "loss": 0.7899, + "step": 574 + }, + { + "epoch": 0.1592797783933518, + "grad_norm": 0.5689857602119446, + "learning_rate": 4.994014146545039e-06, + "loss": 0.7274, + "step": 575 + }, + { + "epoch": 0.1595567867036011, + "grad_norm": 0.5898393392562866, + "learning_rate": 4.9939889265369755e-06, + "loss": 0.7446, + "step": 576 + }, + { + "epoch": 0.15983379501385042, + "grad_norm": 0.6104997396469116, + "learning_rate": 4.993963653575159e-06, + "loss": 0.6932, + "step": 577 + }, + { + "epoch": 0.16011080332409972, + "grad_norm": 0.6198656558990479, + "learning_rate": 4.993938327660125e-06, + "loss": 0.7027, + "step": 578 + }, + { + "epoch": 0.16038781163434904, + "grad_norm": 0.5969960689544678, + "learning_rate": 4.993912948792412e-06, + "loss": 0.7283, + "step": 579 + }, + { + "epoch": 0.16066481994459833, + "grad_norm": 0.5853732824325562, + "learning_rate": 4.993887516972558e-06, + "loss": 0.7232, + "step": 580 + }, + { + "epoch": 0.16094182825484765, + "grad_norm": 0.5809240341186523, + "learning_rate": 4.9938620322011035e-06, + "loss": 0.7308, + "step": 581 + }, + { + "epoch": 0.16121883656509695, + "grad_norm": 0.6255086064338684, + "learning_rate": 4.993836494478589e-06, + "loss": 0.7621, + "step": 582 + }, + { + "epoch": 0.16149584487534627, + "grad_norm": 0.5679871439933777, + "learning_rate": 4.993810903805557e-06, + "loss": 0.715, + "step": 583 + }, + { + "epoch": 0.16177285318559556, + "grad_norm": 0.6073582172393799, + "learning_rate": 4.993785260182552e-06, + "loss": 0.7685, + "step": 584 + }, + { + "epoch": 0.16204986149584488, + "grad_norm": 0.6174919009208679, + "learning_rate": 4.9937595636101165e-06, + "loss": 0.7825, + "step": 585 + }, + { + "epoch": 0.16232686980609418, + "grad_norm": 0.5922225117683411, + "learning_rate": 4.993733814088798e-06, + "loss": 0.6968, + "step": 586 + }, + { + "epoch": 0.1626038781163435, + "grad_norm": 0.5925888419151306, + "learning_rate": 4.993708011619142e-06, + "loss": 0.7288, + "step": 587 + }, + { + "epoch": 0.1628808864265928, + "grad_norm": 0.5409933924674988, + "learning_rate": 4.9936821562016965e-06, + "loss": 0.6759, + "step": 588 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 0.5877025127410889, + "learning_rate": 4.99365624783701e-06, + "loss": 0.7661, + "step": 589 + }, + { + "epoch": 0.1634349030470914, + "grad_norm": 0.5677634477615356, + "learning_rate": 4.993630286525634e-06, + "loss": 0.7698, + "step": 590 + }, + { + "epoch": 0.16371191135734073, + "grad_norm": 0.6122007369995117, + "learning_rate": 4.993604272268119e-06, + "loss": 0.7112, + "step": 591 + }, + { + "epoch": 0.16398891966759002, + "grad_norm": 0.5659086108207703, + "learning_rate": 4.993578205065017e-06, + "loss": 0.746, + "step": 592 + }, + { + "epoch": 0.16426592797783934, + "grad_norm": 0.6072430610656738, + "learning_rate": 4.993552084916883e-06, + "loss": 0.7886, + "step": 593 + }, + { + "epoch": 0.16454293628808864, + "grad_norm": 0.599383533000946, + "learning_rate": 4.993525911824268e-06, + "loss": 0.711, + "step": 594 + }, + { + "epoch": 0.16481994459833796, + "grad_norm": 0.5857592821121216, + "learning_rate": 4.993499685787732e-06, + "loss": 0.7318, + "step": 595 + }, + { + "epoch": 0.16509695290858725, + "grad_norm": 0.5840777158737183, + "learning_rate": 4.99347340680783e-06, + "loss": 0.7209, + "step": 596 + }, + { + "epoch": 0.16537396121883657, + "grad_norm": 0.55238938331604, + "learning_rate": 4.9934470748851196e-06, + "loss": 0.7329, + "step": 597 + }, + { + "epoch": 0.16565096952908587, + "grad_norm": 0.5710905194282532, + "learning_rate": 4.99342069002016e-06, + "loss": 0.7157, + "step": 598 + }, + { + "epoch": 0.1659279778393352, + "grad_norm": 0.6297195553779602, + "learning_rate": 4.993394252213512e-06, + "loss": 0.7414, + "step": 599 + }, + { + "epoch": 0.16620498614958448, + "grad_norm": 0.6218348741531372, + "learning_rate": 4.993367761465736e-06, + "loss": 0.7627, + "step": 600 + }, + { + "epoch": 0.1664819944598338, + "grad_norm": 0.6218968629837036, + "learning_rate": 4.993341217777396e-06, + "loss": 0.7518, + "step": 601 + }, + { + "epoch": 0.1667590027700831, + "grad_norm": 0.6355380415916443, + "learning_rate": 4.993314621149054e-06, + "loss": 0.7509, + "step": 602 + }, + { + "epoch": 0.16703601108033242, + "grad_norm": 0.5781269669532776, + "learning_rate": 4.9932879715812745e-06, + "loss": 0.7405, + "step": 603 + }, + { + "epoch": 0.1673130193905817, + "grad_norm": 0.5942702293395996, + "learning_rate": 4.993261269074625e-06, + "loss": 0.7434, + "step": 604 + }, + { + "epoch": 0.16759002770083103, + "grad_norm": 0.6177949905395508, + "learning_rate": 4.9932345136296725e-06, + "loss": 0.7514, + "step": 605 + }, + { + "epoch": 0.16786703601108033, + "grad_norm": 0.6140938997268677, + "learning_rate": 4.993207705246983e-06, + "loss": 0.7241, + "step": 606 + }, + { + "epoch": 0.16814404432132965, + "grad_norm": 0.5944777727127075, + "learning_rate": 4.9931808439271275e-06, + "loss": 0.7917, + "step": 607 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 0.610851526260376, + "learning_rate": 4.993153929670676e-06, + "loss": 0.7748, + "step": 608 + }, + { + "epoch": 0.16869806094182827, + "grad_norm": 0.5631710886955261, + "learning_rate": 4.9931269624782e-06, + "loss": 0.6437, + "step": 609 + }, + { + "epoch": 0.16897506925207756, + "grad_norm": 0.5906318426132202, + "learning_rate": 4.993099942350271e-06, + "loss": 0.7635, + "step": 610 + }, + { + "epoch": 0.16925207756232688, + "grad_norm": 0.6264278888702393, + "learning_rate": 4.993072869287464e-06, + "loss": 0.7851, + "step": 611 + }, + { + "epoch": 0.16952908587257617, + "grad_norm": 0.5960770845413208, + "learning_rate": 4.993045743290353e-06, + "loss": 0.7507, + "step": 612 + }, + { + "epoch": 0.1698060941828255, + "grad_norm": 0.5849323272705078, + "learning_rate": 4.993018564359514e-06, + "loss": 0.7203, + "step": 613 + }, + { + "epoch": 0.1700831024930748, + "grad_norm": 0.6004356741905212, + "learning_rate": 4.992991332495525e-06, + "loss": 0.7446, + "step": 614 + }, + { + "epoch": 0.1703601108033241, + "grad_norm": 0.6141982078552246, + "learning_rate": 4.992964047698964e-06, + "loss": 0.7275, + "step": 615 + }, + { + "epoch": 0.1706371191135734, + "grad_norm": 0.589536190032959, + "learning_rate": 4.992936709970409e-06, + "loss": 0.7654, + "step": 616 + }, + { + "epoch": 0.17091412742382273, + "grad_norm": 0.6536529660224915, + "learning_rate": 4.992909319310442e-06, + "loss": 0.7798, + "step": 617 + }, + { + "epoch": 0.17119113573407202, + "grad_norm": 0.5964363217353821, + "learning_rate": 4.992881875719644e-06, + "loss": 0.7158, + "step": 618 + }, + { + "epoch": 0.17146814404432134, + "grad_norm": 0.6039339900016785, + "learning_rate": 4.992854379198598e-06, + "loss": 0.7219, + "step": 619 + }, + { + "epoch": 0.17174515235457063, + "grad_norm": 0.6138789057731628, + "learning_rate": 4.9928268297478866e-06, + "loss": 0.7485, + "step": 620 + }, + { + "epoch": 0.17202216066481996, + "grad_norm": 0.6011034250259399, + "learning_rate": 4.992799227368096e-06, + "loss": 0.7412, + "step": 621 + }, + { + "epoch": 0.17229916897506925, + "grad_norm": 0.6099687814712524, + "learning_rate": 4.992771572059812e-06, + "loss": 0.7477, + "step": 622 + }, + { + "epoch": 0.17257617728531857, + "grad_norm": 0.6124699711799622, + "learning_rate": 4.992743863823621e-06, + "loss": 0.7584, + "step": 623 + }, + { + "epoch": 0.17285318559556787, + "grad_norm": 0.6268585920333862, + "learning_rate": 4.992716102660114e-06, + "loss": 0.6723, + "step": 624 + }, + { + "epoch": 0.1731301939058172, + "grad_norm": 0.567093551158905, + "learning_rate": 4.9926882885698765e-06, + "loss": 0.7189, + "step": 625 + }, + { + "epoch": 0.17340720221606648, + "grad_norm": 0.635819673538208, + "learning_rate": 4.992660421553501e-06, + "loss": 0.7455, + "step": 626 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 0.654516339302063, + "learning_rate": 4.99263250161158e-06, + "loss": 0.769, + "step": 627 + }, + { + "epoch": 0.1739612188365651, + "grad_norm": 0.5695293545722961, + "learning_rate": 4.992604528744705e-06, + "loss": 0.7282, + "step": 628 + }, + { + "epoch": 0.17423822714681442, + "grad_norm": 0.565676748752594, + "learning_rate": 4.992576502953471e-06, + "loss": 0.694, + "step": 629 + }, + { + "epoch": 0.1745152354570637, + "grad_norm": 0.6509530544281006, + "learning_rate": 4.992548424238472e-06, + "loss": 0.7377, + "step": 630 + }, + { + "epoch": 0.17479224376731303, + "grad_norm": 0.620535135269165, + "learning_rate": 4.9925202926003045e-06, + "loss": 0.707, + "step": 631 + }, + { + "epoch": 0.17506925207756233, + "grad_norm": 0.6193838119506836, + "learning_rate": 4.992492108039566e-06, + "loss": 0.7609, + "step": 632 + }, + { + "epoch": 0.17534626038781165, + "grad_norm": 0.6202815771102905, + "learning_rate": 4.992463870556856e-06, + "loss": 0.7457, + "step": 633 + }, + { + "epoch": 0.17562326869806094, + "grad_norm": 0.5950646996498108, + "learning_rate": 4.9924355801527714e-06, + "loss": 0.7154, + "step": 634 + }, + { + "epoch": 0.17590027700831026, + "grad_norm": 0.6505277156829834, + "learning_rate": 4.992407236827916e-06, + "loss": 0.7481, + "step": 635 + }, + { + "epoch": 0.17617728531855956, + "grad_norm": 0.5843820571899414, + "learning_rate": 4.9923788405828884e-06, + "loss": 0.7416, + "step": 636 + }, + { + "epoch": 0.17645429362880888, + "grad_norm": 0.6132304668426514, + "learning_rate": 4.9923503914182945e-06, + "loss": 0.7854, + "step": 637 + }, + { + "epoch": 0.17673130193905817, + "grad_norm": 0.6458740830421448, + "learning_rate": 4.992321889334737e-06, + "loss": 0.7823, + "step": 638 + }, + { + "epoch": 0.17700831024930747, + "grad_norm": 0.6528242230415344, + "learning_rate": 4.992293334332821e-06, + "loss": 0.762, + "step": 639 + }, + { + "epoch": 0.1772853185595568, + "grad_norm": 0.5972003936767578, + "learning_rate": 4.992264726413152e-06, + "loss": 0.7252, + "step": 640 + }, + { + "epoch": 0.17756232686980608, + "grad_norm": 0.5978172421455383, + "learning_rate": 4.992236065576339e-06, + "loss": 0.7056, + "step": 641 + }, + { + "epoch": 0.1778393351800554, + "grad_norm": 0.5857217907905579, + "learning_rate": 4.992207351822989e-06, + "loss": 0.7205, + "step": 642 + }, + { + "epoch": 0.1781163434903047, + "grad_norm": 0.5811772346496582, + "learning_rate": 4.9921785851537136e-06, + "loss": 0.7363, + "step": 643 + }, + { + "epoch": 0.17839335180055402, + "grad_norm": 0.5959508419036865, + "learning_rate": 4.992149765569122e-06, + "loss": 0.7724, + "step": 644 + }, + { + "epoch": 0.1786703601108033, + "grad_norm": 0.5911051630973816, + "learning_rate": 4.992120893069827e-06, + "loss": 0.7241, + "step": 645 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 0.6051844954490662, + "learning_rate": 4.992091967656441e-06, + "loss": 0.7279, + "step": 646 + }, + { + "epoch": 0.17922437673130193, + "grad_norm": 0.5811322331428528, + "learning_rate": 4.992062989329578e-06, + "loss": 0.6759, + "step": 647 + }, + { + "epoch": 0.17950138504155125, + "grad_norm": 0.5511055588722229, + "learning_rate": 4.9920339580898545e-06, + "loss": 0.6911, + "step": 648 + }, + { + "epoch": 0.17977839335180054, + "grad_norm": 0.5894351601600647, + "learning_rate": 4.992004873937886e-06, + "loss": 0.7258, + "step": 649 + }, + { + "epoch": 0.18005540166204986, + "grad_norm": 0.6022338271141052, + "learning_rate": 4.9919757368742895e-06, + "loss": 0.7349, + "step": 650 + }, + { + "epoch": 0.18033240997229916, + "grad_norm": 0.630414605140686, + "learning_rate": 4.991946546899685e-06, + "loss": 0.7336, + "step": 651 + }, + { + "epoch": 0.18060941828254848, + "grad_norm": 0.593589186668396, + "learning_rate": 4.991917304014691e-06, + "loss": 0.7004, + "step": 652 + }, + { + "epoch": 0.18088642659279777, + "grad_norm": 0.6065483093261719, + "learning_rate": 4.99188800821993e-06, + "loss": 0.759, + "step": 653 + }, + { + "epoch": 0.1811634349030471, + "grad_norm": 0.6059734225273132, + "learning_rate": 4.9918586595160214e-06, + "loss": 0.7643, + "step": 654 + }, + { + "epoch": 0.1814404432132964, + "grad_norm": 0.5951828360557556, + "learning_rate": 4.991829257903591e-06, + "loss": 0.7215, + "step": 655 + }, + { + "epoch": 0.1817174515235457, + "grad_norm": 0.5957050919532776, + "learning_rate": 4.991799803383262e-06, + "loss": 0.7198, + "step": 656 + }, + { + "epoch": 0.181994459833795, + "grad_norm": 0.6348684430122375, + "learning_rate": 4.991770295955659e-06, + "loss": 0.7064, + "step": 657 + }, + { + "epoch": 0.18227146814404432, + "grad_norm": 0.5806444883346558, + "learning_rate": 4.99174073562141e-06, + "loss": 0.7122, + "step": 658 + }, + { + "epoch": 0.18254847645429362, + "grad_norm": 0.614686131477356, + "learning_rate": 4.991711122381142e-06, + "loss": 0.7286, + "step": 659 + }, + { + "epoch": 0.18282548476454294, + "grad_norm": 0.612411618232727, + "learning_rate": 4.991681456235483e-06, + "loss": 0.7621, + "step": 660 + }, + { + "epoch": 0.18310249307479223, + "grad_norm": 0.5956481099128723, + "learning_rate": 4.9916517371850646e-06, + "loss": 0.7495, + "step": 661 + }, + { + "epoch": 0.18337950138504155, + "grad_norm": 0.5914007425308228, + "learning_rate": 4.991621965230516e-06, + "loss": 0.7157, + "step": 662 + }, + { + "epoch": 0.18365650969529085, + "grad_norm": 0.6067460179328918, + "learning_rate": 4.991592140372471e-06, + "loss": 0.7401, + "step": 663 + }, + { + "epoch": 0.18393351800554017, + "grad_norm": 0.6207283139228821, + "learning_rate": 4.991562262611562e-06, + "loss": 0.7777, + "step": 664 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 0.6362254619598389, + "learning_rate": 4.991532331948423e-06, + "loss": 0.756, + "step": 665 + }, + { + "epoch": 0.18448753462603878, + "grad_norm": 0.5996198654174805, + "learning_rate": 4.99150234838369e-06, + "loss": 0.7505, + "step": 666 + }, + { + "epoch": 0.18476454293628808, + "grad_norm": 0.6403332352638245, + "learning_rate": 4.9914723119179995e-06, + "loss": 0.7641, + "step": 667 + }, + { + "epoch": 0.1850415512465374, + "grad_norm": 0.5878950357437134, + "learning_rate": 4.99144222255199e-06, + "loss": 0.7176, + "step": 668 + }, + { + "epoch": 0.1853185595567867, + "grad_norm": 0.5872005224227905, + "learning_rate": 4.991412080286299e-06, + "loss": 0.7196, + "step": 669 + }, + { + "epoch": 0.18559556786703602, + "grad_norm": 0.607134222984314, + "learning_rate": 4.991381885121567e-06, + "loss": 0.6374, + "step": 670 + }, + { + "epoch": 0.1858725761772853, + "grad_norm": 0.6430603861808777, + "learning_rate": 4.991351637058435e-06, + "loss": 0.7413, + "step": 671 + }, + { + "epoch": 0.18614958448753463, + "grad_norm": 0.6369723081588745, + "learning_rate": 4.991321336097546e-06, + "loss": 0.695, + "step": 672 + }, + { + "epoch": 0.18642659279778392, + "grad_norm": 0.6280376315116882, + "learning_rate": 4.991290982239543e-06, + "loss": 0.7364, + "step": 673 + }, + { + "epoch": 0.18670360110803325, + "grad_norm": 0.6183046102523804, + "learning_rate": 4.99126057548507e-06, + "loss": 0.7221, + "step": 674 + }, + { + "epoch": 0.18698060941828254, + "grad_norm": 0.6224685311317444, + "learning_rate": 4.991230115834773e-06, + "loss": 0.7818, + "step": 675 + }, + { + "epoch": 0.18725761772853186, + "grad_norm": 0.6200198531150818, + "learning_rate": 4.991199603289299e-06, + "loss": 0.7769, + "step": 676 + }, + { + "epoch": 0.18753462603878115, + "grad_norm": 0.6149685978889465, + "learning_rate": 4.991169037849296e-06, + "loss": 0.7371, + "step": 677 + }, + { + "epoch": 0.18781163434903048, + "grad_norm": 0.6446099877357483, + "learning_rate": 4.991138419515412e-06, + "loss": 0.7273, + "step": 678 + }, + { + "epoch": 0.18808864265927977, + "grad_norm": 0.6008735299110413, + "learning_rate": 4.991107748288297e-06, + "loss": 0.751, + "step": 679 + }, + { + "epoch": 0.1883656509695291, + "grad_norm": 0.6307796239852905, + "learning_rate": 4.9910770241686045e-06, + "loss": 0.7407, + "step": 680 + }, + { + "epoch": 0.18864265927977839, + "grad_norm": 0.5771614909172058, + "learning_rate": 4.991046247156984e-06, + "loss": 0.7596, + "step": 681 + }, + { + "epoch": 0.1889196675900277, + "grad_norm": 0.5759564638137817, + "learning_rate": 4.991015417254092e-06, + "loss": 0.731, + "step": 682 + }, + { + "epoch": 0.189196675900277, + "grad_norm": 0.5892442464828491, + "learning_rate": 4.9909845344605786e-06, + "loss": 0.712, + "step": 683 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 0.6216954588890076, + "learning_rate": 4.990953598777104e-06, + "loss": 0.7522, + "step": 684 + }, + { + "epoch": 0.18975069252077562, + "grad_norm": 0.6018840670585632, + "learning_rate": 4.9909226102043226e-06, + "loss": 0.6901, + "step": 685 + }, + { + "epoch": 0.19002770083102494, + "grad_norm": 0.609626054763794, + "learning_rate": 4.9908915687428935e-06, + "loss": 0.7855, + "step": 686 + }, + { + "epoch": 0.19030470914127423, + "grad_norm": 0.604314386844635, + "learning_rate": 4.990860474393476e-06, + "loss": 0.7274, + "step": 687 + }, + { + "epoch": 0.19058171745152355, + "grad_norm": 0.6209576725959778, + "learning_rate": 4.990829327156729e-06, + "loss": 0.7528, + "step": 688 + }, + { + "epoch": 0.19085872576177285, + "grad_norm": 0.6131774187088013, + "learning_rate": 4.990798127033314e-06, + "loss": 0.7726, + "step": 689 + }, + { + "epoch": 0.19113573407202217, + "grad_norm": 0.5770470499992371, + "learning_rate": 4.990766874023894e-06, + "loss": 0.7164, + "step": 690 + }, + { + "epoch": 0.19141274238227146, + "grad_norm": 0.6258401274681091, + "learning_rate": 4.990735568129133e-06, + "loss": 0.7613, + "step": 691 + }, + { + "epoch": 0.19168975069252078, + "grad_norm": 0.5941904783248901, + "learning_rate": 4.990704209349696e-06, + "loss": 0.7429, + "step": 692 + }, + { + "epoch": 0.19196675900277008, + "grad_norm": 0.633047878742218, + "learning_rate": 4.990672797686247e-06, + "loss": 0.707, + "step": 693 + }, + { + "epoch": 0.1922437673130194, + "grad_norm": 0.6325700283050537, + "learning_rate": 4.990641333139455e-06, + "loss": 0.7246, + "step": 694 + }, + { + "epoch": 0.1925207756232687, + "grad_norm": 0.6009621024131775, + "learning_rate": 4.990609815709985e-06, + "loss": 0.7101, + "step": 695 + }, + { + "epoch": 0.192797783933518, + "grad_norm": 0.5919104218482971, + "learning_rate": 4.990578245398511e-06, + "loss": 0.6893, + "step": 696 + }, + { + "epoch": 0.1930747922437673, + "grad_norm": 0.6189974546432495, + "learning_rate": 4.990546622205698e-06, + "loss": 0.756, + "step": 697 + }, + { + "epoch": 0.19335180055401663, + "grad_norm": 0.632219135761261, + "learning_rate": 4.990514946132222e-06, + "loss": 0.7153, + "step": 698 + }, + { + "epoch": 0.19362880886426592, + "grad_norm": 0.6028069853782654, + "learning_rate": 4.990483217178753e-06, + "loss": 0.7601, + "step": 699 + }, + { + "epoch": 0.19390581717451524, + "grad_norm": 0.6530799865722656, + "learning_rate": 4.9904514353459656e-06, + "loss": 0.7119, + "step": 700 + }, + { + "epoch": 0.19418282548476454, + "grad_norm": 0.624106764793396, + "learning_rate": 4.990419600634534e-06, + "loss": 0.7418, + "step": 701 + }, + { + "epoch": 0.19445983379501386, + "grad_norm": 0.651628851890564, + "learning_rate": 4.990387713045134e-06, + "loss": 0.7631, + "step": 702 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 0.6010842323303223, + "learning_rate": 4.990355772578444e-06, + "loss": 0.7143, + "step": 703 + }, + { + "epoch": 0.19501385041551247, + "grad_norm": 0.5809913873672485, + "learning_rate": 4.990323779235141e-06, + "loss": 0.7532, + "step": 704 + }, + { + "epoch": 0.19529085872576177, + "grad_norm": 0.610136866569519, + "learning_rate": 4.990291733015904e-06, + "loss": 0.7433, + "step": 705 + }, + { + "epoch": 0.1955678670360111, + "grad_norm": 0.6137512922286987, + "learning_rate": 4.990259633921414e-06, + "loss": 0.7503, + "step": 706 + }, + { + "epoch": 0.19584487534626038, + "grad_norm": 0.5880718231201172, + "learning_rate": 4.990227481952353e-06, + "loss": 0.7344, + "step": 707 + }, + { + "epoch": 0.1961218836565097, + "grad_norm": 0.6280733346939087, + "learning_rate": 4.990195277109403e-06, + "loss": 0.7246, + "step": 708 + }, + { + "epoch": 0.196398891966759, + "grad_norm": 0.7048695683479309, + "learning_rate": 4.9901630193932485e-06, + "loss": 0.7129, + "step": 709 + }, + { + "epoch": 0.19667590027700832, + "grad_norm": 0.5997457504272461, + "learning_rate": 4.990130708804573e-06, + "loss": 0.7224, + "step": 710 + }, + { + "epoch": 0.1969529085872576, + "grad_norm": 0.5950084328651428, + "learning_rate": 4.990098345344064e-06, + "loss": 0.7666, + "step": 711 + }, + { + "epoch": 0.19722991689750694, + "grad_norm": 0.5888988375663757, + "learning_rate": 4.9900659290124085e-06, + "loss": 0.7168, + "step": 712 + }, + { + "epoch": 0.19750692520775623, + "grad_norm": 0.5948391556739807, + "learning_rate": 4.990033459810294e-06, + "loss": 0.6632, + "step": 713 + }, + { + "epoch": 0.19778393351800555, + "grad_norm": 0.5904232263565063, + "learning_rate": 4.9900009377384094e-06, + "loss": 0.6901, + "step": 714 + }, + { + "epoch": 0.19806094182825484, + "grad_norm": 0.6024726033210754, + "learning_rate": 4.9899683627974475e-06, + "loss": 0.7396, + "step": 715 + }, + { + "epoch": 0.19833795013850417, + "grad_norm": 0.6176775693893433, + "learning_rate": 4.989935734988098e-06, + "loss": 0.7345, + "step": 716 + }, + { + "epoch": 0.19861495844875346, + "grad_norm": 0.6300408840179443, + "learning_rate": 4.989903054311055e-06, + "loss": 0.7106, + "step": 717 + }, + { + "epoch": 0.19889196675900278, + "grad_norm": 0.6302835941314697, + "learning_rate": 4.9898703207670106e-06, + "loss": 0.7154, + "step": 718 + }, + { + "epoch": 0.19916897506925207, + "grad_norm": 0.5916628241539001, + "learning_rate": 4.989837534356661e-06, + "loss": 0.7096, + "step": 719 + }, + { + "epoch": 0.1994459833795014, + "grad_norm": 0.6278592944145203, + "learning_rate": 4.989804695080703e-06, + "loss": 0.7551, + "step": 720 + }, + { + "epoch": 0.1997229916897507, + "grad_norm": 0.6208764314651489, + "learning_rate": 4.989771802939831e-06, + "loss": 0.7568, + "step": 721 + }, + { + "epoch": 0.2, + "grad_norm": 0.6690845489501953, + "learning_rate": 4.989738857934747e-06, + "loss": 0.7503, + "step": 722 + }, + { + "epoch": 0.2002770083102493, + "grad_norm": 0.6054900288581848, + "learning_rate": 4.989705860066149e-06, + "loss": 0.709, + "step": 723 + }, + { + "epoch": 0.20055401662049863, + "grad_norm": 0.6240448951721191, + "learning_rate": 4.989672809334738e-06, + "loss": 0.6887, + "step": 724 + }, + { + "epoch": 0.20083102493074792, + "grad_norm": 0.5848708748817444, + "learning_rate": 4.989639705741214e-06, + "loss": 0.733, + "step": 725 + }, + { + "epoch": 0.20110803324099724, + "grad_norm": 0.6031309962272644, + "learning_rate": 4.989606549286282e-06, + "loss": 0.6822, + "step": 726 + }, + { + "epoch": 0.20138504155124654, + "grad_norm": 0.6088325381278992, + "learning_rate": 4.989573339970645e-06, + "loss": 0.7376, + "step": 727 + }, + { + "epoch": 0.20166204986149586, + "grad_norm": 0.6401178240776062, + "learning_rate": 4.989540077795009e-06, + "loss": 0.6911, + "step": 728 + }, + { + "epoch": 0.20193905817174515, + "grad_norm": 0.642235517501831, + "learning_rate": 4.989506762760079e-06, + "loss": 0.7624, + "step": 729 + }, + { + "epoch": 0.20221606648199447, + "grad_norm": 0.6320951581001282, + "learning_rate": 4.989473394866563e-06, + "loss": 0.7246, + "step": 730 + }, + { + "epoch": 0.20249307479224377, + "grad_norm": 0.6383892893791199, + "learning_rate": 4.989439974115169e-06, + "loss": 0.7692, + "step": 731 + }, + { + "epoch": 0.2027700831024931, + "grad_norm": 0.5928354263305664, + "learning_rate": 4.989406500506607e-06, + "loss": 0.7087, + "step": 732 + }, + { + "epoch": 0.20304709141274238, + "grad_norm": 0.6310860514640808, + "learning_rate": 4.989372974041589e-06, + "loss": 0.7506, + "step": 733 + }, + { + "epoch": 0.2033240997229917, + "grad_norm": 0.5935929417610168, + "learning_rate": 4.989339394720824e-06, + "loss": 0.7258, + "step": 734 + }, + { + "epoch": 0.203601108033241, + "grad_norm": 0.6174879670143127, + "learning_rate": 4.989305762545027e-06, + "loss": 0.7278, + "step": 735 + }, + { + "epoch": 0.20387811634349032, + "grad_norm": 0.6096144318580627, + "learning_rate": 4.989272077514912e-06, + "loss": 0.7432, + "step": 736 + }, + { + "epoch": 0.2041551246537396, + "grad_norm": 0.6143541932106018, + "learning_rate": 4.9892383396311936e-06, + "loss": 0.6882, + "step": 737 + }, + { + "epoch": 0.20443213296398893, + "grad_norm": 0.6350477337837219, + "learning_rate": 4.989204548894589e-06, + "loss": 0.7577, + "step": 738 + }, + { + "epoch": 0.20470914127423823, + "grad_norm": 0.5977352857589722, + "learning_rate": 4.989170705305815e-06, + "loss": 0.765, + "step": 739 + }, + { + "epoch": 0.20498614958448755, + "grad_norm": 0.613018274307251, + "learning_rate": 4.98913680886559e-06, + "loss": 0.7169, + "step": 740 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 0.6501178741455078, + "learning_rate": 4.989102859574634e-06, + "loss": 0.7803, + "step": 741 + }, + { + "epoch": 0.20554016620498616, + "grad_norm": 0.6331166625022888, + "learning_rate": 4.9890688574336675e-06, + "loss": 0.7322, + "step": 742 + }, + { + "epoch": 0.20581717451523546, + "grad_norm": 0.6125751733779907, + "learning_rate": 4.989034802443413e-06, + "loss": 0.7569, + "step": 743 + }, + { + "epoch": 0.20609418282548475, + "grad_norm": 0.6181766390800476, + "learning_rate": 4.989000694604593e-06, + "loss": 0.7721, + "step": 744 + }, + { + "epoch": 0.20637119113573407, + "grad_norm": 0.6107795834541321, + "learning_rate": 4.988966533917933e-06, + "loss": 0.7308, + "step": 745 + }, + { + "epoch": 0.20664819944598337, + "grad_norm": 0.6483302116394043, + "learning_rate": 4.988932320384157e-06, + "loss": 0.78, + "step": 746 + }, + { + "epoch": 0.2069252077562327, + "grad_norm": 0.6239367723464966, + "learning_rate": 4.988898054003991e-06, + "loss": 0.7576, + "step": 747 + }, + { + "epoch": 0.20720221606648198, + "grad_norm": 0.5802024602890015, + "learning_rate": 4.988863734778165e-06, + "loss": 0.6888, + "step": 748 + }, + { + "epoch": 0.2074792243767313, + "grad_norm": 0.5955031514167786, + "learning_rate": 4.988829362707404e-06, + "loss": 0.7015, + "step": 749 + }, + { + "epoch": 0.2077562326869806, + "grad_norm": 0.651870846748352, + "learning_rate": 4.98879493779244e-06, + "loss": 0.724, + "step": 750 + }, + { + "epoch": 0.20803324099722992, + "grad_norm": 0.6392204165458679, + "learning_rate": 4.9887604600340055e-06, + "loss": 0.7452, + "step": 751 + }, + { + "epoch": 0.2083102493074792, + "grad_norm": 0.5952602028846741, + "learning_rate": 4.98872592943283e-06, + "loss": 0.7044, + "step": 752 + }, + { + "epoch": 0.20858725761772853, + "grad_norm": 0.6271218061447144, + "learning_rate": 4.9886913459896475e-06, + "loss": 0.7442, + "step": 753 + }, + { + "epoch": 0.20886426592797783, + "grad_norm": 0.6348155736923218, + "learning_rate": 4.988656709705192e-06, + "loss": 0.6851, + "step": 754 + }, + { + "epoch": 0.20914127423822715, + "grad_norm": 0.6025179028511047, + "learning_rate": 4.9886220205802e-06, + "loss": 0.7291, + "step": 755 + }, + { + "epoch": 0.20941828254847644, + "grad_norm": 0.607294499874115, + "learning_rate": 4.9885872786154065e-06, + "loss": 0.7674, + "step": 756 + }, + { + "epoch": 0.20969529085872576, + "grad_norm": 0.5688076615333557, + "learning_rate": 4.98855248381155e-06, + "loss": 0.6903, + "step": 757 + }, + { + "epoch": 0.20997229916897506, + "grad_norm": 0.6307339072227478, + "learning_rate": 4.988517636169369e-06, + "loss": 0.7723, + "step": 758 + }, + { + "epoch": 0.21024930747922438, + "grad_norm": 0.569485068321228, + "learning_rate": 4.988482735689605e-06, + "loss": 0.6656, + "step": 759 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.6402713060379028, + "learning_rate": 4.988447782372996e-06, + "loss": 0.75, + "step": 760 + }, + { + "epoch": 0.210803324099723, + "grad_norm": 0.63001549243927, + "learning_rate": 4.988412776220286e-06, + "loss": 0.7192, + "step": 761 + }, + { + "epoch": 0.2110803324099723, + "grad_norm": 0.6070324778556824, + "learning_rate": 4.988377717232219e-06, + "loss": 0.7489, + "step": 762 + }, + { + "epoch": 0.2113573407202216, + "grad_norm": 0.5573827624320984, + "learning_rate": 4.9883426054095375e-06, + "loss": 0.6487, + "step": 763 + }, + { + "epoch": 0.2116343490304709, + "grad_norm": 0.6198872327804565, + "learning_rate": 4.988307440752989e-06, + "loss": 0.7175, + "step": 764 + }, + { + "epoch": 0.21191135734072022, + "grad_norm": 0.6368589401245117, + "learning_rate": 4.988272223263317e-06, + "loss": 0.7331, + "step": 765 + }, + { + "epoch": 0.21218836565096952, + "grad_norm": 0.6162055730819702, + "learning_rate": 4.988236952941273e-06, + "loss": 0.7289, + "step": 766 + }, + { + "epoch": 0.21246537396121884, + "grad_norm": 0.6234159469604492, + "learning_rate": 4.988201629787605e-06, + "loss": 0.7229, + "step": 767 + }, + { + "epoch": 0.21274238227146813, + "grad_norm": 0.6246781945228577, + "learning_rate": 4.98816625380306e-06, + "loss": 0.7409, + "step": 768 + }, + { + "epoch": 0.21301939058171745, + "grad_norm": 0.5837066173553467, + "learning_rate": 4.988130824988393e-06, + "loss": 0.6673, + "step": 769 + }, + { + "epoch": 0.21329639889196675, + "grad_norm": 0.6310555934906006, + "learning_rate": 4.988095343344354e-06, + "loss": 0.7375, + "step": 770 + }, + { + "epoch": 0.21357340720221607, + "grad_norm": 0.5986537933349609, + "learning_rate": 4.988059808871697e-06, + "loss": 0.6994, + "step": 771 + }, + { + "epoch": 0.21385041551246536, + "grad_norm": 0.6631371974945068, + "learning_rate": 4.988024221571177e-06, + "loss": 0.746, + "step": 772 + }, + { + "epoch": 0.21412742382271469, + "grad_norm": 0.627317488193512, + "learning_rate": 4.987988581443548e-06, + "loss": 0.7713, + "step": 773 + }, + { + "epoch": 0.21440443213296398, + "grad_norm": 0.5771815776824951, + "learning_rate": 4.987952888489569e-06, + "loss": 0.7191, + "step": 774 + }, + { + "epoch": 0.2146814404432133, + "grad_norm": 0.5975073575973511, + "learning_rate": 4.9879171427099956e-06, + "loss": 0.7169, + "step": 775 + }, + { + "epoch": 0.2149584487534626, + "grad_norm": 0.6115068197250366, + "learning_rate": 4.987881344105589e-06, + "loss": 0.7416, + "step": 776 + }, + { + "epoch": 0.21523545706371192, + "grad_norm": 0.633957028388977, + "learning_rate": 4.987845492677107e-06, + "loss": 0.7591, + "step": 777 + }, + { + "epoch": 0.2155124653739612, + "grad_norm": 0.6105467677116394, + "learning_rate": 4.987809588425312e-06, + "loss": 0.7584, + "step": 778 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 0.6133649945259094, + "learning_rate": 4.987773631350966e-06, + "loss": 0.7357, + "step": 779 + }, + { + "epoch": 0.21606648199445982, + "grad_norm": 0.607979416847229, + "learning_rate": 4.9877376214548335e-06, + "loss": 0.6934, + "step": 780 + }, + { + "epoch": 0.21634349030470915, + "grad_norm": 0.5923281311988831, + "learning_rate": 4.987701558737678e-06, + "loss": 0.7748, + "step": 781 + }, + { + "epoch": 0.21662049861495844, + "grad_norm": 0.6248568296432495, + "learning_rate": 4.9876654432002655e-06, + "loss": 0.7222, + "step": 782 + }, + { + "epoch": 0.21689750692520776, + "grad_norm": 0.624470055103302, + "learning_rate": 4.987629274843363e-06, + "loss": 0.7217, + "step": 783 + }, + { + "epoch": 0.21717451523545706, + "grad_norm": 0.6328449249267578, + "learning_rate": 4.9875930536677385e-06, + "loss": 0.7603, + "step": 784 + }, + { + "epoch": 0.21745152354570638, + "grad_norm": 0.608456552028656, + "learning_rate": 4.987556779674161e-06, + "loss": 0.7228, + "step": 785 + }, + { + "epoch": 0.21772853185595567, + "grad_norm": 0.8374035954475403, + "learning_rate": 4.9875204528633995e-06, + "loss": 0.7098, + "step": 786 + }, + { + "epoch": 0.218005540166205, + "grad_norm": 0.6189671158790588, + "learning_rate": 4.987484073236227e-06, + "loss": 0.7367, + "step": 787 + }, + { + "epoch": 0.21828254847645429, + "grad_norm": 0.6146777272224426, + "learning_rate": 4.987447640793415e-06, + "loss": 0.7274, + "step": 788 + }, + { + "epoch": 0.2185595567867036, + "grad_norm": 0.6332618594169617, + "learning_rate": 4.987411155535737e-06, + "loss": 0.7184, + "step": 789 + }, + { + "epoch": 0.2188365650969529, + "grad_norm": 0.6429280638694763, + "learning_rate": 4.9873746174639695e-06, + "loss": 0.7441, + "step": 790 + }, + { + "epoch": 0.21911357340720222, + "grad_norm": 0.6396970748901367, + "learning_rate": 4.987338026578885e-06, + "loss": 0.7532, + "step": 791 + }, + { + "epoch": 0.21939058171745152, + "grad_norm": 0.609308123588562, + "learning_rate": 4.987301382881263e-06, + "loss": 0.719, + "step": 792 + }, + { + "epoch": 0.21966759002770084, + "grad_norm": 0.6831649541854858, + "learning_rate": 4.987264686371881e-06, + "loss": 0.7554, + "step": 793 + }, + { + "epoch": 0.21994459833795013, + "grad_norm": 0.6445385813713074, + "learning_rate": 4.987227937051518e-06, + "loss": 0.7446, + "step": 794 + }, + { + "epoch": 0.22022160664819945, + "grad_norm": 0.6014829277992249, + "learning_rate": 4.987191134920954e-06, + "loss": 0.7416, + "step": 795 + }, + { + "epoch": 0.22049861495844875, + "grad_norm": 0.6108548045158386, + "learning_rate": 4.987154279980971e-06, + "loss": 0.7034, + "step": 796 + }, + { + "epoch": 0.22077562326869807, + "grad_norm": 0.6140116453170776, + "learning_rate": 4.9871173722323504e-06, + "loss": 0.6733, + "step": 797 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 0.6687027215957642, + "learning_rate": 4.987080411675877e-06, + "loss": 0.7755, + "step": 798 + }, + { + "epoch": 0.22132963988919668, + "grad_norm": 0.6575332880020142, + "learning_rate": 4.987043398312335e-06, + "loss": 0.7536, + "step": 799 + }, + { + "epoch": 0.22160664819944598, + "grad_norm": 0.6619399785995483, + "learning_rate": 4.98700633214251e-06, + "loss": 0.7511, + "step": 800 + }, + { + "epoch": 0.2218836565096953, + "grad_norm": 0.6481317281723022, + "learning_rate": 4.986969213167191e-06, + "loss": 0.7763, + "step": 801 + }, + { + "epoch": 0.2221606648199446, + "grad_norm": 0.6060312390327454, + "learning_rate": 4.986932041387163e-06, + "loss": 0.709, + "step": 802 + }, + { + "epoch": 0.2224376731301939, + "grad_norm": 0.6025817394256592, + "learning_rate": 4.986894816803217e-06, + "loss": 0.759, + "step": 803 + }, + { + "epoch": 0.2227146814404432, + "grad_norm": 0.6360222697257996, + "learning_rate": 4.986857539416144e-06, + "loss": 0.7556, + "step": 804 + }, + { + "epoch": 0.22299168975069253, + "grad_norm": 0.6374087333679199, + "learning_rate": 4.986820209226734e-06, + "loss": 0.7051, + "step": 805 + }, + { + "epoch": 0.22326869806094182, + "grad_norm": 0.6589286923408508, + "learning_rate": 4.9867828262357805e-06, + "loss": 0.7401, + "step": 806 + }, + { + "epoch": 0.22354570637119114, + "grad_norm": 0.6293102502822876, + "learning_rate": 4.986745390444077e-06, + "loss": 0.6715, + "step": 807 + }, + { + "epoch": 0.22382271468144044, + "grad_norm": 0.6365495324134827, + "learning_rate": 4.986707901852419e-06, + "loss": 0.7471, + "step": 808 + }, + { + "epoch": 0.22409972299168976, + "grad_norm": 0.5922935605049133, + "learning_rate": 4.986670360461601e-06, + "loss": 0.7118, + "step": 809 + }, + { + "epoch": 0.22437673130193905, + "grad_norm": 0.6289665102958679, + "learning_rate": 4.986632766272421e-06, + "loss": 0.7085, + "step": 810 + }, + { + "epoch": 0.22465373961218837, + "grad_norm": 0.6416603922843933, + "learning_rate": 4.986595119285678e-06, + "loss": 0.6975, + "step": 811 + }, + { + "epoch": 0.22493074792243767, + "grad_norm": 0.6333436965942383, + "learning_rate": 4.986557419502171e-06, + "loss": 0.6943, + "step": 812 + }, + { + "epoch": 0.225207756232687, + "grad_norm": 0.6685619354248047, + "learning_rate": 4.986519666922698e-06, + "loss": 0.7649, + "step": 813 + }, + { + "epoch": 0.22548476454293628, + "grad_norm": 0.6346633434295654, + "learning_rate": 4.986481861548064e-06, + "loss": 0.6692, + "step": 814 + }, + { + "epoch": 0.2257617728531856, + "grad_norm": 0.6223672032356262, + "learning_rate": 4.98644400337907e-06, + "loss": 0.7081, + "step": 815 + }, + { + "epoch": 0.2260387811634349, + "grad_norm": 0.6629369258880615, + "learning_rate": 4.986406092416521e-06, + "loss": 0.7281, + "step": 816 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 0.629992663860321, + "learning_rate": 4.98636812866122e-06, + "loss": 0.731, + "step": 817 + }, + { + "epoch": 0.22659279778393351, + "grad_norm": 0.6937189102172852, + "learning_rate": 4.986330112113974e-06, + "loss": 0.7315, + "step": 818 + }, + { + "epoch": 0.22686980609418284, + "grad_norm": 0.6419219374656677, + "learning_rate": 4.9862920427755905e-06, + "loss": 0.7175, + "step": 819 + }, + { + "epoch": 0.22714681440443213, + "grad_norm": 0.6525835990905762, + "learning_rate": 4.986253920646878e-06, + "loss": 0.719, + "step": 820 + }, + { + "epoch": 0.22742382271468145, + "grad_norm": 0.6850484013557434, + "learning_rate": 4.986215745728645e-06, + "loss": 0.761, + "step": 821 + }, + { + "epoch": 0.22770083102493074, + "grad_norm": 0.6086088418960571, + "learning_rate": 4.986177518021703e-06, + "loss": 0.7367, + "step": 822 + }, + { + "epoch": 0.22797783933518007, + "grad_norm": 0.6392209529876709, + "learning_rate": 4.986139237526863e-06, + "loss": 0.7346, + "step": 823 + }, + { + "epoch": 0.22825484764542936, + "grad_norm": 0.6084146499633789, + "learning_rate": 4.9861009042449385e-06, + "loss": 0.7371, + "step": 824 + }, + { + "epoch": 0.22853185595567868, + "grad_norm": 0.6334027051925659, + "learning_rate": 4.986062518176743e-06, + "loss": 0.7736, + "step": 825 + }, + { + "epoch": 0.22880886426592797, + "grad_norm": 0.610819935798645, + "learning_rate": 4.986024079323092e-06, + "loss": 0.7199, + "step": 826 + }, + { + "epoch": 0.2290858725761773, + "grad_norm": 0.6382309198379517, + "learning_rate": 4.9859855876848e-06, + "loss": 0.7258, + "step": 827 + }, + { + "epoch": 0.2293628808864266, + "grad_norm": 0.6213927268981934, + "learning_rate": 4.985947043262686e-06, + "loss": 0.7467, + "step": 828 + }, + { + "epoch": 0.2296398891966759, + "grad_norm": 0.66412752866745, + "learning_rate": 4.985908446057568e-06, + "loss": 0.741, + "step": 829 + }, + { + "epoch": 0.2299168975069252, + "grad_norm": 0.6357157826423645, + "learning_rate": 4.985869796070265e-06, + "loss": 0.7309, + "step": 830 + }, + { + "epoch": 0.23019390581717453, + "grad_norm": 0.6128206849098206, + "learning_rate": 4.985831093301599e-06, + "loss": 0.7136, + "step": 831 + }, + { + "epoch": 0.23047091412742382, + "grad_norm": 0.6191337704658508, + "learning_rate": 4.98579233775239e-06, + "loss": 0.707, + "step": 832 + }, + { + "epoch": 0.23074792243767314, + "grad_norm": 0.6454660892486572, + "learning_rate": 4.9857535294234615e-06, + "loss": 0.7063, + "step": 833 + }, + { + "epoch": 0.23102493074792244, + "grad_norm": 0.6227606534957886, + "learning_rate": 4.985714668315637e-06, + "loss": 0.6786, + "step": 834 + }, + { + "epoch": 0.23130193905817176, + "grad_norm": 0.6117094159126282, + "learning_rate": 4.985675754429744e-06, + "loss": 0.6787, + "step": 835 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 0.6687617301940918, + "learning_rate": 4.985636787766605e-06, + "loss": 0.7481, + "step": 836 + }, + { + "epoch": 0.23185595567867037, + "grad_norm": 0.6518902778625488, + "learning_rate": 4.985597768327051e-06, + "loss": 0.6721, + "step": 837 + }, + { + "epoch": 0.23213296398891967, + "grad_norm": 0.6399495005607605, + "learning_rate": 4.9855586961119086e-06, + "loss": 0.7481, + "step": 838 + }, + { + "epoch": 0.232409972299169, + "grad_norm": 0.6600784063339233, + "learning_rate": 4.985519571122007e-06, + "loss": 0.7392, + "step": 839 + }, + { + "epoch": 0.23268698060941828, + "grad_norm": 0.6567347049713135, + "learning_rate": 4.985480393358178e-06, + "loss": 0.7524, + "step": 840 + }, + { + "epoch": 0.2329639889196676, + "grad_norm": 0.6045350432395935, + "learning_rate": 4.985441162821253e-06, + "loss": 0.6875, + "step": 841 + }, + { + "epoch": 0.2332409972299169, + "grad_norm": 0.6198522448539734, + "learning_rate": 4.985401879512065e-06, + "loss": 0.7168, + "step": 842 + }, + { + "epoch": 0.23351800554016622, + "grad_norm": 0.6399564146995544, + "learning_rate": 4.985362543431449e-06, + "loss": 0.7649, + "step": 843 + }, + { + "epoch": 0.2337950138504155, + "grad_norm": 0.6472421884536743, + "learning_rate": 4.985323154580237e-06, + "loss": 0.7257, + "step": 844 + }, + { + "epoch": 0.23407202216066483, + "grad_norm": 0.6288352608680725, + "learning_rate": 4.98528371295927e-06, + "loss": 0.7581, + "step": 845 + }, + { + "epoch": 0.23434903047091413, + "grad_norm": 0.664064884185791, + "learning_rate": 4.985244218569382e-06, + "loss": 0.7395, + "step": 846 + }, + { + "epoch": 0.23462603878116345, + "grad_norm": 0.6480230689048767, + "learning_rate": 4.985204671411412e-06, + "loss": 0.7486, + "step": 847 + }, + { + "epoch": 0.23490304709141274, + "grad_norm": 0.661375105381012, + "learning_rate": 4.985165071486201e-06, + "loss": 0.7107, + "step": 848 + }, + { + "epoch": 0.23518005540166206, + "grad_norm": 0.6101104021072388, + "learning_rate": 4.985125418794589e-06, + "loss": 0.6472, + "step": 849 + }, + { + "epoch": 0.23545706371191136, + "grad_norm": 0.6327786445617676, + "learning_rate": 4.985085713337418e-06, + "loss": 0.7367, + "step": 850 + }, + { + "epoch": 0.23573407202216065, + "grad_norm": 0.645991325378418, + "learning_rate": 4.9850459551155316e-06, + "loss": 0.7331, + "step": 851 + }, + { + "epoch": 0.23601108033240997, + "grad_norm": 0.6416239738464355, + "learning_rate": 4.985006144129772e-06, + "loss": 0.7287, + "step": 852 + }, + { + "epoch": 0.23628808864265927, + "grad_norm": 0.6345062851905823, + "learning_rate": 4.984966280380987e-06, + "loss": 0.7065, + "step": 853 + }, + { + "epoch": 0.2365650969529086, + "grad_norm": 0.627337634563446, + "learning_rate": 4.984926363870023e-06, + "loss": 0.7425, + "step": 854 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.5949876308441162, + "learning_rate": 4.984886394597725e-06, + "loss": 0.6768, + "step": 855 + }, + { + "epoch": 0.2371191135734072, + "grad_norm": 0.6694568395614624, + "learning_rate": 4.984846372564943e-06, + "loss": 0.6219, + "step": 856 + }, + { + "epoch": 0.2373961218836565, + "grad_norm": 0.6464795470237732, + "learning_rate": 4.9848062977725275e-06, + "loss": 0.7199, + "step": 857 + }, + { + "epoch": 0.23767313019390582, + "grad_norm": 0.6215987801551819, + "learning_rate": 4.984766170221329e-06, + "loss": 0.6829, + "step": 858 + }, + { + "epoch": 0.2379501385041551, + "grad_norm": 0.6603294014930725, + "learning_rate": 4.9847259899121995e-06, + "loss": 0.7445, + "step": 859 + }, + { + "epoch": 0.23822714681440443, + "grad_norm": 0.6341590881347656, + "learning_rate": 4.984685756845991e-06, + "loss": 0.7204, + "step": 860 + }, + { + "epoch": 0.23850415512465373, + "grad_norm": 0.6417688727378845, + "learning_rate": 4.984645471023559e-06, + "loss": 0.7094, + "step": 861 + }, + { + "epoch": 0.23878116343490305, + "grad_norm": 0.6469441056251526, + "learning_rate": 4.984605132445759e-06, + "loss": 0.7574, + "step": 862 + }, + { + "epoch": 0.23905817174515234, + "grad_norm": 0.6261152625083923, + "learning_rate": 4.984564741113447e-06, + "loss": 0.7736, + "step": 863 + }, + { + "epoch": 0.23933518005540166, + "grad_norm": 0.6391873955726624, + "learning_rate": 4.98452429702748e-06, + "loss": 0.7671, + "step": 864 + }, + { + "epoch": 0.23961218836565096, + "grad_norm": 0.6160810589790344, + "learning_rate": 4.984483800188718e-06, + "loss": 0.7399, + "step": 865 + }, + { + "epoch": 0.23988919667590028, + "grad_norm": 0.6426335573196411, + "learning_rate": 4.98444325059802e-06, + "loss": 0.6894, + "step": 866 + }, + { + "epoch": 0.24016620498614957, + "grad_norm": 0.6205769777297974, + "learning_rate": 4.984402648256247e-06, + "loss": 0.73, + "step": 867 + }, + { + "epoch": 0.2404432132963989, + "grad_norm": 0.6322343349456787, + "learning_rate": 4.9843619931642615e-06, + "loss": 0.7144, + "step": 868 + }, + { + "epoch": 0.2407202216066482, + "grad_norm": 0.6147611737251282, + "learning_rate": 4.984321285322927e-06, + "loss": 0.7145, + "step": 869 + }, + { + "epoch": 0.2409972299168975, + "grad_norm": 0.6592707633972168, + "learning_rate": 4.984280524733107e-06, + "loss": 0.7235, + "step": 870 + }, + { + "epoch": 0.2412742382271468, + "grad_norm": 0.6175403594970703, + "learning_rate": 4.984239711395667e-06, + "loss": 0.7058, + "step": 871 + }, + { + "epoch": 0.24155124653739612, + "grad_norm": 0.6178988218307495, + "learning_rate": 4.984198845311474e-06, + "loss": 0.7353, + "step": 872 + }, + { + "epoch": 0.24182825484764542, + "grad_norm": 0.6353136897087097, + "learning_rate": 4.984157926481396e-06, + "loss": 0.6959, + "step": 873 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 0.6403223276138306, + "learning_rate": 4.9841169549063e-06, + "loss": 0.71, + "step": 874 + }, + { + "epoch": 0.24238227146814403, + "grad_norm": 0.7069513201713562, + "learning_rate": 4.984075930587059e-06, + "loss": 0.7327, + "step": 875 + }, + { + "epoch": 0.24265927977839336, + "grad_norm": 0.6697827577590942, + "learning_rate": 4.984034853524542e-06, + "loss": 0.7323, + "step": 876 + }, + { + "epoch": 0.24293628808864265, + "grad_norm": 0.6514555811882019, + "learning_rate": 4.983993723719621e-06, + "loss": 0.6917, + "step": 877 + }, + { + "epoch": 0.24321329639889197, + "grad_norm": 0.6172816157341003, + "learning_rate": 4.9839525411731705e-06, + "loss": 0.7335, + "step": 878 + }, + { + "epoch": 0.24349030470914126, + "grad_norm": 0.6580623388290405, + "learning_rate": 4.983911305886063e-06, + "loss": 0.7731, + "step": 879 + }, + { + "epoch": 0.24376731301939059, + "grad_norm": 0.6681740880012512, + "learning_rate": 4.983870017859177e-06, + "loss": 0.7812, + "step": 880 + }, + { + "epoch": 0.24404432132963988, + "grad_norm": 0.6525234580039978, + "learning_rate": 4.983828677093386e-06, + "loss": 0.6724, + "step": 881 + }, + { + "epoch": 0.2443213296398892, + "grad_norm": 0.6404354572296143, + "learning_rate": 4.98378728358957e-06, + "loss": 0.7743, + "step": 882 + }, + { + "epoch": 0.2445983379501385, + "grad_norm": 0.6499460935592651, + "learning_rate": 4.983745837348606e-06, + "loss": 0.7337, + "step": 883 + }, + { + "epoch": 0.24487534626038782, + "grad_norm": 0.6775996685028076, + "learning_rate": 4.983704338371375e-06, + "loss": 0.7413, + "step": 884 + }, + { + "epoch": 0.2451523545706371, + "grad_norm": 0.6776655912399292, + "learning_rate": 4.98366278665876e-06, + "loss": 0.7211, + "step": 885 + }, + { + "epoch": 0.24542936288088643, + "grad_norm": 0.6310790777206421, + "learning_rate": 4.9836211822116405e-06, + "loss": 0.6957, + "step": 886 + }, + { + "epoch": 0.24570637119113573, + "grad_norm": 0.5974736213684082, + "learning_rate": 4.983579525030901e-06, + "loss": 0.7169, + "step": 887 + }, + { + "epoch": 0.24598337950138505, + "grad_norm": 0.6585884690284729, + "learning_rate": 4.983537815117425e-06, + "loss": 0.7598, + "step": 888 + }, + { + "epoch": 0.24626038781163434, + "grad_norm": 0.6152383685112, + "learning_rate": 4.9834960524721e-06, + "loss": 0.7222, + "step": 889 + }, + { + "epoch": 0.24653739612188366, + "grad_norm": 0.6026103496551514, + "learning_rate": 4.98345423709581e-06, + "loss": 0.7385, + "step": 890 + }, + { + "epoch": 0.24681440443213296, + "grad_norm": 0.5877223610877991, + "learning_rate": 4.983412368989447e-06, + "loss": 0.6195, + "step": 891 + }, + { + "epoch": 0.24709141274238228, + "grad_norm": 0.648893415927887, + "learning_rate": 4.983370448153896e-06, + "loss": 0.7223, + "step": 892 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 0.6290050745010376, + "learning_rate": 4.983328474590048e-06, + "loss": 0.6909, + "step": 893 + }, + { + "epoch": 0.2476454293628809, + "grad_norm": 0.6397932171821594, + "learning_rate": 4.983286448298795e-06, + "loss": 0.7623, + "step": 894 + }, + { + "epoch": 0.24792243767313019, + "grad_norm": 0.6201431155204773, + "learning_rate": 4.98324436928103e-06, + "loss": 0.6838, + "step": 895 + }, + { + "epoch": 0.2481994459833795, + "grad_norm": 0.636030375957489, + "learning_rate": 4.983202237537645e-06, + "loss": 0.7118, + "step": 896 + }, + { + "epoch": 0.2484764542936288, + "grad_norm": 0.6343379020690918, + "learning_rate": 4.9831600530695345e-06, + "loss": 0.7253, + "step": 897 + }, + { + "epoch": 0.24875346260387812, + "grad_norm": 0.6185975670814514, + "learning_rate": 4.983117815877596e-06, + "loss": 0.679, + "step": 898 + }, + { + "epoch": 0.24903047091412742, + "grad_norm": 0.6295162439346313, + "learning_rate": 4.983075525962724e-06, + "loss": 0.6748, + "step": 899 + }, + { + "epoch": 0.24930747922437674, + "grad_norm": 0.6754584312438965, + "learning_rate": 4.983033183325818e-06, + "loss": 0.6952, + "step": 900 + }, + { + "epoch": 0.24958448753462603, + "grad_norm": 0.6608603000640869, + "learning_rate": 4.982990787967777e-06, + "loss": 0.6945, + "step": 901 + }, + { + "epoch": 0.24986149584487535, + "grad_norm": 0.6505388617515564, + "learning_rate": 4.982948339889499e-06, + "loss": 0.6935, + "step": 902 + }, + { + "epoch": 0.2501385041551247, + "grad_norm": 0.6118568778038025, + "learning_rate": 4.9829058390918885e-06, + "loss": 0.7013, + "step": 903 + }, + { + "epoch": 0.25041551246537397, + "grad_norm": 0.6262096762657166, + "learning_rate": 4.982863285575846e-06, + "loss": 0.7221, + "step": 904 + }, + { + "epoch": 0.25069252077562326, + "grad_norm": 0.6136447787284851, + "learning_rate": 4.982820679342276e-06, + "loss": 0.6722, + "step": 905 + }, + { + "epoch": 0.25096952908587256, + "grad_norm": 0.6317170858383179, + "learning_rate": 4.982778020392082e-06, + "loss": 0.712, + "step": 906 + }, + { + "epoch": 0.2512465373961219, + "grad_norm": 0.6549311876296997, + "learning_rate": 4.98273530872617e-06, + "loss": 0.7045, + "step": 907 + }, + { + "epoch": 0.2515235457063712, + "grad_norm": 0.6151458621025085, + "learning_rate": 4.982692544345448e-06, + "loss": 0.663, + "step": 908 + }, + { + "epoch": 0.2518005540166205, + "grad_norm": 0.6190466284751892, + "learning_rate": 4.9826497272508226e-06, + "loss": 0.6824, + "step": 909 + }, + { + "epoch": 0.2520775623268698, + "grad_norm": 0.6535850763320923, + "learning_rate": 4.982606857443203e-06, + "loss": 0.6979, + "step": 910 + }, + { + "epoch": 0.25235457063711914, + "grad_norm": 0.6364226341247559, + "learning_rate": 4.982563934923501e-06, + "loss": 0.7438, + "step": 911 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 0.6416041254997253, + "learning_rate": 4.982520959692626e-06, + "loss": 0.7186, + "step": 912 + }, + { + "epoch": 0.2529085872576177, + "grad_norm": 0.6356797814369202, + "learning_rate": 4.982477931751492e-06, + "loss": 0.7231, + "step": 913 + }, + { + "epoch": 0.253185595567867, + "grad_norm": 0.6105824708938599, + "learning_rate": 4.9824348511010115e-06, + "loss": 0.687, + "step": 914 + }, + { + "epoch": 0.25346260387811637, + "grad_norm": 0.6553460955619812, + "learning_rate": 4.9823917177421e-06, + "loss": 0.749, + "step": 915 + }, + { + "epoch": 0.25373961218836566, + "grad_norm": 0.6527493000030518, + "learning_rate": 4.982348531675673e-06, + "loss": 0.7491, + "step": 916 + }, + { + "epoch": 0.25401662049861495, + "grad_norm": 0.6494553685188293, + "learning_rate": 4.982305292902647e-06, + "loss": 0.7337, + "step": 917 + }, + { + "epoch": 0.25429362880886425, + "grad_norm": 0.6618565320968628, + "learning_rate": 4.982262001423941e-06, + "loss": 0.72, + "step": 918 + }, + { + "epoch": 0.2545706371191136, + "grad_norm": 0.6225207448005676, + "learning_rate": 4.982218657240473e-06, + "loss": 0.7314, + "step": 919 + }, + { + "epoch": 0.2548476454293629, + "grad_norm": 0.6372534036636353, + "learning_rate": 4.982175260353164e-06, + "loss": 0.7407, + "step": 920 + }, + { + "epoch": 0.2551246537396122, + "grad_norm": 0.633301317691803, + "learning_rate": 4.982131810762937e-06, + "loss": 0.6913, + "step": 921 + }, + { + "epoch": 0.2554016620498615, + "grad_norm": 0.6848824620246887, + "learning_rate": 4.982088308470712e-06, + "loss": 0.7883, + "step": 922 + }, + { + "epoch": 0.2556786703601108, + "grad_norm": 0.6465626955032349, + "learning_rate": 4.982044753477413e-06, + "loss": 0.7233, + "step": 923 + }, + { + "epoch": 0.2559556786703601, + "grad_norm": 0.6425303220748901, + "learning_rate": 4.982001145783966e-06, + "loss": 0.7849, + "step": 924 + }, + { + "epoch": 0.2562326869806094, + "grad_norm": 0.6424234509468079, + "learning_rate": 4.981957485391297e-06, + "loss": 0.7498, + "step": 925 + }, + { + "epoch": 0.2565096952908587, + "grad_norm": 0.6227364540100098, + "learning_rate": 4.981913772300332e-06, + "loss": 0.6802, + "step": 926 + }, + { + "epoch": 0.25678670360110806, + "grad_norm": 0.6455993056297302, + "learning_rate": 4.981870006511999e-06, + "loss": 0.7114, + "step": 927 + }, + { + "epoch": 0.25706371191135735, + "grad_norm": 0.6288533806800842, + "learning_rate": 4.981826188027228e-06, + "loss": 0.6904, + "step": 928 + }, + { + "epoch": 0.25734072022160664, + "grad_norm": 0.6399247646331787, + "learning_rate": 4.981782316846948e-06, + "loss": 0.7539, + "step": 929 + }, + { + "epoch": 0.25761772853185594, + "grad_norm": 0.6298650503158569, + "learning_rate": 4.981738392972093e-06, + "loss": 0.7166, + "step": 930 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 0.6303683519363403, + "learning_rate": 4.9816944164035944e-06, + "loss": 0.6957, + "step": 931 + }, + { + "epoch": 0.2581717451523546, + "grad_norm": 0.6419861316680908, + "learning_rate": 4.981650387142385e-06, + "loss": 0.7509, + "step": 932 + }, + { + "epoch": 0.2584487534626039, + "grad_norm": 0.6510540843009949, + "learning_rate": 4.981606305189402e-06, + "loss": 0.7132, + "step": 933 + }, + { + "epoch": 0.25872576177285317, + "grad_norm": 0.6482325792312622, + "learning_rate": 4.981562170545578e-06, + "loss": 0.6979, + "step": 934 + }, + { + "epoch": 0.2590027700831025, + "grad_norm": 0.6792583465576172, + "learning_rate": 4.9815179832118525e-06, + "loss": 0.7653, + "step": 935 + }, + { + "epoch": 0.2592797783933518, + "grad_norm": 0.6865032911300659, + "learning_rate": 4.981473743189163e-06, + "loss": 0.7413, + "step": 936 + }, + { + "epoch": 0.2595567867036011, + "grad_norm": 0.6258604526519775, + "learning_rate": 4.9814294504784484e-06, + "loss": 0.7152, + "step": 937 + }, + { + "epoch": 0.2598337950138504, + "grad_norm": 0.6872163414955139, + "learning_rate": 4.98138510508065e-06, + "loss": 0.7077, + "step": 938 + }, + { + "epoch": 0.26011080332409975, + "grad_norm": 0.6078726649284363, + "learning_rate": 4.98134070699671e-06, + "loss": 0.662, + "step": 939 + }, + { + "epoch": 0.26038781163434904, + "grad_norm": 0.6605315804481506, + "learning_rate": 4.98129625622757e-06, + "loss": 0.759, + "step": 940 + }, + { + "epoch": 0.26066481994459834, + "grad_norm": 0.691207230091095, + "learning_rate": 4.981251752774173e-06, + "loss": 0.7287, + "step": 941 + }, + { + "epoch": 0.26094182825484763, + "grad_norm": 0.6527872681617737, + "learning_rate": 4.981207196637465e-06, + "loss": 0.696, + "step": 942 + }, + { + "epoch": 0.261218836565097, + "grad_norm": 0.6540459990501404, + "learning_rate": 4.981162587818393e-06, + "loss": 0.7408, + "step": 943 + }, + { + "epoch": 0.2614958448753463, + "grad_norm": 0.6336873769760132, + "learning_rate": 4.981117926317902e-06, + "loss": 0.774, + "step": 944 + }, + { + "epoch": 0.26177285318559557, + "grad_norm": 0.6425933837890625, + "learning_rate": 4.981073212136942e-06, + "loss": 0.7575, + "step": 945 + }, + { + "epoch": 0.26204986149584486, + "grad_norm": 0.6415480971336365, + "learning_rate": 4.981028445276461e-06, + "loss": 0.7147, + "step": 946 + }, + { + "epoch": 0.2623268698060942, + "grad_norm": 0.6138240694999695, + "learning_rate": 4.980983625737411e-06, + "loss": 0.7271, + "step": 947 + }, + { + "epoch": 0.2626038781163435, + "grad_norm": 0.6276071667671204, + "learning_rate": 4.980938753520742e-06, + "loss": 0.7136, + "step": 948 + }, + { + "epoch": 0.2628808864265928, + "grad_norm": 0.6506159901618958, + "learning_rate": 4.980893828627408e-06, + "loss": 0.687, + "step": 949 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.6397688388824463, + "learning_rate": 4.980848851058363e-06, + "loss": 0.7399, + "step": 950 + }, + { + "epoch": 0.26343490304709144, + "grad_norm": 0.6355700492858887, + "learning_rate": 4.980803820814562e-06, + "loss": 0.7082, + "step": 951 + }, + { + "epoch": 0.26371191135734073, + "grad_norm": 0.6730945706367493, + "learning_rate": 4.980758737896959e-06, + "loss": 0.7546, + "step": 952 + }, + { + "epoch": 0.26398891966759, + "grad_norm": 0.6312103271484375, + "learning_rate": 4.980713602306514e-06, + "loss": 0.6864, + "step": 953 + }, + { + "epoch": 0.2642659279778393, + "grad_norm": 0.640343189239502, + "learning_rate": 4.980668414044183e-06, + "loss": 0.6764, + "step": 954 + }, + { + "epoch": 0.26454293628808867, + "grad_norm": 0.6530020833015442, + "learning_rate": 4.980623173110928e-06, + "loss": 0.7476, + "step": 955 + }, + { + "epoch": 0.26481994459833796, + "grad_norm": 0.6287079453468323, + "learning_rate": 4.9805778795077074e-06, + "loss": 0.7068, + "step": 956 + }, + { + "epoch": 0.26509695290858726, + "grad_norm": 0.6119797825813293, + "learning_rate": 4.980532533235484e-06, + "loss": 0.7638, + "step": 957 + }, + { + "epoch": 0.26537396121883655, + "grad_norm": 0.6286291480064392, + "learning_rate": 4.98048713429522e-06, + "loss": 0.7404, + "step": 958 + }, + { + "epoch": 0.26565096952908585, + "grad_norm": 0.6268792748451233, + "learning_rate": 4.98044168268788e-06, + "loss": 0.7177, + "step": 959 + }, + { + "epoch": 0.2659279778393352, + "grad_norm": 0.639512300491333, + "learning_rate": 4.980396178414429e-06, + "loss": 0.7573, + "step": 960 + }, + { + "epoch": 0.2662049861495845, + "grad_norm": 0.6351445317268372, + "learning_rate": 4.980350621475832e-06, + "loss": 0.7487, + "step": 961 + }, + { + "epoch": 0.2664819944598338, + "grad_norm": 0.8666406273841858, + "learning_rate": 4.980305011873058e-06, + "loss": 0.6926, + "step": 962 + }, + { + "epoch": 0.2667590027700831, + "grad_norm": 0.6362487077713013, + "learning_rate": 4.980259349607074e-06, + "loss": 0.7169, + "step": 963 + }, + { + "epoch": 0.2670360110803324, + "grad_norm": 0.6595889925956726, + "learning_rate": 4.98021363467885e-06, + "loss": 0.7173, + "step": 964 + }, + { + "epoch": 0.2673130193905817, + "grad_norm": 0.6907031536102295, + "learning_rate": 4.9801678670893575e-06, + "loss": 0.7666, + "step": 965 + }, + { + "epoch": 0.267590027700831, + "grad_norm": 0.6077108979225159, + "learning_rate": 4.980122046839567e-06, + "loss": 0.6915, + "step": 966 + }, + { + "epoch": 0.2678670360110803, + "grad_norm": 0.685006320476532, + "learning_rate": 4.980076173930452e-06, + "loss": 0.7328, + "step": 967 + }, + { + "epoch": 0.26814404432132966, + "grad_norm": 0.65442955493927, + "learning_rate": 4.980030248362987e-06, + "loss": 0.7229, + "step": 968 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 0.6928679943084717, + "learning_rate": 4.979984270138146e-06, + "loss": 0.7643, + "step": 969 + }, + { + "epoch": 0.26869806094182824, + "grad_norm": 0.6740157008171082, + "learning_rate": 4.979938239256905e-06, + "loss": 0.6848, + "step": 970 + }, + { + "epoch": 0.26897506925207754, + "grad_norm": 0.675374448299408, + "learning_rate": 4.979892155720243e-06, + "loss": 0.725, + "step": 971 + }, + { + "epoch": 0.2692520775623269, + "grad_norm": 0.6164215207099915, + "learning_rate": 4.979846019529138e-06, + "loss": 0.6557, + "step": 972 + }, + { + "epoch": 0.2695290858725762, + "grad_norm": 0.6450408101081848, + "learning_rate": 4.979799830684568e-06, + "loss": 0.7266, + "step": 973 + }, + { + "epoch": 0.2698060941828255, + "grad_norm": 0.6351450085639954, + "learning_rate": 4.9797535891875156e-06, + "loss": 0.7281, + "step": 974 + }, + { + "epoch": 0.27008310249307477, + "grad_norm": 0.6664936542510986, + "learning_rate": 4.9797072950389625e-06, + "loss": 0.6896, + "step": 975 + }, + { + "epoch": 0.2703601108033241, + "grad_norm": 0.6203340888023376, + "learning_rate": 4.97966094823989e-06, + "loss": 0.6914, + "step": 976 + }, + { + "epoch": 0.2706371191135734, + "grad_norm": 0.6529099941253662, + "learning_rate": 4.979614548791283e-06, + "loss": 0.727, + "step": 977 + }, + { + "epoch": 0.2709141274238227, + "grad_norm": 0.6582198739051819, + "learning_rate": 4.979568096694128e-06, + "loss": 0.7072, + "step": 978 + }, + { + "epoch": 0.271191135734072, + "grad_norm": 0.6922197937965393, + "learning_rate": 4.979521591949409e-06, + "loss": 0.7379, + "step": 979 + }, + { + "epoch": 0.27146814404432135, + "grad_norm": 0.5990357398986816, + "learning_rate": 4.979475034558115e-06, + "loss": 0.6885, + "step": 980 + }, + { + "epoch": 0.27174515235457064, + "grad_norm": 0.5973677635192871, + "learning_rate": 4.979428424521235e-06, + "loss": 0.711, + "step": 981 + }, + { + "epoch": 0.27202216066481993, + "grad_norm": 0.6486063599586487, + "learning_rate": 4.979381761839757e-06, + "loss": 0.722, + "step": 982 + }, + { + "epoch": 0.27229916897506923, + "grad_norm": 0.6575403809547424, + "learning_rate": 4.979335046514673e-06, + "loss": 0.7246, + "step": 983 + }, + { + "epoch": 0.2725761772853186, + "grad_norm": 0.6764856576919556, + "learning_rate": 4.979288278546974e-06, + "loss": 0.7574, + "step": 984 + }, + { + "epoch": 0.27285318559556787, + "grad_norm": 0.6495998501777649, + "learning_rate": 4.979241457937653e-06, + "loss": 0.7221, + "step": 985 + }, + { + "epoch": 0.27313019390581716, + "grad_norm": 0.6304432153701782, + "learning_rate": 4.979194584687706e-06, + "loss": 0.7644, + "step": 986 + }, + { + "epoch": 0.27340720221606646, + "grad_norm": 0.7039188146591187, + "learning_rate": 4.9791476587981255e-06, + "loss": 0.71, + "step": 987 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 0.6315945386886597, + "learning_rate": 4.979100680269909e-06, + "loss": 0.7015, + "step": 988 + }, + { + "epoch": 0.2739612188365651, + "grad_norm": 0.6366348266601562, + "learning_rate": 4.979053649104056e-06, + "loss": 0.7001, + "step": 989 + }, + { + "epoch": 0.2742382271468144, + "grad_norm": 0.5995786786079407, + "learning_rate": 4.979006565301561e-06, + "loss": 0.6959, + "step": 990 + }, + { + "epoch": 0.2745152354570637, + "grad_norm": 0.6954519152641296, + "learning_rate": 4.978959428863427e-06, + "loss": 0.7767, + "step": 991 + }, + { + "epoch": 0.27479224376731304, + "grad_norm": 0.6461866497993469, + "learning_rate": 4.978912239790653e-06, + "loss": 0.7315, + "step": 992 + }, + { + "epoch": 0.27506925207756233, + "grad_norm": 0.6643589735031128, + "learning_rate": 4.978864998084243e-06, + "loss": 0.7343, + "step": 993 + }, + { + "epoch": 0.2753462603878116, + "grad_norm": 0.6773586869239807, + "learning_rate": 4.978817703745198e-06, + "loss": 0.7251, + "step": 994 + }, + { + "epoch": 0.2756232686980609, + "grad_norm": 0.6450729370117188, + "learning_rate": 4.978770356774523e-06, + "loss": 0.7223, + "step": 995 + }, + { + "epoch": 0.27590027700831027, + "grad_norm": 0.6580459475517273, + "learning_rate": 4.978722957173222e-06, + "loss": 0.7083, + "step": 996 + }, + { + "epoch": 0.27617728531855956, + "grad_norm": 0.6939603090286255, + "learning_rate": 4.978675504942304e-06, + "loss": 0.6858, + "step": 997 + }, + { + "epoch": 0.27645429362880886, + "grad_norm": 0.623268187046051, + "learning_rate": 4.9786280000827754e-06, + "loss": 0.7417, + "step": 998 + }, + { + "epoch": 0.27673130193905815, + "grad_norm": 0.6503739356994629, + "learning_rate": 4.978580442595644e-06, + "loss": 0.7275, + "step": 999 + }, + { + "epoch": 0.2770083102493075, + "grad_norm": 0.6480211019515991, + "learning_rate": 4.978532832481921e-06, + "loss": 0.7379, + "step": 1000 + }, + { + "epoch": 0.2772853185595568, + "grad_norm": 0.6500074863433838, + "learning_rate": 4.9784851697426155e-06, + "loss": 0.6806, + "step": 1001 + }, + { + "epoch": 0.2775623268698061, + "grad_norm": 0.6740723848342896, + "learning_rate": 4.978437454378741e-06, + "loss": 0.7479, + "step": 1002 + }, + { + "epoch": 0.2778393351800554, + "grad_norm": 0.6438689827919006, + "learning_rate": 4.97838968639131e-06, + "loss": 0.6952, + "step": 1003 + }, + { + "epoch": 0.27811634349030473, + "grad_norm": 0.6317799687385559, + "learning_rate": 4.978341865781336e-06, + "loss": 0.7218, + "step": 1004 + }, + { + "epoch": 0.278393351800554, + "grad_norm": 0.6593930721282959, + "learning_rate": 4.978293992549837e-06, + "loss": 0.7173, + "step": 1005 + }, + { + "epoch": 0.2786703601108033, + "grad_norm": 0.6122359037399292, + "learning_rate": 4.9782460666978265e-06, + "loss": 0.7294, + "step": 1006 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 0.6612704992294312, + "learning_rate": 4.978198088226324e-06, + "loss": 0.7177, + "step": 1007 + }, + { + "epoch": 0.27922437673130196, + "grad_norm": 0.6342014670372009, + "learning_rate": 4.9781500571363465e-06, + "loss": 0.6808, + "step": 1008 + }, + { + "epoch": 0.27950138504155125, + "grad_norm": 0.6799290776252747, + "learning_rate": 4.978101973428915e-06, + "loss": 0.7603, + "step": 1009 + }, + { + "epoch": 0.27977839335180055, + "grad_norm": 0.6468144655227661, + "learning_rate": 4.97805383710505e-06, + "loss": 0.692, + "step": 1010 + }, + { + "epoch": 0.28005540166204984, + "grad_norm": 0.6199710369110107, + "learning_rate": 4.978005648165774e-06, + "loss": 0.7059, + "step": 1011 + }, + { + "epoch": 0.2803324099722992, + "grad_norm": 0.6647071242332458, + "learning_rate": 4.97795740661211e-06, + "loss": 0.7496, + "step": 1012 + }, + { + "epoch": 0.2806094182825485, + "grad_norm": 0.6575323939323425, + "learning_rate": 4.977909112445083e-06, + "loss": 0.7271, + "step": 1013 + }, + { + "epoch": 0.2808864265927978, + "grad_norm": 0.6441571116447449, + "learning_rate": 4.977860765665717e-06, + "loss": 0.7272, + "step": 1014 + }, + { + "epoch": 0.28116343490304707, + "grad_norm": 0.6441684365272522, + "learning_rate": 4.97781236627504e-06, + "loss": 0.7068, + "step": 1015 + }, + { + "epoch": 0.2814404432132964, + "grad_norm": 0.6357406377792358, + "learning_rate": 4.977763914274077e-06, + "loss": 0.7275, + "step": 1016 + }, + { + "epoch": 0.2817174515235457, + "grad_norm": 0.6525790691375732, + "learning_rate": 4.9777154096638605e-06, + "loss": 0.7323, + "step": 1017 + }, + { + "epoch": 0.281994459833795, + "grad_norm": 0.6308574676513672, + "learning_rate": 4.977666852445418e-06, + "loss": 0.6859, + "step": 1018 + }, + { + "epoch": 0.2822714681440443, + "grad_norm": 0.6342403292655945, + "learning_rate": 4.97761824261978e-06, + "loss": 0.7752, + "step": 1019 + }, + { + "epoch": 0.28254847645429365, + "grad_norm": 0.6695272922515869, + "learning_rate": 4.97756958018798e-06, + "loss": 0.6806, + "step": 1020 + }, + { + "epoch": 0.28282548476454294, + "grad_norm": 0.6361978054046631, + "learning_rate": 4.9775208651510505e-06, + "loss": 0.719, + "step": 1021 + }, + { + "epoch": 0.28310249307479224, + "grad_norm": 0.6587631106376648, + "learning_rate": 4.977472097510027e-06, + "loss": 0.7863, + "step": 1022 + }, + { + "epoch": 0.28337950138504153, + "grad_norm": 0.6483465433120728, + "learning_rate": 4.977423277265943e-06, + "loss": 0.7512, + "step": 1023 + }, + { + "epoch": 0.2836565096952909, + "grad_norm": 0.6564074754714966, + "learning_rate": 4.977374404419838e-06, + "loss": 0.7031, + "step": 1024 + }, + { + "epoch": 0.2839335180055402, + "grad_norm": 0.6523749828338623, + "learning_rate": 4.977325478972746e-06, + "loss": 0.68, + "step": 1025 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 0.7020553946495056, + "learning_rate": 4.977276500925708e-06, + "loss": 0.7362, + "step": 1026 + }, + { + "epoch": 0.28448753462603876, + "grad_norm": 0.7095143795013428, + "learning_rate": 4.977227470279764e-06, + "loss": 0.7392, + "step": 1027 + }, + { + "epoch": 0.2847645429362881, + "grad_norm": 0.6133536696434021, + "learning_rate": 4.977178387035955e-06, + "loss": 0.6867, + "step": 1028 + }, + { + "epoch": 0.2850415512465374, + "grad_norm": 0.6639204025268555, + "learning_rate": 4.977129251195322e-06, + "loss": 0.7362, + "step": 1029 + }, + { + "epoch": 0.2853185595567867, + "grad_norm": 0.6284003853797913, + "learning_rate": 4.97708006275891e-06, + "loss": 0.7444, + "step": 1030 + }, + { + "epoch": 0.285595567867036, + "grad_norm": 0.6329095363616943, + "learning_rate": 4.977030821727762e-06, + "loss": 0.6701, + "step": 1031 + }, + { + "epoch": 0.28587257617728534, + "grad_norm": 0.6395769715309143, + "learning_rate": 4.976981528102924e-06, + "loss": 0.7316, + "step": 1032 + }, + { + "epoch": 0.28614958448753464, + "grad_norm": 0.6371200084686279, + "learning_rate": 4.976932181885443e-06, + "loss": 0.7063, + "step": 1033 + }, + { + "epoch": 0.28642659279778393, + "grad_norm": 0.6195498704910278, + "learning_rate": 4.976882783076366e-06, + "loss": 0.6956, + "step": 1034 + }, + { + "epoch": 0.2867036011080332, + "grad_norm": 0.6352466940879822, + "learning_rate": 4.976833331676742e-06, + "loss": 0.756, + "step": 1035 + }, + { + "epoch": 0.2869806094182826, + "grad_norm": 0.6565999388694763, + "learning_rate": 4.976783827687622e-06, + "loss": 0.7226, + "step": 1036 + }, + { + "epoch": 0.28725761772853187, + "grad_norm": 0.6376464366912842, + "learning_rate": 4.9767342711100555e-06, + "loss": 0.7284, + "step": 1037 + }, + { + "epoch": 0.28753462603878116, + "grad_norm": 0.6268242001533508, + "learning_rate": 4.976684661945096e-06, + "loss": 0.7511, + "step": 1038 + }, + { + "epoch": 0.28781163434903045, + "grad_norm": 0.610745370388031, + "learning_rate": 4.976635000193796e-06, + "loss": 0.6825, + "step": 1039 + }, + { + "epoch": 0.2880886426592798, + "grad_norm": 0.6473029851913452, + "learning_rate": 4.976585285857211e-06, + "loss": 0.7226, + "step": 1040 + }, + { + "epoch": 0.2883656509695291, + "grad_norm": 0.6430656313896179, + "learning_rate": 4.976535518936395e-06, + "loss": 0.7128, + "step": 1041 + }, + { + "epoch": 0.2886426592797784, + "grad_norm": 0.6196282505989075, + "learning_rate": 4.9764856994324065e-06, + "loss": 0.7277, + "step": 1042 + }, + { + "epoch": 0.2889196675900277, + "grad_norm": 0.6294348835945129, + "learning_rate": 4.976435827346301e-06, + "loss": 0.66, + "step": 1043 + }, + { + "epoch": 0.28919667590027703, + "grad_norm": 0.640680193901062, + "learning_rate": 4.976385902679139e-06, + "loss": 0.7064, + "step": 1044 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 0.6399988532066345, + "learning_rate": 4.976335925431982e-06, + "loss": 0.6775, + "step": 1045 + }, + { + "epoch": 0.2897506925207756, + "grad_norm": 0.6854329109191895, + "learning_rate": 4.976285895605888e-06, + "loss": 0.7241, + "step": 1046 + }, + { + "epoch": 0.2900277008310249, + "grad_norm": 0.6851531267166138, + "learning_rate": 4.976235813201921e-06, + "loss": 0.7943, + "step": 1047 + }, + { + "epoch": 0.29030470914127426, + "grad_norm": 0.6644576191902161, + "learning_rate": 4.976185678221144e-06, + "loss": 0.6906, + "step": 1048 + }, + { + "epoch": 0.29058171745152356, + "grad_norm": 0.6867406964302063, + "learning_rate": 4.976135490664622e-06, + "loss": 0.7291, + "step": 1049 + }, + { + "epoch": 0.29085872576177285, + "grad_norm": 0.6741674542427063, + "learning_rate": 4.976085250533419e-06, + "loss": 0.7314, + "step": 1050 + }, + { + "epoch": 0.29113573407202215, + "grad_norm": 0.6466678977012634, + "learning_rate": 4.976034957828604e-06, + "loss": 0.719, + "step": 1051 + }, + { + "epoch": 0.2914127423822715, + "grad_norm": 0.6052623987197876, + "learning_rate": 4.975984612551243e-06, + "loss": 0.6441, + "step": 1052 + }, + { + "epoch": 0.2916897506925208, + "grad_norm": 0.62298583984375, + "learning_rate": 4.975934214702407e-06, + "loss": 0.6747, + "step": 1053 + }, + { + "epoch": 0.2919667590027701, + "grad_norm": 0.6360480785369873, + "learning_rate": 4.975883764283164e-06, + "loss": 0.7081, + "step": 1054 + }, + { + "epoch": 0.2922437673130194, + "grad_norm": 0.674088180065155, + "learning_rate": 4.9758332612945855e-06, + "loss": 0.7371, + "step": 1055 + }, + { + "epoch": 0.2925207756232687, + "grad_norm": 0.6503378748893738, + "learning_rate": 4.975782705737745e-06, + "loss": 0.7175, + "step": 1056 + }, + { + "epoch": 0.292797783933518, + "grad_norm": 0.6464011073112488, + "learning_rate": 4.975732097613715e-06, + "loss": 0.7175, + "step": 1057 + }, + { + "epoch": 0.2930747922437673, + "grad_norm": 0.6450788974761963, + "learning_rate": 4.97568143692357e-06, + "loss": 0.7429, + "step": 1058 + }, + { + "epoch": 0.2933518005540166, + "grad_norm": 0.672749936580658, + "learning_rate": 4.975630723668385e-06, + "loss": 0.7046, + "step": 1059 + }, + { + "epoch": 0.29362880886426596, + "grad_norm": 0.638154923915863, + "learning_rate": 4.975579957849239e-06, + "loss": 0.6932, + "step": 1060 + }, + { + "epoch": 0.29390581717451525, + "grad_norm": 0.6246246099472046, + "learning_rate": 4.975529139467208e-06, + "loss": 0.7035, + "step": 1061 + }, + { + "epoch": 0.29418282548476454, + "grad_norm": 0.6619797348976135, + "learning_rate": 4.975478268523372e-06, + "loss": 0.7464, + "step": 1062 + }, + { + "epoch": 0.29445983379501384, + "grad_norm": 0.6233973503112793, + "learning_rate": 4.97542734501881e-06, + "loss": 0.7053, + "step": 1063 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 0.6285495162010193, + "learning_rate": 4.975376368954605e-06, + "loss": 0.7718, + "step": 1064 + }, + { + "epoch": 0.2950138504155125, + "grad_norm": 0.6749695539474487, + "learning_rate": 4.9753253403318375e-06, + "loss": 0.7252, + "step": 1065 + }, + { + "epoch": 0.2952908587257618, + "grad_norm": 0.6395245790481567, + "learning_rate": 4.975274259151592e-06, + "loss": 0.6891, + "step": 1066 + }, + { + "epoch": 0.29556786703601107, + "grad_norm": 0.6453225612640381, + "learning_rate": 4.975223125414952e-06, + "loss": 0.7353, + "step": 1067 + }, + { + "epoch": 0.29584487534626036, + "grad_norm": 0.6453205943107605, + "learning_rate": 4.9751719391230055e-06, + "loss": 0.7296, + "step": 1068 + }, + { + "epoch": 0.2961218836565097, + "grad_norm": 0.6330751776695251, + "learning_rate": 4.975120700276837e-06, + "loss": 0.7129, + "step": 1069 + }, + { + "epoch": 0.296398891966759, + "grad_norm": 0.6840337514877319, + "learning_rate": 4.975069408877535e-06, + "loss": 0.7683, + "step": 1070 + }, + { + "epoch": 0.2966759002770083, + "grad_norm": 0.6366419196128845, + "learning_rate": 4.975018064926189e-06, + "loss": 0.682, + "step": 1071 + }, + { + "epoch": 0.2969529085872576, + "grad_norm": 0.6742862462997437, + "learning_rate": 4.974966668423889e-06, + "loss": 0.6991, + "step": 1072 + }, + { + "epoch": 0.29722991689750694, + "grad_norm": 0.6317266225814819, + "learning_rate": 4.974915219371726e-06, + "loss": 0.7467, + "step": 1073 + }, + { + "epoch": 0.29750692520775623, + "grad_norm": 0.6813490390777588, + "learning_rate": 4.974863717770793e-06, + "loss": 0.7729, + "step": 1074 + }, + { + "epoch": 0.29778393351800553, + "grad_norm": 0.6082154512405396, + "learning_rate": 4.974812163622183e-06, + "loss": 0.7586, + "step": 1075 + }, + { + "epoch": 0.2980609418282548, + "grad_norm": 0.6370641589164734, + "learning_rate": 4.974760556926991e-06, + "loss": 0.7545, + "step": 1076 + }, + { + "epoch": 0.29833795013850417, + "grad_norm": 0.6440302729606628, + "learning_rate": 4.974708897686312e-06, + "loss": 0.7222, + "step": 1077 + }, + { + "epoch": 0.29861495844875346, + "grad_norm": 0.6516856551170349, + "learning_rate": 4.974657185901245e-06, + "loss": 0.694, + "step": 1078 + }, + { + "epoch": 0.29889196675900276, + "grad_norm": 0.6376709938049316, + "learning_rate": 4.974605421572885e-06, + "loss": 0.6705, + "step": 1079 + }, + { + "epoch": 0.29916897506925205, + "grad_norm": 0.637450098991394, + "learning_rate": 4.974553604702332e-06, + "loss": 0.7093, + "step": 1080 + }, + { + "epoch": 0.2994459833795014, + "grad_norm": 0.6694522500038147, + "learning_rate": 4.974501735290688e-06, + "loss": 0.7674, + "step": 1081 + }, + { + "epoch": 0.2997229916897507, + "grad_norm": 0.6781630516052246, + "learning_rate": 4.9744498133390515e-06, + "loss": 0.7496, + "step": 1082 + }, + { + "epoch": 0.3, + "grad_norm": 0.647311270236969, + "learning_rate": 4.974397838848527e-06, + "loss": 0.7152, + "step": 1083 + }, + { + "epoch": 0.3002770083102493, + "grad_norm": 0.6689607501029968, + "learning_rate": 4.974345811820218e-06, + "loss": 0.6752, + "step": 1084 + }, + { + "epoch": 0.30055401662049863, + "grad_norm": 0.697877049446106, + "learning_rate": 4.9742937322552285e-06, + "loss": 0.7165, + "step": 1085 + }, + { + "epoch": 0.3008310249307479, + "grad_norm": 0.6767156720161438, + "learning_rate": 4.974241600154664e-06, + "loss": 0.7574, + "step": 1086 + }, + { + "epoch": 0.3011080332409972, + "grad_norm": 0.6254011988639832, + "learning_rate": 4.974189415519632e-06, + "loss": 0.6666, + "step": 1087 + }, + { + "epoch": 0.3013850415512465, + "grad_norm": 0.6448764204978943, + "learning_rate": 4.9741371783512395e-06, + "loss": 0.6697, + "step": 1088 + }, + { + "epoch": 0.30166204986149586, + "grad_norm": 0.6668729186058044, + "learning_rate": 4.974084888650597e-06, + "loss": 0.7035, + "step": 1089 + }, + { + "epoch": 0.30193905817174516, + "grad_norm": 0.6547879576683044, + "learning_rate": 4.974032546418816e-06, + "loss": 0.7492, + "step": 1090 + }, + { + "epoch": 0.30221606648199445, + "grad_norm": 0.646012544631958, + "learning_rate": 4.9739801516570035e-06, + "loss": 0.6893, + "step": 1091 + }, + { + "epoch": 0.30249307479224374, + "grad_norm": 0.6936621069908142, + "learning_rate": 4.973927704366275e-06, + "loss": 0.783, + "step": 1092 + }, + { + "epoch": 0.3027700831024931, + "grad_norm": 0.5984358787536621, + "learning_rate": 4.973875204547744e-06, + "loss": 0.6532, + "step": 1093 + }, + { + "epoch": 0.3030470914127424, + "grad_norm": 0.6124839782714844, + "learning_rate": 4.973822652202526e-06, + "loss": 0.6487, + "step": 1094 + }, + { + "epoch": 0.3033240997229917, + "grad_norm": 0.6612206101417542, + "learning_rate": 4.973770047331734e-06, + "loss": 0.7466, + "step": 1095 + }, + { + "epoch": 0.303601108033241, + "grad_norm": 0.6440140008926392, + "learning_rate": 4.973717389936487e-06, + "loss": 0.6941, + "step": 1096 + }, + { + "epoch": 0.3038781163434903, + "grad_norm": 0.6640320420265198, + "learning_rate": 4.973664680017903e-06, + "loss": 0.7211, + "step": 1097 + }, + { + "epoch": 0.3041551246537396, + "grad_norm": 0.6480273604393005, + "learning_rate": 4.9736119175771005e-06, + "loss": 0.7297, + "step": 1098 + }, + { + "epoch": 0.3044321329639889, + "grad_norm": 0.6550040245056152, + "learning_rate": 4.973559102615199e-06, + "loss": 0.6963, + "step": 1099 + }, + { + "epoch": 0.3047091412742382, + "grad_norm": 0.6494340896606445, + "learning_rate": 4.973506235133323e-06, + "loss": 0.7035, + "step": 1100 + }, + { + "epoch": 0.30498614958448755, + "grad_norm": 0.6211956143379211, + "learning_rate": 4.973453315132592e-06, + "loss": 0.6444, + "step": 1101 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 0.6703330874443054, + "learning_rate": 4.9734003426141305e-06, + "loss": 0.6802, + "step": 1102 + }, + { + "epoch": 0.30554016620498614, + "grad_norm": 0.6802040934562683, + "learning_rate": 4.973347317579064e-06, + "loss": 0.6729, + "step": 1103 + }, + { + "epoch": 0.30581717451523543, + "grad_norm": 0.6808921098709106, + "learning_rate": 4.973294240028517e-06, + "loss": 0.6976, + "step": 1104 + }, + { + "epoch": 0.3060941828254848, + "grad_norm": 0.6503647565841675, + "learning_rate": 4.973241109963618e-06, + "loss": 0.7282, + "step": 1105 + }, + { + "epoch": 0.3063711911357341, + "grad_norm": 0.6694557070732117, + "learning_rate": 4.973187927385493e-06, + "loss": 0.716, + "step": 1106 + }, + { + "epoch": 0.30664819944598337, + "grad_norm": 0.6967760920524597, + "learning_rate": 4.973134692295273e-06, + "loss": 0.7956, + "step": 1107 + }, + { + "epoch": 0.30692520775623267, + "grad_norm": 0.6802443861961365, + "learning_rate": 4.973081404694088e-06, + "loss": 0.7406, + "step": 1108 + }, + { + "epoch": 0.307202216066482, + "grad_norm": 0.6713878512382507, + "learning_rate": 4.973028064583069e-06, + "loss": 0.7273, + "step": 1109 + }, + { + "epoch": 0.3074792243767313, + "grad_norm": 0.6317265629768372, + "learning_rate": 4.972974671963349e-06, + "loss": 0.7583, + "step": 1110 + }, + { + "epoch": 0.3077562326869806, + "grad_norm": 0.6492122411727905, + "learning_rate": 4.972921226836061e-06, + "loss": 0.7243, + "step": 1111 + }, + { + "epoch": 0.3080332409972299, + "grad_norm": 0.630206286907196, + "learning_rate": 4.9728677292023405e-06, + "loss": 0.6688, + "step": 1112 + }, + { + "epoch": 0.30831024930747924, + "grad_norm": 0.69725102186203, + "learning_rate": 4.972814179063323e-06, + "loss": 0.7342, + "step": 1113 + }, + { + "epoch": 0.30858725761772854, + "grad_norm": 0.68944251537323, + "learning_rate": 4.9727605764201455e-06, + "loss": 0.7534, + "step": 1114 + }, + { + "epoch": 0.30886426592797783, + "grad_norm": 0.6732880473136902, + "learning_rate": 4.972706921273947e-06, + "loss": 0.7125, + "step": 1115 + }, + { + "epoch": 0.3091412742382271, + "grad_norm": 0.6394022703170776, + "learning_rate": 4.972653213625865e-06, + "loss": 0.7135, + "step": 1116 + }, + { + "epoch": 0.3094182825484765, + "grad_norm": 0.6659450531005859, + "learning_rate": 4.972599453477041e-06, + "loss": 0.7337, + "step": 1117 + }, + { + "epoch": 0.30969529085872577, + "grad_norm": 0.6496633887290955, + "learning_rate": 4.972545640828617e-06, + "loss": 0.7601, + "step": 1118 + }, + { + "epoch": 0.30997229916897506, + "grad_norm": 0.6592468619346619, + "learning_rate": 4.972491775681735e-06, + "loss": 0.7553, + "step": 1119 + }, + { + "epoch": 0.31024930747922436, + "grad_norm": 0.6824886202812195, + "learning_rate": 4.972437858037538e-06, + "loss": 0.7442, + "step": 1120 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 0.7013871073722839, + "learning_rate": 4.972383887897171e-06, + "loss": 0.7405, + "step": 1121 + }, + { + "epoch": 0.310803324099723, + "grad_norm": 0.7087879180908203, + "learning_rate": 4.97232986526178e-06, + "loss": 0.7416, + "step": 1122 + }, + { + "epoch": 0.3110803324099723, + "grad_norm": 0.6554746031761169, + "learning_rate": 4.972275790132513e-06, + "loss": 0.7197, + "step": 1123 + }, + { + "epoch": 0.3113573407202216, + "grad_norm": 0.6627053022384644, + "learning_rate": 4.972221662510517e-06, + "loss": 0.687, + "step": 1124 + }, + { + "epoch": 0.31163434903047094, + "grad_norm": 0.6275670528411865, + "learning_rate": 4.9721674823969425e-06, + "loss": 0.7252, + "step": 1125 + }, + { + "epoch": 0.31191135734072023, + "grad_norm": 0.6724969744682312, + "learning_rate": 4.972113249792938e-06, + "loss": 0.6879, + "step": 1126 + }, + { + "epoch": 0.3121883656509695, + "grad_norm": 0.6502825021743774, + "learning_rate": 4.972058964699658e-06, + "loss": 0.7256, + "step": 1127 + }, + { + "epoch": 0.3124653739612188, + "grad_norm": 0.6835956573486328, + "learning_rate": 4.972004627118251e-06, + "loss": 0.6942, + "step": 1128 + }, + { + "epoch": 0.31274238227146817, + "grad_norm": 0.6625363230705261, + "learning_rate": 4.971950237049874e-06, + "loss": 0.7493, + "step": 1129 + }, + { + "epoch": 0.31301939058171746, + "grad_norm": 0.6615988612174988, + "learning_rate": 4.971895794495681e-06, + "loss": 0.6836, + "step": 1130 + }, + { + "epoch": 0.31329639889196675, + "grad_norm": 0.6387788653373718, + "learning_rate": 4.971841299456827e-06, + "loss": 0.7134, + "step": 1131 + }, + { + "epoch": 0.31357340720221605, + "grad_norm": 0.628496527671814, + "learning_rate": 4.971786751934471e-06, + "loss": 0.7054, + "step": 1132 + }, + { + "epoch": 0.3138504155124654, + "grad_norm": 0.6888827681541443, + "learning_rate": 4.971732151929769e-06, + "loss": 0.7168, + "step": 1133 + }, + { + "epoch": 0.3141274238227147, + "grad_norm": 0.67254638671875, + "learning_rate": 4.971677499443882e-06, + "loss": 0.7034, + "step": 1134 + }, + { + "epoch": 0.314404432132964, + "grad_norm": 0.6414274573326111, + "learning_rate": 4.971622794477969e-06, + "loss": 0.6905, + "step": 1135 + }, + { + "epoch": 0.3146814404432133, + "grad_norm": 0.6497774720191956, + "learning_rate": 4.971568037033193e-06, + "loss": 0.6351, + "step": 1136 + }, + { + "epoch": 0.3149584487534626, + "grad_norm": 0.6576498746871948, + "learning_rate": 4.971513227110714e-06, + "loss": 0.72, + "step": 1137 + }, + { + "epoch": 0.3152354570637119, + "grad_norm": 0.6286632418632507, + "learning_rate": 4.9714583647117e-06, + "loss": 0.6696, + "step": 1138 + }, + { + "epoch": 0.3155124653739612, + "grad_norm": 0.6436337828636169, + "learning_rate": 4.971403449837313e-06, + "loss": 0.6442, + "step": 1139 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.650290310382843, + "learning_rate": 4.971348482488719e-06, + "loss": 0.7054, + "step": 1140 + }, + { + "epoch": 0.31606648199445986, + "grad_norm": 0.6691767573356628, + "learning_rate": 4.971293462667085e-06, + "loss": 0.7066, + "step": 1141 + }, + { + "epoch": 0.31634349030470915, + "grad_norm": 0.6176061034202576, + "learning_rate": 4.971238390373581e-06, + "loss": 0.6673, + "step": 1142 + }, + { + "epoch": 0.31662049861495845, + "grad_norm": 0.6956735253334045, + "learning_rate": 4.971183265609374e-06, + "loss": 0.7598, + "step": 1143 + }, + { + "epoch": 0.31689750692520774, + "grad_norm": 0.6023033261299133, + "learning_rate": 4.971128088375636e-06, + "loss": 0.6681, + "step": 1144 + }, + { + "epoch": 0.3171745152354571, + "grad_norm": 0.6559221744537354, + "learning_rate": 4.971072858673539e-06, + "loss": 0.6867, + "step": 1145 + }, + { + "epoch": 0.3174515235457064, + "grad_norm": 0.6444343328475952, + "learning_rate": 4.971017576504254e-06, + "loss": 0.7175, + "step": 1146 + }, + { + "epoch": 0.3177285318559557, + "grad_norm": 0.7063049077987671, + "learning_rate": 4.970962241868955e-06, + "loss": 0.6948, + "step": 1147 + }, + { + "epoch": 0.31800554016620497, + "grad_norm": 0.6646923422813416, + "learning_rate": 4.9709068547688186e-06, + "loss": 0.7128, + "step": 1148 + }, + { + "epoch": 0.3182825484764543, + "grad_norm": 0.6788060665130615, + "learning_rate": 4.970851415205018e-06, + "loss": 0.7153, + "step": 1149 + }, + { + "epoch": 0.3185595567867036, + "grad_norm": 0.6538786292076111, + "learning_rate": 4.970795923178734e-06, + "loss": 0.7012, + "step": 1150 + }, + { + "epoch": 0.3188365650969529, + "grad_norm": 0.6571323871612549, + "learning_rate": 4.970740378691141e-06, + "loss": 0.7051, + "step": 1151 + }, + { + "epoch": 0.3191135734072022, + "grad_norm": 0.623737096786499, + "learning_rate": 4.970684781743422e-06, + "loss": 0.6843, + "step": 1152 + }, + { + "epoch": 0.31939058171745155, + "grad_norm": 0.6650036573410034, + "learning_rate": 4.970629132336755e-06, + "loss": 0.7052, + "step": 1153 + }, + { + "epoch": 0.31966759002770084, + "grad_norm": 0.6788172721862793, + "learning_rate": 4.9705734304723215e-06, + "loss": 0.679, + "step": 1154 + }, + { + "epoch": 0.31994459833795014, + "grad_norm": 0.7293535470962524, + "learning_rate": 4.970517676151305e-06, + "loss": 0.7561, + "step": 1155 + }, + { + "epoch": 0.32022160664819943, + "grad_norm": 0.7336902022361755, + "learning_rate": 4.97046186937489e-06, + "loss": 0.7271, + "step": 1156 + }, + { + "epoch": 0.3204986149584488, + "grad_norm": 0.676217257976532, + "learning_rate": 4.970406010144259e-06, + "loss": 0.7176, + "step": 1157 + }, + { + "epoch": 0.3207756232686981, + "grad_norm": 0.6698077321052551, + "learning_rate": 4.970350098460602e-06, + "loss": 0.7252, + "step": 1158 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 0.6944154500961304, + "learning_rate": 4.970294134325102e-06, + "loss": 0.7368, + "step": 1159 + }, + { + "epoch": 0.32132963988919666, + "grad_norm": 0.6670475006103516, + "learning_rate": 4.97023811773895e-06, + "loss": 0.7018, + "step": 1160 + }, + { + "epoch": 0.321606648199446, + "grad_norm": 0.6607210040092468, + "learning_rate": 4.970182048703335e-06, + "loss": 0.7113, + "step": 1161 + }, + { + "epoch": 0.3218836565096953, + "grad_norm": 0.7043123245239258, + "learning_rate": 4.970125927219446e-06, + "loss": 0.6951, + "step": 1162 + }, + { + "epoch": 0.3221606648199446, + "grad_norm": 0.6905112266540527, + "learning_rate": 4.970069753288476e-06, + "loss": 0.7125, + "step": 1163 + }, + { + "epoch": 0.3224376731301939, + "grad_norm": 0.6552289724349976, + "learning_rate": 4.970013526911617e-06, + "loss": 0.7344, + "step": 1164 + }, + { + "epoch": 0.32271468144044324, + "grad_norm": 0.6786649227142334, + "learning_rate": 4.969957248090062e-06, + "loss": 0.7108, + "step": 1165 + }, + { + "epoch": 0.32299168975069253, + "grad_norm": 0.6703157424926758, + "learning_rate": 4.969900916825009e-06, + "loss": 0.7072, + "step": 1166 + }, + { + "epoch": 0.32326869806094183, + "grad_norm": 0.6406513452529907, + "learning_rate": 4.969844533117651e-06, + "loss": 0.6988, + "step": 1167 + }, + { + "epoch": 0.3235457063711911, + "grad_norm": 0.6920865178108215, + "learning_rate": 4.969788096969186e-06, + "loss": 0.7192, + "step": 1168 + }, + { + "epoch": 0.3238227146814404, + "grad_norm": 0.7104857563972473, + "learning_rate": 4.969731608380814e-06, + "loss": 0.7087, + "step": 1169 + }, + { + "epoch": 0.32409972299168976, + "grad_norm": 0.6691415905952454, + "learning_rate": 4.9696750673537315e-06, + "loss": 0.6976, + "step": 1170 + }, + { + "epoch": 0.32437673130193906, + "grad_norm": 0.7121404409408569, + "learning_rate": 4.969618473889141e-06, + "loss": 0.7591, + "step": 1171 + }, + { + "epoch": 0.32465373961218835, + "grad_norm": 0.655210554599762, + "learning_rate": 4.969561827988244e-06, + "loss": 0.7052, + "step": 1172 + }, + { + "epoch": 0.32493074792243765, + "grad_norm": 0.6501975059509277, + "learning_rate": 4.969505129652242e-06, + "loss": 0.735, + "step": 1173 + }, + { + "epoch": 0.325207756232687, + "grad_norm": 0.639132559299469, + "learning_rate": 4.96944837888234e-06, + "loss": 0.7081, + "step": 1174 + }, + { + "epoch": 0.3254847645429363, + "grad_norm": 0.6479787826538086, + "learning_rate": 4.969391575679743e-06, + "loss": 0.7136, + "step": 1175 + }, + { + "epoch": 0.3257617728531856, + "grad_norm": 0.6485690474510193, + "learning_rate": 4.969334720045656e-06, + "loss": 0.6554, + "step": 1176 + }, + { + "epoch": 0.3260387811634349, + "grad_norm": 0.6593156456947327, + "learning_rate": 4.969277811981287e-06, + "loss": 0.6959, + "step": 1177 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 0.6410382986068726, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.6804, + "step": 1178 + }, + { + "epoch": 0.3265927977839335, + "grad_norm": 0.6194038391113281, + "learning_rate": 4.969163838566537e-06, + "loss": 0.6745, + "step": 1179 + }, + { + "epoch": 0.3268698060941828, + "grad_norm": 0.6443800926208496, + "learning_rate": 4.969106773218577e-06, + "loss": 0.7266, + "step": 1180 + }, + { + "epoch": 0.3271468144044321, + "grad_norm": 0.6541767120361328, + "learning_rate": 4.969049655445174e-06, + "loss": 0.7135, + "step": 1181 + }, + { + "epoch": 0.32742382271468146, + "grad_norm": 0.6695902347564697, + "learning_rate": 4.968992485247541e-06, + "loss": 0.7551, + "step": 1182 + }, + { + "epoch": 0.32770083102493075, + "grad_norm": 0.6857314109802246, + "learning_rate": 4.9689352626268935e-06, + "loss": 0.7272, + "step": 1183 + }, + { + "epoch": 0.32797783933518004, + "grad_norm": 0.6714820861816406, + "learning_rate": 4.968877987584445e-06, + "loss": 0.7372, + "step": 1184 + }, + { + "epoch": 0.32825484764542934, + "grad_norm": 0.7039293646812439, + "learning_rate": 4.968820660121412e-06, + "loss": 0.7096, + "step": 1185 + }, + { + "epoch": 0.3285318559556787, + "grad_norm": 0.658528745174408, + "learning_rate": 4.968763280239012e-06, + "loss": 0.7096, + "step": 1186 + }, + { + "epoch": 0.328808864265928, + "grad_norm": 0.6193217635154724, + "learning_rate": 4.968705847938462e-06, + "loss": 0.6464, + "step": 1187 + }, + { + "epoch": 0.3290858725761773, + "grad_norm": 0.6533862352371216, + "learning_rate": 4.968648363220984e-06, + "loss": 0.7257, + "step": 1188 + }, + { + "epoch": 0.32936288088642657, + "grad_norm": 0.6659977436065674, + "learning_rate": 4.9685908260877955e-06, + "loss": 0.7223, + "step": 1189 + }, + { + "epoch": 0.3296398891966759, + "grad_norm": 0.6677512526512146, + "learning_rate": 4.9685332365401205e-06, + "loss": 0.7226, + "step": 1190 + }, + { + "epoch": 0.3299168975069252, + "grad_norm": 0.6727151274681091, + "learning_rate": 4.968475594579181e-06, + "loss": 0.6983, + "step": 1191 + }, + { + "epoch": 0.3301939058171745, + "grad_norm": 0.6492422223091125, + "learning_rate": 4.9684179002062e-06, + "loss": 0.6865, + "step": 1192 + }, + { + "epoch": 0.3304709141274238, + "grad_norm": 0.6984854936599731, + "learning_rate": 4.968360153422404e-06, + "loss": 0.7665, + "step": 1193 + }, + { + "epoch": 0.33074792243767315, + "grad_norm": 0.6713538765907288, + "learning_rate": 4.9683023542290184e-06, + "loss": 0.7214, + "step": 1194 + }, + { + "epoch": 0.33102493074792244, + "grad_norm": 0.685975968837738, + "learning_rate": 4.96824450262727e-06, + "loss": 0.7515, + "step": 1195 + }, + { + "epoch": 0.33130193905817173, + "grad_norm": 0.6598302125930786, + "learning_rate": 4.968186598618388e-06, + "loss": 0.6666, + "step": 1196 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 0.6653509736061096, + "learning_rate": 4.968128642203602e-06, + "loss": 0.7141, + "step": 1197 + }, + { + "epoch": 0.3318559556786704, + "grad_norm": 0.649482011795044, + "learning_rate": 4.968070633384141e-06, + "loss": 0.6893, + "step": 1198 + }, + { + "epoch": 0.33213296398891967, + "grad_norm": 0.6657583117485046, + "learning_rate": 4.968012572161239e-06, + "loss": 0.7252, + "step": 1199 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 0.659673273563385, + "learning_rate": 4.967954458536126e-06, + "loss": 0.7276, + "step": 1200 + }, + { + "epoch": 0.33268698060941826, + "grad_norm": 0.646888792514801, + "learning_rate": 4.9678962925100375e-06, + "loss": 0.6894, + "step": 1201 + }, + { + "epoch": 0.3329639889196676, + "grad_norm": 0.6243681311607361, + "learning_rate": 4.967838074084209e-06, + "loss": 0.6723, + "step": 1202 + }, + { + "epoch": 0.3332409972299169, + "grad_norm": 0.6913838386535645, + "learning_rate": 4.9677798032598754e-06, + "loss": 0.7173, + "step": 1203 + }, + { + "epoch": 0.3335180055401662, + "grad_norm": 0.7205289602279663, + "learning_rate": 4.967721480038276e-06, + "loss": 0.7796, + "step": 1204 + }, + { + "epoch": 0.3337950138504155, + "grad_norm": 0.6805700063705444, + "learning_rate": 4.967663104420647e-06, + "loss": 0.724, + "step": 1205 + }, + { + "epoch": 0.33407202216066484, + "grad_norm": 0.6902198791503906, + "learning_rate": 4.967604676408228e-06, + "loss": 0.7242, + "step": 1206 + }, + { + "epoch": 0.33434903047091413, + "grad_norm": 0.7033138275146484, + "learning_rate": 4.96754619600226e-06, + "loss": 0.7286, + "step": 1207 + }, + { + "epoch": 0.3346260387811634, + "grad_norm": 0.6692147254943848, + "learning_rate": 4.967487663203985e-06, + "loss": 0.7379, + "step": 1208 + }, + { + "epoch": 0.3349030470914127, + "grad_norm": 0.6471096277236938, + "learning_rate": 4.967429078014646e-06, + "loss": 0.6782, + "step": 1209 + }, + { + "epoch": 0.33518005540166207, + "grad_norm": 0.6385185718536377, + "learning_rate": 4.967370440435486e-06, + "loss": 0.7313, + "step": 1210 + }, + { + "epoch": 0.33545706371191136, + "grad_norm": 0.6591436266899109, + "learning_rate": 4.967311750467751e-06, + "loss": 0.709, + "step": 1211 + }, + { + "epoch": 0.33573407202216066, + "grad_norm": 0.6667162179946899, + "learning_rate": 4.967253008112688e-06, + "loss": 0.7272, + "step": 1212 + }, + { + "epoch": 0.33601108033240995, + "grad_norm": 0.690365195274353, + "learning_rate": 4.967194213371541e-06, + "loss": 0.7758, + "step": 1213 + }, + { + "epoch": 0.3362880886426593, + "grad_norm": 0.658421516418457, + "learning_rate": 4.967135366245562e-06, + "loss": 0.6383, + "step": 1214 + }, + { + "epoch": 0.3365650969529086, + "grad_norm": 0.6660083532333374, + "learning_rate": 4.967076466735997e-06, + "loss": 0.6689, + "step": 1215 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 0.631989598274231, + "learning_rate": 4.9670175148440995e-06, + "loss": 0.6726, + "step": 1216 + }, + { + "epoch": 0.3371191135734072, + "grad_norm": 0.6711675524711609, + "learning_rate": 4.966958510571119e-06, + "loss": 0.6839, + "step": 1217 + }, + { + "epoch": 0.33739612188365653, + "grad_norm": 0.7011405229568481, + "learning_rate": 4.966899453918311e-06, + "loss": 0.7689, + "step": 1218 + }, + { + "epoch": 0.3376731301939058, + "grad_norm": 0.6415798664093018, + "learning_rate": 4.966840344886927e-06, + "loss": 0.6256, + "step": 1219 + }, + { + "epoch": 0.3379501385041551, + "grad_norm": 0.672176718711853, + "learning_rate": 4.966781183478223e-06, + "loss": 0.7184, + "step": 1220 + }, + { + "epoch": 0.3382271468144044, + "grad_norm": 0.6688186526298523, + "learning_rate": 4.966721969693455e-06, + "loss": 0.7292, + "step": 1221 + }, + { + "epoch": 0.33850415512465376, + "grad_norm": 0.6519632935523987, + "learning_rate": 4.96666270353388e-06, + "loss": 0.6372, + "step": 1222 + }, + { + "epoch": 0.33878116343490305, + "grad_norm": 0.6690004467964172, + "learning_rate": 4.966603385000756e-06, + "loss": 0.6819, + "step": 1223 + }, + { + "epoch": 0.33905817174515235, + "grad_norm": 0.674385130405426, + "learning_rate": 4.966544014095345e-06, + "loss": 0.6891, + "step": 1224 + }, + { + "epoch": 0.33933518005540164, + "grad_norm": 0.6642457246780396, + "learning_rate": 4.966484590818904e-06, + "loss": 0.6747, + "step": 1225 + }, + { + "epoch": 0.339612188365651, + "grad_norm": 0.6616514325141907, + "learning_rate": 4.9664251151726975e-06, + "loss": 0.7419, + "step": 1226 + }, + { + "epoch": 0.3398891966759003, + "grad_norm": 0.6353458762168884, + "learning_rate": 4.966365587157986e-06, + "loss": 0.6848, + "step": 1227 + }, + { + "epoch": 0.3401662049861496, + "grad_norm": 0.6132497191429138, + "learning_rate": 4.966306006776036e-06, + "loss": 0.6811, + "step": 1228 + }, + { + "epoch": 0.34044321329639887, + "grad_norm": 0.6806102395057678, + "learning_rate": 4.96624637402811e-06, + "loss": 0.6822, + "step": 1229 + }, + { + "epoch": 0.3407202216066482, + "grad_norm": 0.6873536109924316, + "learning_rate": 4.9661866889154766e-06, + "loss": 0.7172, + "step": 1230 + }, + { + "epoch": 0.3409972299168975, + "grad_norm": 0.6916210055351257, + "learning_rate": 4.966126951439401e-06, + "loss": 0.7106, + "step": 1231 + }, + { + "epoch": 0.3412742382271468, + "grad_norm": 0.6565237641334534, + "learning_rate": 4.966067161601152e-06, + "loss": 0.7058, + "step": 1232 + }, + { + "epoch": 0.3415512465373961, + "grad_norm": 0.6750245690345764, + "learning_rate": 4.9660073194020005e-06, + "loss": 0.7502, + "step": 1233 + }, + { + "epoch": 0.34182825484764545, + "grad_norm": 0.6854074001312256, + "learning_rate": 4.965947424843216e-06, + "loss": 0.7278, + "step": 1234 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 0.6503132581710815, + "learning_rate": 4.96588747792607e-06, + "loss": 0.7383, + "step": 1235 + }, + { + "epoch": 0.34238227146814404, + "grad_norm": 0.6417048573493958, + "learning_rate": 4.965827478651836e-06, + "loss": 0.6405, + "step": 1236 + }, + { + "epoch": 0.34265927977839333, + "grad_norm": 0.6622879505157471, + "learning_rate": 4.965767427021787e-06, + "loss": 0.6652, + "step": 1237 + }, + { + "epoch": 0.3429362880886427, + "grad_norm": 0.7433401346206665, + "learning_rate": 4.965707323037199e-06, + "loss": 0.7381, + "step": 1238 + }, + { + "epoch": 0.343213296398892, + "grad_norm": 0.7252357602119446, + "learning_rate": 4.965647166699348e-06, + "loss": 0.7311, + "step": 1239 + }, + { + "epoch": 0.34349030470914127, + "grad_norm": 0.7050260901451111, + "learning_rate": 4.965586958009511e-06, + "loss": 0.7243, + "step": 1240 + }, + { + "epoch": 0.34376731301939056, + "grad_norm": 0.6850735545158386, + "learning_rate": 4.965526696968966e-06, + "loss": 0.6879, + "step": 1241 + }, + { + "epoch": 0.3440443213296399, + "grad_norm": 0.673936128616333, + "learning_rate": 4.965466383578994e-06, + "loss": 0.718, + "step": 1242 + }, + { + "epoch": 0.3443213296398892, + "grad_norm": 0.6669742465019226, + "learning_rate": 4.965406017840873e-06, + "loss": 0.6727, + "step": 1243 + }, + { + "epoch": 0.3445983379501385, + "grad_norm": 0.6629636287689209, + "learning_rate": 4.965345599755888e-06, + "loss": 0.7094, + "step": 1244 + }, + { + "epoch": 0.3448753462603878, + "grad_norm": 0.647504448890686, + "learning_rate": 4.965285129325319e-06, + "loss": 0.6986, + "step": 1245 + }, + { + "epoch": 0.34515235457063714, + "grad_norm": 0.6685324311256409, + "learning_rate": 4.965224606550452e-06, + "loss": 0.6984, + "step": 1246 + }, + { + "epoch": 0.34542936288088644, + "grad_norm": 0.6271531581878662, + "learning_rate": 4.96516403143257e-06, + "loss": 0.6799, + "step": 1247 + }, + { + "epoch": 0.34570637119113573, + "grad_norm": 0.6847185492515564, + "learning_rate": 4.96510340397296e-06, + "loss": 0.7843, + "step": 1248 + }, + { + "epoch": 0.345983379501385, + "grad_norm": 0.6534786224365234, + "learning_rate": 4.96504272417291e-06, + "loss": 0.6948, + "step": 1249 + }, + { + "epoch": 0.3462603878116344, + "grad_norm": 0.6473841667175293, + "learning_rate": 4.964981992033709e-06, + "loss": 0.6932, + "step": 1250 + }, + { + "epoch": 0.34653739612188367, + "grad_norm": 0.657150149345398, + "learning_rate": 4.964921207556645e-06, + "loss": 0.7882, + "step": 1251 + }, + { + "epoch": 0.34681440443213296, + "grad_norm": 0.6625207662582397, + "learning_rate": 4.9648603707430085e-06, + "loss": 0.742, + "step": 1252 + }, + { + "epoch": 0.34709141274238225, + "grad_norm": 0.6801496148109436, + "learning_rate": 4.9647994815940916e-06, + "loss": 0.6557, + "step": 1253 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 0.6741940379142761, + "learning_rate": 4.964738540111187e-06, + "loss": 0.6849, + "step": 1254 + }, + { + "epoch": 0.3476454293628809, + "grad_norm": 0.6735914349555969, + "learning_rate": 4.96467754629559e-06, + "loss": 0.6854, + "step": 1255 + }, + { + "epoch": 0.3479224376731302, + "grad_norm": 0.6577902436256409, + "learning_rate": 4.964616500148594e-06, + "loss": 0.7293, + "step": 1256 + }, + { + "epoch": 0.3481994459833795, + "grad_norm": 0.6584300398826599, + "learning_rate": 4.964555401671496e-06, + "loss": 0.6902, + "step": 1257 + }, + { + "epoch": 0.34847645429362883, + "grad_norm": 0.613704264163971, + "learning_rate": 4.964494250865593e-06, + "loss": 0.6745, + "step": 1258 + }, + { + "epoch": 0.34875346260387813, + "grad_norm": 0.6942754983901978, + "learning_rate": 4.964433047732183e-06, + "loss": 0.7453, + "step": 1259 + }, + { + "epoch": 0.3490304709141274, + "grad_norm": 0.6607272624969482, + "learning_rate": 4.964371792272566e-06, + "loss": 0.7024, + "step": 1260 + }, + { + "epoch": 0.3493074792243767, + "grad_norm": 0.6645615100860596, + "learning_rate": 4.964310484488044e-06, + "loss": 0.7461, + "step": 1261 + }, + { + "epoch": 0.34958448753462606, + "grad_norm": 0.6562692523002625, + "learning_rate": 4.964249124379915e-06, + "loss": 0.6633, + "step": 1262 + }, + { + "epoch": 0.34986149584487536, + "grad_norm": 0.6694803833961487, + "learning_rate": 4.964187711949486e-06, + "loss": 0.6676, + "step": 1263 + }, + { + "epoch": 0.35013850415512465, + "grad_norm": 0.6911068558692932, + "learning_rate": 4.964126247198058e-06, + "loss": 0.7291, + "step": 1264 + }, + { + "epoch": 0.35041551246537395, + "grad_norm": 0.6374126076698303, + "learning_rate": 4.964064730126937e-06, + "loss": 0.766, + "step": 1265 + }, + { + "epoch": 0.3506925207756233, + "grad_norm": 0.6650824546813965, + "learning_rate": 4.964003160737429e-06, + "loss": 0.7294, + "step": 1266 + }, + { + "epoch": 0.3509695290858726, + "grad_norm": 0.6754046678543091, + "learning_rate": 4.963941539030842e-06, + "loss": 0.6993, + "step": 1267 + }, + { + "epoch": 0.3512465373961219, + "grad_norm": 0.6406125426292419, + "learning_rate": 4.963879865008484e-06, + "loss": 0.6803, + "step": 1268 + }, + { + "epoch": 0.3515235457063712, + "grad_norm": 0.6519691944122314, + "learning_rate": 4.963818138671664e-06, + "loss": 0.7288, + "step": 1269 + }, + { + "epoch": 0.3518005540166205, + "grad_norm": 0.6560389399528503, + "learning_rate": 4.963756360021694e-06, + "loss": 0.7381, + "step": 1270 + }, + { + "epoch": 0.3520775623268698, + "grad_norm": 0.6318975687026978, + "learning_rate": 4.963694529059884e-06, + "loss": 0.7164, + "step": 1271 + }, + { + "epoch": 0.3523545706371191, + "grad_norm": 0.6151561737060547, + "learning_rate": 4.9636326457875475e-06, + "loss": 0.6685, + "step": 1272 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 0.6672805547714233, + "learning_rate": 4.9635707102059985e-06, + "loss": 0.7358, + "step": 1273 + }, + { + "epoch": 0.35290858725761776, + "grad_norm": 0.6632043123245239, + "learning_rate": 4.963508722316552e-06, + "loss": 0.6698, + "step": 1274 + }, + { + "epoch": 0.35318559556786705, + "grad_norm": 0.6473809480667114, + "learning_rate": 4.963446682120525e-06, + "loss": 0.6826, + "step": 1275 + }, + { + "epoch": 0.35346260387811634, + "grad_norm": 0.6828525066375732, + "learning_rate": 4.963384589619233e-06, + "loss": 0.7255, + "step": 1276 + }, + { + "epoch": 0.35373961218836564, + "grad_norm": 0.6744553446769714, + "learning_rate": 4.963322444813995e-06, + "loss": 0.6756, + "step": 1277 + }, + { + "epoch": 0.35401662049861493, + "grad_norm": 0.6551529765129089, + "learning_rate": 4.963260247706132e-06, + "loss": 0.6939, + "step": 1278 + }, + { + "epoch": 0.3542936288088643, + "grad_norm": 0.6692739129066467, + "learning_rate": 4.963197998296964e-06, + "loss": 0.7031, + "step": 1279 + }, + { + "epoch": 0.3545706371191136, + "grad_norm": 0.6385998129844666, + "learning_rate": 4.96313569658781e-06, + "loss": 0.7017, + "step": 1280 + }, + { + "epoch": 0.35484764542936287, + "grad_norm": 0.6609788537025452, + "learning_rate": 4.9630733425799965e-06, + "loss": 0.6635, + "step": 1281 + }, + { + "epoch": 0.35512465373961216, + "grad_norm": 0.6580247282981873, + "learning_rate": 4.963010936274846e-06, + "loss": 0.722, + "step": 1282 + }, + { + "epoch": 0.3554016620498615, + "grad_norm": 0.6596221923828125, + "learning_rate": 4.9629484776736825e-06, + "loss": 0.6768, + "step": 1283 + }, + { + "epoch": 0.3556786703601108, + "grad_norm": 0.6421349048614502, + "learning_rate": 4.962885966777834e-06, + "loss": 0.6835, + "step": 1284 + }, + { + "epoch": 0.3559556786703601, + "grad_norm": 0.6218045949935913, + "learning_rate": 4.9628234035886265e-06, + "loss": 0.6314, + "step": 1285 + }, + { + "epoch": 0.3562326869806094, + "grad_norm": 0.6451228260993958, + "learning_rate": 4.962760788107389e-06, + "loss": 0.6934, + "step": 1286 + }, + { + "epoch": 0.35650969529085874, + "grad_norm": 0.6965139508247375, + "learning_rate": 4.96269812033545e-06, + "loss": 0.7118, + "step": 1287 + }, + { + "epoch": 0.35678670360110804, + "grad_norm": 0.6180694699287415, + "learning_rate": 4.9626354002741424e-06, + "loss": 0.6706, + "step": 1288 + }, + { + "epoch": 0.35706371191135733, + "grad_norm": 0.6663467884063721, + "learning_rate": 4.962572627924795e-06, + "loss": 0.6871, + "step": 1289 + }, + { + "epoch": 0.3573407202216066, + "grad_norm": 0.672592043876648, + "learning_rate": 4.962509803288743e-06, + "loss": 0.7238, + "step": 1290 + }, + { + "epoch": 0.35761772853185597, + "grad_norm": 0.6334097385406494, + "learning_rate": 4.962446926367319e-06, + "loss": 0.7161, + "step": 1291 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 0.6492845416069031, + "learning_rate": 4.962383997161858e-06, + "loss": 0.6651, + "step": 1292 + }, + { + "epoch": 0.35817174515235456, + "grad_norm": 0.6691606640815735, + "learning_rate": 4.9623210156736966e-06, + "loss": 0.6966, + "step": 1293 + }, + { + "epoch": 0.35844875346260385, + "grad_norm": 0.6494303941726685, + "learning_rate": 4.962257981904172e-06, + "loss": 0.6527, + "step": 1294 + }, + { + "epoch": 0.3587257617728532, + "grad_norm": 0.6685274839401245, + "learning_rate": 4.962194895854623e-06, + "loss": 0.7082, + "step": 1295 + }, + { + "epoch": 0.3590027700831025, + "grad_norm": 0.6498396992683411, + "learning_rate": 4.962131757526387e-06, + "loss": 0.6691, + "step": 1296 + }, + { + "epoch": 0.3592797783933518, + "grad_norm": 0.6530342698097229, + "learning_rate": 4.962068566920808e-06, + "loss": 0.7173, + "step": 1297 + }, + { + "epoch": 0.3595567867036011, + "grad_norm": 0.6080851554870605, + "learning_rate": 4.962005324039226e-06, + "loss": 0.6196, + "step": 1298 + }, + { + "epoch": 0.35983379501385043, + "grad_norm": 0.6345664858818054, + "learning_rate": 4.961942028882981e-06, + "loss": 0.6687, + "step": 1299 + }, + { + "epoch": 0.3601108033240997, + "grad_norm": 0.6660663485527039, + "learning_rate": 4.961878681453423e-06, + "loss": 0.6624, + "step": 1300 + }, + { + "epoch": 0.360387811634349, + "grad_norm": 0.6534367799758911, + "learning_rate": 4.961815281751892e-06, + "loss": 0.7117, + "step": 1301 + }, + { + "epoch": 0.3606648199445983, + "grad_norm": 0.6991186738014221, + "learning_rate": 4.9617518297797355e-06, + "loss": 0.7329, + "step": 1302 + }, + { + "epoch": 0.36094182825484766, + "grad_norm": 0.6646499037742615, + "learning_rate": 4.9616883255383015e-06, + "loss": 0.6842, + "step": 1303 + }, + { + "epoch": 0.36121883656509696, + "grad_norm": 0.6496413350105286, + "learning_rate": 4.961624769028938e-06, + "loss": 0.7275, + "step": 1304 + }, + { + "epoch": 0.36149584487534625, + "grad_norm": 0.6743121147155762, + "learning_rate": 4.961561160252994e-06, + "loss": 0.7148, + "step": 1305 + }, + { + "epoch": 0.36177285318559554, + "grad_norm": 0.6612362265586853, + "learning_rate": 4.96149749921182e-06, + "loss": 0.6799, + "step": 1306 + }, + { + "epoch": 0.3620498614958449, + "grad_norm": 0.6754231452941895, + "learning_rate": 4.961433785906769e-06, + "loss": 0.7226, + "step": 1307 + }, + { + "epoch": 0.3623268698060942, + "grad_norm": 0.6662765145301819, + "learning_rate": 4.961370020339193e-06, + "loss": 0.7089, + "step": 1308 + }, + { + "epoch": 0.3626038781163435, + "grad_norm": 0.6733217239379883, + "learning_rate": 4.9613062025104454e-06, + "loss": 0.6817, + "step": 1309 + }, + { + "epoch": 0.3628808864265928, + "grad_norm": 0.671064555644989, + "learning_rate": 4.9612423324218816e-06, + "loss": 0.6983, + "step": 1310 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 0.6356818079948425, + "learning_rate": 4.961178410074858e-06, + "loss": 0.6906, + "step": 1311 + }, + { + "epoch": 0.3634349030470914, + "grad_norm": 0.6797766089439392, + "learning_rate": 4.961114435470731e-06, + "loss": 0.7292, + "step": 1312 + }, + { + "epoch": 0.3637119113573407, + "grad_norm": 0.6782345771789551, + "learning_rate": 4.96105040861086e-06, + "loss": 0.7163, + "step": 1313 + }, + { + "epoch": 0.36398891966759, + "grad_norm": 0.6355205774307251, + "learning_rate": 4.960986329496605e-06, + "loss": 0.6805, + "step": 1314 + }, + { + "epoch": 0.36426592797783935, + "grad_norm": 0.6918122172355652, + "learning_rate": 4.960922198129323e-06, + "loss": 0.6954, + "step": 1315 + }, + { + "epoch": 0.36454293628808865, + "grad_norm": 0.6936689615249634, + "learning_rate": 4.96085801451038e-06, + "loss": 0.675, + "step": 1316 + }, + { + "epoch": 0.36481994459833794, + "grad_norm": 0.6387231349945068, + "learning_rate": 4.960793778641137e-06, + "loss": 0.6831, + "step": 1317 + }, + { + "epoch": 0.36509695290858724, + "grad_norm": 0.6836456060409546, + "learning_rate": 4.960729490522957e-06, + "loss": 0.7293, + "step": 1318 + }, + { + "epoch": 0.3653739612188366, + "grad_norm": 0.660629153251648, + "learning_rate": 4.9606651501572066e-06, + "loss": 0.6764, + "step": 1319 + }, + { + "epoch": 0.3656509695290859, + "grad_norm": 0.6675608158111572, + "learning_rate": 4.960600757545252e-06, + "loss": 0.6952, + "step": 1320 + }, + { + "epoch": 0.3659279778393352, + "grad_norm": 0.6543275117874146, + "learning_rate": 4.960536312688458e-06, + "loss": 0.7, + "step": 1321 + }, + { + "epoch": 0.36620498614958447, + "grad_norm": 0.6812704801559448, + "learning_rate": 4.960471815588196e-06, + "loss": 0.6978, + "step": 1322 + }, + { + "epoch": 0.3664819944598338, + "grad_norm": 0.6616919040679932, + "learning_rate": 4.9604072662458325e-06, + "loss": 0.7082, + "step": 1323 + }, + { + "epoch": 0.3667590027700831, + "grad_norm": 0.6452640295028687, + "learning_rate": 4.960342664662741e-06, + "loss": 0.7089, + "step": 1324 + }, + { + "epoch": 0.3670360110803324, + "grad_norm": 0.6687643527984619, + "learning_rate": 4.96027801084029e-06, + "loss": 0.6747, + "step": 1325 + }, + { + "epoch": 0.3673130193905817, + "grad_norm": 0.6758833527565002, + "learning_rate": 4.960213304779856e-06, + "loss": 0.7496, + "step": 1326 + }, + { + "epoch": 0.36759002770083105, + "grad_norm": 0.6907699108123779, + "learning_rate": 4.9601485464828095e-06, + "loss": 0.7585, + "step": 1327 + }, + { + "epoch": 0.36786703601108034, + "grad_norm": 0.6245575547218323, + "learning_rate": 4.960083735950527e-06, + "loss": 0.6814, + "step": 1328 + }, + { + "epoch": 0.36814404432132963, + "grad_norm": 0.6503439545631409, + "learning_rate": 4.960018873184385e-06, + "loss": 0.7128, + "step": 1329 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.6851927638053894, + "learning_rate": 4.959953958185759e-06, + "loss": 0.6877, + "step": 1330 + }, + { + "epoch": 0.3686980609418283, + "grad_norm": 0.653220534324646, + "learning_rate": 4.95988899095603e-06, + "loss": 0.6847, + "step": 1331 + }, + { + "epoch": 0.36897506925207757, + "grad_norm": 0.700905978679657, + "learning_rate": 4.959823971496575e-06, + "loss": 0.6939, + "step": 1332 + }, + { + "epoch": 0.36925207756232686, + "grad_norm": 0.6579366326332092, + "learning_rate": 4.959758899808775e-06, + "loss": 0.7145, + "step": 1333 + }, + { + "epoch": 0.36952908587257616, + "grad_norm": 0.6845544576644897, + "learning_rate": 4.959693775894012e-06, + "loss": 0.7305, + "step": 1334 + }, + { + "epoch": 0.3698060941828255, + "grad_norm": 0.6898097991943359, + "learning_rate": 4.95962859975367e-06, + "loss": 0.7307, + "step": 1335 + }, + { + "epoch": 0.3700831024930748, + "grad_norm": 0.6651111245155334, + "learning_rate": 4.95956337138913e-06, + "loss": 0.7074, + "step": 1336 + }, + { + "epoch": 0.3703601108033241, + "grad_norm": 0.7062095999717712, + "learning_rate": 4.9594980908017795e-06, + "loss": 0.7392, + "step": 1337 + }, + { + "epoch": 0.3706371191135734, + "grad_norm": 0.6984573006629944, + "learning_rate": 4.9594327579930034e-06, + "loss": 0.7396, + "step": 1338 + }, + { + "epoch": 0.37091412742382274, + "grad_norm": 0.7115591764450073, + "learning_rate": 4.95936737296419e-06, + "loss": 0.7084, + "step": 1339 + }, + { + "epoch": 0.37119113573407203, + "grad_norm": 0.6430774331092834, + "learning_rate": 4.959301935716725e-06, + "loss": 0.6557, + "step": 1340 + }, + { + "epoch": 0.3714681440443213, + "grad_norm": 0.6756094694137573, + "learning_rate": 4.959236446252001e-06, + "loss": 0.6998, + "step": 1341 + }, + { + "epoch": 0.3717451523545706, + "grad_norm": 0.6710341572761536, + "learning_rate": 4.959170904571406e-06, + "loss": 0.7517, + "step": 1342 + }, + { + "epoch": 0.37202216066481997, + "grad_norm": 0.7036310434341431, + "learning_rate": 4.9591053106763334e-06, + "loss": 0.7403, + "step": 1343 + }, + { + "epoch": 0.37229916897506926, + "grad_norm": 0.6434355974197388, + "learning_rate": 4.959039664568175e-06, + "loss": 0.6965, + "step": 1344 + }, + { + "epoch": 0.37257617728531855, + "grad_norm": 0.6460430026054382, + "learning_rate": 4.958973966248325e-06, + "loss": 0.6901, + "step": 1345 + }, + { + "epoch": 0.37285318559556785, + "grad_norm": 0.6600189208984375, + "learning_rate": 4.958908215718178e-06, + "loss": 0.7247, + "step": 1346 + }, + { + "epoch": 0.3731301939058172, + "grad_norm": 0.6722584962844849, + "learning_rate": 4.95884241297913e-06, + "loss": 0.6923, + "step": 1347 + }, + { + "epoch": 0.3734072022160665, + "grad_norm": 0.6506482362747192, + "learning_rate": 4.958776558032578e-06, + "loss": 0.7135, + "step": 1348 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 0.6147697567939758, + "learning_rate": 4.958710650879922e-06, + "loss": 0.6569, + "step": 1349 + }, + { + "epoch": 0.3739612188365651, + "grad_norm": 0.6361560821533203, + "learning_rate": 4.958644691522559e-06, + "loss": 0.7233, + "step": 1350 + }, + { + "epoch": 0.37423822714681443, + "grad_norm": 0.6788303852081299, + "learning_rate": 4.95857867996189e-06, + "loss": 0.7168, + "step": 1351 + }, + { + "epoch": 0.3745152354570637, + "grad_norm": 0.6241387724876404, + "learning_rate": 4.958512616199319e-06, + "loss": 0.712, + "step": 1352 + }, + { + "epoch": 0.374792243767313, + "grad_norm": 0.6549175977706909, + "learning_rate": 4.9584465002362455e-06, + "loss": 0.6646, + "step": 1353 + }, + { + "epoch": 0.3750692520775623, + "grad_norm": 0.6630549430847168, + "learning_rate": 4.958380332074074e-06, + "loss": 0.6819, + "step": 1354 + }, + { + "epoch": 0.37534626038781166, + "grad_norm": 0.6520205140113831, + "learning_rate": 4.958314111714211e-06, + "loss": 0.7394, + "step": 1355 + }, + { + "epoch": 0.37562326869806095, + "grad_norm": 0.6434879302978516, + "learning_rate": 4.958247839158061e-06, + "loss": 0.6922, + "step": 1356 + }, + { + "epoch": 0.37590027700831025, + "grad_norm": 0.7042064070701599, + "learning_rate": 4.958181514407033e-06, + "loss": 0.6784, + "step": 1357 + }, + { + "epoch": 0.37617728531855954, + "grad_norm": 0.6494789719581604, + "learning_rate": 4.958115137462533e-06, + "loss": 0.7081, + "step": 1358 + }, + { + "epoch": 0.3764542936288089, + "grad_norm": 0.6651448607444763, + "learning_rate": 4.958048708325971e-06, + "loss": 0.6476, + "step": 1359 + }, + { + "epoch": 0.3767313019390582, + "grad_norm": 0.6566942930221558, + "learning_rate": 4.957982226998758e-06, + "loss": 0.7013, + "step": 1360 + }, + { + "epoch": 0.3770083102493075, + "grad_norm": 0.6861854195594788, + "learning_rate": 4.957915693482305e-06, + "loss": 0.7128, + "step": 1361 + }, + { + "epoch": 0.37728531855955677, + "grad_norm": 0.6896612644195557, + "learning_rate": 4.957849107778026e-06, + "loss": 0.7239, + "step": 1362 + }, + { + "epoch": 0.3775623268698061, + "grad_norm": 0.6557917594909668, + "learning_rate": 4.957782469887332e-06, + "loss": 0.7025, + "step": 1363 + }, + { + "epoch": 0.3778393351800554, + "grad_norm": 0.6517524123191833, + "learning_rate": 4.957715779811642e-06, + "loss": 0.6948, + "step": 1364 + }, + { + "epoch": 0.3781163434903047, + "grad_norm": 0.7278082966804504, + "learning_rate": 4.957649037552368e-06, + "loss": 0.7248, + "step": 1365 + }, + { + "epoch": 0.378393351800554, + "grad_norm": 0.6831822395324707, + "learning_rate": 4.957582243110929e-06, + "loss": 0.7396, + "step": 1366 + }, + { + "epoch": 0.37867036011080335, + "grad_norm": 0.6876621842384338, + "learning_rate": 4.957515396488744e-06, + "loss": 0.7126, + "step": 1367 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 0.679372251033783, + "learning_rate": 4.957448497687229e-06, + "loss": 0.7149, + "step": 1368 + }, + { + "epoch": 0.37922437673130194, + "grad_norm": 0.662017285823822, + "learning_rate": 4.957381546707809e-06, + "loss": 0.7438, + "step": 1369 + }, + { + "epoch": 0.37950138504155123, + "grad_norm": 0.6179419159889221, + "learning_rate": 4.957314543551903e-06, + "loss": 0.6347, + "step": 1370 + }, + { + "epoch": 0.3797783933518006, + "grad_norm": 0.6868166923522949, + "learning_rate": 4.957247488220933e-06, + "loss": 0.696, + "step": 1371 + }, + { + "epoch": 0.3800554016620499, + "grad_norm": 0.7437651753425598, + "learning_rate": 4.957180380716324e-06, + "loss": 0.6938, + "step": 1372 + }, + { + "epoch": 0.38033240997229917, + "grad_norm": 0.6506156325340271, + "learning_rate": 4.957113221039501e-06, + "loss": 0.7085, + "step": 1373 + }, + { + "epoch": 0.38060941828254846, + "grad_norm": 0.6750197410583496, + "learning_rate": 4.957046009191889e-06, + "loss": 0.7394, + "step": 1374 + }, + { + "epoch": 0.3808864265927978, + "grad_norm": 0.6268515586853027, + "learning_rate": 4.956978745174916e-06, + "loss": 0.6687, + "step": 1375 + }, + { + "epoch": 0.3811634349030471, + "grad_norm": 0.6731964945793152, + "learning_rate": 4.95691142899001e-06, + "loss": 0.7027, + "step": 1376 + }, + { + "epoch": 0.3814404432132964, + "grad_norm": 0.6968632936477661, + "learning_rate": 4.9568440606386e-06, + "loss": 0.7307, + "step": 1377 + }, + { + "epoch": 0.3817174515235457, + "grad_norm": 0.6785265207290649, + "learning_rate": 4.956776640122116e-06, + "loss": 0.7197, + "step": 1378 + }, + { + "epoch": 0.38199445983379504, + "grad_norm": 0.6669479608535767, + "learning_rate": 4.956709167441991e-06, + "loss": 0.6942, + "step": 1379 + }, + { + "epoch": 0.38227146814404434, + "grad_norm": 0.7017685770988464, + "learning_rate": 4.9566416425996555e-06, + "loss": 0.7041, + "step": 1380 + }, + { + "epoch": 0.38254847645429363, + "grad_norm": 0.6782371401786804, + "learning_rate": 4.956574065596545e-06, + "loss": 0.7378, + "step": 1381 + }, + { + "epoch": 0.3828254847645429, + "grad_norm": 0.6499651074409485, + "learning_rate": 4.956506436434093e-06, + "loss": 0.6821, + "step": 1382 + }, + { + "epoch": 0.3831024930747922, + "grad_norm": 0.653150737285614, + "learning_rate": 4.956438755113737e-06, + "loss": 0.6934, + "step": 1383 + }, + { + "epoch": 0.38337950138504157, + "grad_norm": 0.6766655445098877, + "learning_rate": 4.956371021636912e-06, + "loss": 0.6898, + "step": 1384 + }, + { + "epoch": 0.38365650969529086, + "grad_norm": 0.6720975041389465, + "learning_rate": 4.956303236005059e-06, + "loss": 0.7228, + "step": 1385 + }, + { + "epoch": 0.38393351800554015, + "grad_norm": 0.6506999731063843, + "learning_rate": 4.956235398219614e-06, + "loss": 0.7127, + "step": 1386 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 0.6865505576133728, + "learning_rate": 4.956167508282021e-06, + "loss": 0.7291, + "step": 1387 + }, + { + "epoch": 0.3844875346260388, + "grad_norm": 0.6950021982192993, + "learning_rate": 4.9560995661937175e-06, + "loss": 0.6999, + "step": 1388 + }, + { + "epoch": 0.3847645429362881, + "grad_norm": 0.666877269744873, + "learning_rate": 4.956031571956148e-06, + "loss": 0.6909, + "step": 1389 + }, + { + "epoch": 0.3850415512465374, + "grad_norm": 0.6657650470733643, + "learning_rate": 4.955963525570757e-06, + "loss": 0.6762, + "step": 1390 + }, + { + "epoch": 0.3853185595567867, + "grad_norm": 0.6917940378189087, + "learning_rate": 4.955895427038988e-06, + "loss": 0.6762, + "step": 1391 + }, + { + "epoch": 0.385595567867036, + "grad_norm": 0.6938834190368652, + "learning_rate": 4.955827276362288e-06, + "loss": 0.734, + "step": 1392 + }, + { + "epoch": 0.3858725761772853, + "grad_norm": 0.7303954362869263, + "learning_rate": 4.955759073542104e-06, + "loss": 0.7666, + "step": 1393 + }, + { + "epoch": 0.3861495844875346, + "grad_norm": 0.6774318814277649, + "learning_rate": 4.9556908185798815e-06, + "loss": 0.7199, + "step": 1394 + }, + { + "epoch": 0.3864265927977839, + "grad_norm": 0.6571752429008484, + "learning_rate": 4.955622511477073e-06, + "loss": 0.7084, + "step": 1395 + }, + { + "epoch": 0.38670360110803326, + "grad_norm": 0.646608293056488, + "learning_rate": 4.955554152235128e-06, + "loss": 0.6636, + "step": 1396 + }, + { + "epoch": 0.38698060941828255, + "grad_norm": 0.6419099569320679, + "learning_rate": 4.955485740855497e-06, + "loss": 0.6862, + "step": 1397 + }, + { + "epoch": 0.38725761772853184, + "grad_norm": 0.6539189219474792, + "learning_rate": 4.955417277339633e-06, + "loss": 0.6339, + "step": 1398 + }, + { + "epoch": 0.38753462603878114, + "grad_norm": 0.6766827702522278, + "learning_rate": 4.95534876168899e-06, + "loss": 0.6939, + "step": 1399 + }, + { + "epoch": 0.3878116343490305, + "grad_norm": 0.6848649382591248, + "learning_rate": 4.955280193905023e-06, + "loss": 0.7284, + "step": 1400 + }, + { + "epoch": 0.3880886426592798, + "grad_norm": 0.6462283134460449, + "learning_rate": 4.955211573989186e-06, + "loss": 0.7049, + "step": 1401 + }, + { + "epoch": 0.3883656509695291, + "grad_norm": 0.6433562636375427, + "learning_rate": 4.955142901942939e-06, + "loss": 0.6847, + "step": 1402 + }, + { + "epoch": 0.38864265927977837, + "grad_norm": 0.7061001062393188, + "learning_rate": 4.955074177767737e-06, + "loss": 0.7501, + "step": 1403 + }, + { + "epoch": 0.3889196675900277, + "grad_norm": 0.6696621775627136, + "learning_rate": 4.955005401465042e-06, + "loss": 0.7088, + "step": 1404 + }, + { + "epoch": 0.389196675900277, + "grad_norm": 0.6648236513137817, + "learning_rate": 4.954936573036312e-06, + "loss": 0.7133, + "step": 1405 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 0.6691426038742065, + "learning_rate": 4.954867692483009e-06, + "loss": 0.7026, + "step": 1406 + }, + { + "epoch": 0.3897506925207756, + "grad_norm": 0.6885944604873657, + "learning_rate": 4.954798759806596e-06, + "loss": 0.6919, + "step": 1407 + }, + { + "epoch": 0.39002770083102495, + "grad_norm": 0.6416887640953064, + "learning_rate": 4.9547297750085375e-06, + "loss": 0.6601, + "step": 1408 + }, + { + "epoch": 0.39030470914127424, + "grad_norm": 0.6520400643348694, + "learning_rate": 4.954660738090297e-06, + "loss": 0.6856, + "step": 1409 + }, + { + "epoch": 0.39058171745152354, + "grad_norm": 0.6656164526939392, + "learning_rate": 4.95459164905334e-06, + "loss": 0.6803, + "step": 1410 + }, + { + "epoch": 0.39085872576177283, + "grad_norm": 0.7128254771232605, + "learning_rate": 4.954522507899133e-06, + "loss": 0.6548, + "step": 1411 + }, + { + "epoch": 0.3911357340720222, + "grad_norm": 0.6742157340049744, + "learning_rate": 4.954453314629146e-06, + "loss": 0.7053, + "step": 1412 + }, + { + "epoch": 0.3914127423822715, + "grad_norm": 0.6287198066711426, + "learning_rate": 4.954384069244848e-06, + "loss": 0.6742, + "step": 1413 + }, + { + "epoch": 0.39168975069252077, + "grad_norm": 0.6757739782333374, + "learning_rate": 4.954314771747707e-06, + "loss": 0.7236, + "step": 1414 + }, + { + "epoch": 0.39196675900277006, + "grad_norm": 0.68620365858078, + "learning_rate": 4.954245422139196e-06, + "loss": 0.6842, + "step": 1415 + }, + { + "epoch": 0.3922437673130194, + "grad_norm": 0.6858413815498352, + "learning_rate": 4.954176020420788e-06, + "loss": 0.7019, + "step": 1416 + }, + { + "epoch": 0.3925207756232687, + "grad_norm": 0.6674286723136902, + "learning_rate": 4.954106566593955e-06, + "loss": 0.7249, + "step": 1417 + }, + { + "epoch": 0.392797783933518, + "grad_norm": 0.6651579737663269, + "learning_rate": 4.954037060660173e-06, + "loss": 0.7113, + "step": 1418 + }, + { + "epoch": 0.3930747922437673, + "grad_norm": 0.6830371618270874, + "learning_rate": 4.953967502620916e-06, + "loss": 0.727, + "step": 1419 + }, + { + "epoch": 0.39335180055401664, + "grad_norm": 0.6857573986053467, + "learning_rate": 4.953897892477664e-06, + "loss": 0.7197, + "step": 1420 + }, + { + "epoch": 0.39362880886426593, + "grad_norm": 0.6651442646980286, + "learning_rate": 4.953828230231892e-06, + "loss": 0.6771, + "step": 1421 + }, + { + "epoch": 0.3939058171745152, + "grad_norm": 0.7004607319831848, + "learning_rate": 4.953758515885081e-06, + "loss": 0.7428, + "step": 1422 + }, + { + "epoch": 0.3941828254847645, + "grad_norm": 0.6818897724151611, + "learning_rate": 4.95368874943871e-06, + "loss": 0.7024, + "step": 1423 + }, + { + "epoch": 0.39445983379501387, + "grad_norm": 0.6543837189674377, + "learning_rate": 4.95361893089426e-06, + "loss": 0.7267, + "step": 1424 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.6330406665802002, + "learning_rate": 4.953549060253216e-06, + "loss": 0.7249, + "step": 1425 + }, + { + "epoch": 0.39501385041551246, + "grad_norm": 0.6749814748764038, + "learning_rate": 4.953479137517059e-06, + "loss": 0.7158, + "step": 1426 + }, + { + "epoch": 0.39529085872576175, + "grad_norm": 0.6552304029464722, + "learning_rate": 4.953409162687274e-06, + "loss": 0.7637, + "step": 1427 + }, + { + "epoch": 0.3955678670360111, + "grad_norm": 0.6880520582199097, + "learning_rate": 4.953339135765348e-06, + "loss": 0.7158, + "step": 1428 + }, + { + "epoch": 0.3958448753462604, + "grad_norm": 0.6481505036354065, + "learning_rate": 4.953269056752767e-06, + "loss": 0.7075, + "step": 1429 + }, + { + "epoch": 0.3961218836565097, + "grad_norm": 0.6551262736320496, + "learning_rate": 4.953198925651018e-06, + "loss": 0.6933, + "step": 1430 + }, + { + "epoch": 0.396398891966759, + "grad_norm": 0.6902202367782593, + "learning_rate": 4.953128742461592e-06, + "loss": 0.7142, + "step": 1431 + }, + { + "epoch": 0.39667590027700833, + "grad_norm": 0.6663317084312439, + "learning_rate": 4.9530585071859775e-06, + "loss": 0.7366, + "step": 1432 + }, + { + "epoch": 0.3969529085872576, + "grad_norm": 0.6709367036819458, + "learning_rate": 4.952988219825666e-06, + "loss": 0.7188, + "step": 1433 + }, + { + "epoch": 0.3972299168975069, + "grad_norm": 0.6727588176727295, + "learning_rate": 4.952917880382151e-06, + "loss": 0.7175, + "step": 1434 + }, + { + "epoch": 0.3975069252077562, + "grad_norm": 0.6796942949295044, + "learning_rate": 4.952847488856925e-06, + "loss": 0.6983, + "step": 1435 + }, + { + "epoch": 0.39778393351800556, + "grad_norm": 0.6838971376419067, + "learning_rate": 4.952777045251484e-06, + "loss": 0.65, + "step": 1436 + }, + { + "epoch": 0.39806094182825486, + "grad_norm": 0.6644639372825623, + "learning_rate": 4.952706549567322e-06, + "loss": 0.6994, + "step": 1437 + }, + { + "epoch": 0.39833795013850415, + "grad_norm": 0.6655013561248779, + "learning_rate": 4.952636001805936e-06, + "loss": 0.6803, + "step": 1438 + }, + { + "epoch": 0.39861495844875344, + "grad_norm": 0.6782181262969971, + "learning_rate": 4.952565401968824e-06, + "loss": 0.7076, + "step": 1439 + }, + { + "epoch": 0.3988919667590028, + "grad_norm": 0.8163707852363586, + "learning_rate": 4.952494750057487e-06, + "loss": 0.731, + "step": 1440 + }, + { + "epoch": 0.3991689750692521, + "grad_norm": 0.6641796231269836, + "learning_rate": 4.952424046073422e-06, + "loss": 0.7001, + "step": 1441 + }, + { + "epoch": 0.3994459833795014, + "grad_norm": 0.6930070519447327, + "learning_rate": 4.952353290018132e-06, + "loss": 0.6668, + "step": 1442 + }, + { + "epoch": 0.3997229916897507, + "grad_norm": 0.6547805070877075, + "learning_rate": 4.9522824818931195e-06, + "loss": 0.725, + "step": 1443 + }, + { + "epoch": 0.4, + "grad_norm": 0.6426491737365723, + "learning_rate": 4.9522116216998876e-06, + "loss": 0.6825, + "step": 1444 + }, + { + "epoch": 0.4002770083102493, + "grad_norm": 0.6577370762825012, + "learning_rate": 4.95214070943994e-06, + "loss": 0.6654, + "step": 1445 + }, + { + "epoch": 0.4005540166204986, + "grad_norm": 0.6261408925056458, + "learning_rate": 4.9520697451147825e-06, + "loss": 0.7009, + "step": 1446 + }, + { + "epoch": 0.4008310249307479, + "grad_norm": 0.6702444553375244, + "learning_rate": 4.951998728725924e-06, + "loss": 0.7356, + "step": 1447 + }, + { + "epoch": 0.40110803324099725, + "grad_norm": 0.6387395858764648, + "learning_rate": 4.95192766027487e-06, + "loss": 0.6711, + "step": 1448 + }, + { + "epoch": 0.40138504155124655, + "grad_norm": 0.685088038444519, + "learning_rate": 4.951856539763129e-06, + "loss": 0.7028, + "step": 1449 + }, + { + "epoch": 0.40166204986149584, + "grad_norm": 0.6737648248672485, + "learning_rate": 4.951785367192214e-06, + "loss": 0.7156, + "step": 1450 + }, + { + "epoch": 0.40193905817174513, + "grad_norm": 0.7130826115608215, + "learning_rate": 4.951714142563634e-06, + "loss": 0.7014, + "step": 1451 + }, + { + "epoch": 0.4022160664819945, + "grad_norm": 0.6648553013801575, + "learning_rate": 4.951642865878902e-06, + "loss": 0.7137, + "step": 1452 + }, + { + "epoch": 0.4024930747922438, + "grad_norm": 0.6709924340248108, + "learning_rate": 4.951571537139531e-06, + "loss": 0.7026, + "step": 1453 + }, + { + "epoch": 0.40277008310249307, + "grad_norm": 0.6769849061965942, + "learning_rate": 4.951500156347035e-06, + "loss": 0.7092, + "step": 1454 + }, + { + "epoch": 0.40304709141274236, + "grad_norm": 0.6583406329154968, + "learning_rate": 4.951428723502932e-06, + "loss": 0.7081, + "step": 1455 + }, + { + "epoch": 0.4033240997229917, + "grad_norm": 0.6608618497848511, + "learning_rate": 4.951357238608735e-06, + "loss": 0.6764, + "step": 1456 + }, + { + "epoch": 0.403601108033241, + "grad_norm": 0.6477077603340149, + "learning_rate": 4.9512857016659645e-06, + "loss": 0.7206, + "step": 1457 + }, + { + "epoch": 0.4038781163434903, + "grad_norm": 0.6960606575012207, + "learning_rate": 4.951214112676138e-06, + "loss": 0.6939, + "step": 1458 + }, + { + "epoch": 0.4041551246537396, + "grad_norm": 0.6560659408569336, + "learning_rate": 4.9511424716407765e-06, + "loss": 0.6741, + "step": 1459 + }, + { + "epoch": 0.40443213296398894, + "grad_norm": 0.7191847562789917, + "learning_rate": 4.951070778561401e-06, + "loss": 0.6589, + "step": 1460 + }, + { + "epoch": 0.40470914127423824, + "grad_norm": 0.6729449033737183, + "learning_rate": 4.950999033439534e-06, + "loss": 0.7091, + "step": 1461 + }, + { + "epoch": 0.40498614958448753, + "grad_norm": 0.6878858804702759, + "learning_rate": 4.950927236276698e-06, + "loss": 0.6943, + "step": 1462 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 0.6467249393463135, + "learning_rate": 4.950855387074417e-06, + "loss": 0.7039, + "step": 1463 + }, + { + "epoch": 0.4055401662049862, + "grad_norm": 0.6764515042304993, + "learning_rate": 4.950783485834218e-06, + "loss": 0.7004, + "step": 1464 + }, + { + "epoch": 0.40581717451523547, + "grad_norm": 0.6590722799301147, + "learning_rate": 4.950711532557627e-06, + "loss": 0.6954, + "step": 1465 + }, + { + "epoch": 0.40609418282548476, + "grad_norm": 0.6726288199424744, + "learning_rate": 4.950639527246172e-06, + "loss": 0.7311, + "step": 1466 + }, + { + "epoch": 0.40637119113573406, + "grad_norm": 0.6541720032691956, + "learning_rate": 4.950567469901382e-06, + "loss": 0.6552, + "step": 1467 + }, + { + "epoch": 0.4066481994459834, + "grad_norm": 0.6888921856880188, + "learning_rate": 4.950495360524786e-06, + "loss": 0.6341, + "step": 1468 + }, + { + "epoch": 0.4069252077562327, + "grad_norm": 0.7008625268936157, + "learning_rate": 4.950423199117915e-06, + "loss": 0.7526, + "step": 1469 + }, + { + "epoch": 0.407202216066482, + "grad_norm": 0.6698315143585205, + "learning_rate": 4.950350985682302e-06, + "loss": 0.6792, + "step": 1470 + }, + { + "epoch": 0.4074792243767313, + "grad_norm": 0.615836501121521, + "learning_rate": 4.95027872021948e-06, + "loss": 0.6562, + "step": 1471 + }, + { + "epoch": 0.40775623268698064, + "grad_norm": 0.6405780911445618, + "learning_rate": 4.950206402730984e-06, + "loss": 0.6895, + "step": 1472 + }, + { + "epoch": 0.40803324099722993, + "grad_norm": 0.6810288429260254, + "learning_rate": 4.950134033218349e-06, + "loss": 0.7323, + "step": 1473 + }, + { + "epoch": 0.4083102493074792, + "grad_norm": 0.6716300249099731, + "learning_rate": 4.950061611683111e-06, + "loss": 0.6745, + "step": 1474 + }, + { + "epoch": 0.4085872576177285, + "grad_norm": 0.6801728010177612, + "learning_rate": 4.949989138126809e-06, + "loss": 0.7043, + "step": 1475 + }, + { + "epoch": 0.40886426592797787, + "grad_norm": 0.7009851336479187, + "learning_rate": 4.949916612550979e-06, + "loss": 0.7346, + "step": 1476 + }, + { + "epoch": 0.40914127423822716, + "grad_norm": 0.7393293380737305, + "learning_rate": 4.949844034957165e-06, + "loss": 0.7276, + "step": 1477 + }, + { + "epoch": 0.40941828254847645, + "grad_norm": 0.6590290069580078, + "learning_rate": 4.949771405346905e-06, + "loss": 0.7004, + "step": 1478 + }, + { + "epoch": 0.40969529085872575, + "grad_norm": 0.6548259258270264, + "learning_rate": 4.949698723721742e-06, + "loss": 0.7383, + "step": 1479 + }, + { + "epoch": 0.4099722991689751, + "grad_norm": 0.7192888855934143, + "learning_rate": 4.94962599008322e-06, + "loss": 0.7471, + "step": 1480 + }, + { + "epoch": 0.4102493074792244, + "grad_norm": 0.6807044148445129, + "learning_rate": 4.949553204432882e-06, + "loss": 0.7549, + "step": 1481 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 0.6511745452880859, + "learning_rate": 4.949480366772274e-06, + "loss": 0.6995, + "step": 1482 + }, + { + "epoch": 0.410803324099723, + "grad_norm": 0.6546177268028259, + "learning_rate": 4.949407477102943e-06, + "loss": 0.6919, + "step": 1483 + }, + { + "epoch": 0.4110803324099723, + "grad_norm": 0.6770357489585876, + "learning_rate": 4.949334535426435e-06, + "loss": 0.7243, + "step": 1484 + }, + { + "epoch": 0.4113573407202216, + "grad_norm": 0.6877369284629822, + "learning_rate": 4.949261541744301e-06, + "loss": 0.6891, + "step": 1485 + }, + { + "epoch": 0.4116343490304709, + "grad_norm": 0.6853678822517395, + "learning_rate": 4.949188496058089e-06, + "loss": 0.7018, + "step": 1486 + }, + { + "epoch": 0.4119113573407202, + "grad_norm": 0.6893583536148071, + "learning_rate": 4.949115398369351e-06, + "loss": 0.6695, + "step": 1487 + }, + { + "epoch": 0.4121883656509695, + "grad_norm": 0.7509121894836426, + "learning_rate": 4.949042248679639e-06, + "loss": 0.7029, + "step": 1488 + }, + { + "epoch": 0.41246537396121885, + "grad_norm": 0.7181180715560913, + "learning_rate": 4.948969046990506e-06, + "loss": 0.6863, + "step": 1489 + }, + { + "epoch": 0.41274238227146814, + "grad_norm": 0.6763536930084229, + "learning_rate": 4.948895793303505e-06, + "loss": 0.7526, + "step": 1490 + }, + { + "epoch": 0.41301939058171744, + "grad_norm": 0.6821795701980591, + "learning_rate": 4.948822487620194e-06, + "loss": 0.6841, + "step": 1491 + }, + { + "epoch": 0.41329639889196673, + "grad_norm": 0.6655170321464539, + "learning_rate": 4.948749129942127e-06, + "loss": 0.6658, + "step": 1492 + }, + { + "epoch": 0.4135734072022161, + "grad_norm": 0.6946587562561035, + "learning_rate": 4.948675720270863e-06, + "loss": 0.7428, + "step": 1493 + }, + { + "epoch": 0.4138504155124654, + "grad_norm": 0.6431707143783569, + "learning_rate": 4.94860225860796e-06, + "loss": 0.6918, + "step": 1494 + }, + { + "epoch": 0.41412742382271467, + "grad_norm": 0.6807214021682739, + "learning_rate": 4.948528744954978e-06, + "loss": 0.6854, + "step": 1495 + }, + { + "epoch": 0.41440443213296396, + "grad_norm": 0.6434029936790466, + "learning_rate": 4.948455179313479e-06, + "loss": 0.7075, + "step": 1496 + }, + { + "epoch": 0.4146814404432133, + "grad_norm": 0.6675246953964233, + "learning_rate": 4.948381561685022e-06, + "loss": 0.6956, + "step": 1497 + }, + { + "epoch": 0.4149584487534626, + "grad_norm": 0.6771344542503357, + "learning_rate": 4.948307892071173e-06, + "loss": 0.702, + "step": 1498 + }, + { + "epoch": 0.4152354570637119, + "grad_norm": 0.7052770256996155, + "learning_rate": 4.948234170473495e-06, + "loss": 0.7251, + "step": 1499 + }, + { + "epoch": 0.4155124653739612, + "grad_norm": 0.6624787449836731, + "learning_rate": 4.948160396893553e-06, + "loss": 0.7037, + "step": 1500 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 0.6854806542396545, + "learning_rate": 4.948086571332914e-06, + "loss": 0.746, + "step": 1501 + }, + { + "epoch": 0.41606648199445984, + "grad_norm": 0.7364691495895386, + "learning_rate": 4.9480126937931465e-06, + "loss": 0.7121, + "step": 1502 + }, + { + "epoch": 0.41634349030470913, + "grad_norm": 0.6633167266845703, + "learning_rate": 4.947938764275817e-06, + "loss": 0.6913, + "step": 1503 + }, + { + "epoch": 0.4166204986149584, + "grad_norm": 0.686250627040863, + "learning_rate": 4.947864782782496e-06, + "loss": 0.7368, + "step": 1504 + }, + { + "epoch": 0.4168975069252078, + "grad_norm": 0.6367915272712708, + "learning_rate": 4.9477907493147546e-06, + "loss": 0.6641, + "step": 1505 + }, + { + "epoch": 0.41717451523545707, + "grad_norm": 0.6631414294242859, + "learning_rate": 4.947716663874164e-06, + "loss": 0.7006, + "step": 1506 + }, + { + "epoch": 0.41745152354570636, + "grad_norm": 0.6712923645973206, + "learning_rate": 4.9476425264623e-06, + "loss": 0.683, + "step": 1507 + }, + { + "epoch": 0.41772853185595565, + "grad_norm": 0.6794381141662598, + "learning_rate": 4.947568337080733e-06, + "loss": 0.6954, + "step": 1508 + }, + { + "epoch": 0.418005540166205, + "grad_norm": 0.6655251979827881, + "learning_rate": 4.947494095731039e-06, + "loss": 0.703, + "step": 1509 + }, + { + "epoch": 0.4182825484764543, + "grad_norm": 0.7107792496681213, + "learning_rate": 4.947419802414797e-06, + "loss": 0.7476, + "step": 1510 + }, + { + "epoch": 0.4185595567867036, + "grad_norm": 0.6949066519737244, + "learning_rate": 4.947345457133581e-06, + "loss": 0.7008, + "step": 1511 + }, + { + "epoch": 0.4188365650969529, + "grad_norm": 0.6868277192115784, + "learning_rate": 4.947271059888972e-06, + "loss": 0.7149, + "step": 1512 + }, + { + "epoch": 0.41911357340720223, + "grad_norm": 0.6967531442642212, + "learning_rate": 4.9471966106825485e-06, + "loss": 0.7146, + "step": 1513 + }, + { + "epoch": 0.4193905817174515, + "grad_norm": 0.6734687685966492, + "learning_rate": 4.9471221095158914e-06, + "loss": 0.6087, + "step": 1514 + }, + { + "epoch": 0.4196675900277008, + "grad_norm": 0.6532918214797974, + "learning_rate": 4.9470475563905825e-06, + "loss": 0.6585, + "step": 1515 + }, + { + "epoch": 0.4199445983379501, + "grad_norm": 0.6667916774749756, + "learning_rate": 4.946972951308206e-06, + "loss": 0.7538, + "step": 1516 + }, + { + "epoch": 0.42022160664819946, + "grad_norm": 0.6874780654907227, + "learning_rate": 4.946898294270343e-06, + "loss": 0.6466, + "step": 1517 + }, + { + "epoch": 0.42049861495844876, + "grad_norm": 0.6987433433532715, + "learning_rate": 4.9468235852785825e-06, + "loss": 0.7162, + "step": 1518 + }, + { + "epoch": 0.42077562326869805, + "grad_norm": 0.7233112454414368, + "learning_rate": 4.946748824334507e-06, + "loss": 0.7059, + "step": 1519 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.6833441257476807, + "learning_rate": 4.946674011439707e-06, + "loss": 0.6819, + "step": 1520 + }, + { + "epoch": 0.4213296398891967, + "grad_norm": 0.677476167678833, + "learning_rate": 4.946599146595769e-06, + "loss": 0.7347, + "step": 1521 + }, + { + "epoch": 0.421606648199446, + "grad_norm": 0.670714259147644, + "learning_rate": 4.946524229804284e-06, + "loss": 0.7048, + "step": 1522 + }, + { + "epoch": 0.4218836565096953, + "grad_norm": 0.676911473274231, + "learning_rate": 4.94644926106684e-06, + "loss": 0.7101, + "step": 1523 + }, + { + "epoch": 0.4221606648199446, + "grad_norm": 0.6721144914627075, + "learning_rate": 4.946374240385033e-06, + "loss": 0.6177, + "step": 1524 + }, + { + "epoch": 0.4224376731301939, + "grad_norm": 0.6739724278450012, + "learning_rate": 4.946299167760451e-06, + "loss": 0.6971, + "step": 1525 + }, + { + "epoch": 0.4227146814404432, + "grad_norm": 0.680992066860199, + "learning_rate": 4.9462240431946925e-06, + "loss": 0.7097, + "step": 1526 + }, + { + "epoch": 0.4229916897506925, + "grad_norm": 0.6601333618164062, + "learning_rate": 4.94614886668935e-06, + "loss": 0.6652, + "step": 1527 + }, + { + "epoch": 0.4232686980609418, + "grad_norm": 0.6684186458587646, + "learning_rate": 4.946073638246019e-06, + "loss": 0.7136, + "step": 1528 + }, + { + "epoch": 0.42354570637119116, + "grad_norm": 0.6920457482337952, + "learning_rate": 4.9459983578663e-06, + "loss": 0.7124, + "step": 1529 + }, + { + "epoch": 0.42382271468144045, + "grad_norm": 0.7031549215316772, + "learning_rate": 4.945923025551789e-06, + "loss": 0.7284, + "step": 1530 + }, + { + "epoch": 0.42409972299168974, + "grad_norm": 0.6548120975494385, + "learning_rate": 4.945847641304085e-06, + "loss": 0.7091, + "step": 1531 + }, + { + "epoch": 0.42437673130193904, + "grad_norm": 0.6406707763671875, + "learning_rate": 4.94577220512479e-06, + "loss": 0.6734, + "step": 1532 + }, + { + "epoch": 0.4246537396121884, + "grad_norm": 0.6942365169525146, + "learning_rate": 4.9456967170155056e-06, + "loss": 0.6622, + "step": 1533 + }, + { + "epoch": 0.4249307479224377, + "grad_norm": 0.643136739730835, + "learning_rate": 4.945621176977835e-06, + "loss": 0.6703, + "step": 1534 + }, + { + "epoch": 0.425207756232687, + "grad_norm": 0.677213728427887, + "learning_rate": 4.94554558501338e-06, + "loss": 0.6994, + "step": 1535 + }, + { + "epoch": 0.42548476454293627, + "grad_norm": 0.6650383472442627, + "learning_rate": 4.945469941123747e-06, + "loss": 0.692, + "step": 1536 + }, + { + "epoch": 0.4257617728531856, + "grad_norm": 0.6682860255241394, + "learning_rate": 4.945394245310543e-06, + "loss": 0.7249, + "step": 1537 + }, + { + "epoch": 0.4260387811634349, + "grad_norm": 0.7072668671607971, + "learning_rate": 4.945318497575375e-06, + "loss": 0.7446, + "step": 1538 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 0.654718816280365, + "learning_rate": 4.945242697919849e-06, + "loss": 0.7082, + "step": 1539 + }, + { + "epoch": 0.4265927977839335, + "grad_norm": 0.6829707622528076, + "learning_rate": 4.945166846345576e-06, + "loss": 0.6848, + "step": 1540 + }, + { + "epoch": 0.42686980609418285, + "grad_norm": 0.6674022674560547, + "learning_rate": 4.945090942854167e-06, + "loss": 0.6784, + "step": 1541 + }, + { + "epoch": 0.42714681440443214, + "grad_norm": 0.6758620142936707, + "learning_rate": 4.945014987447234e-06, + "loss": 0.7442, + "step": 1542 + }, + { + "epoch": 0.42742382271468143, + "grad_norm": 0.6620382070541382, + "learning_rate": 4.944938980126388e-06, + "loss": 0.6503, + "step": 1543 + }, + { + "epoch": 0.4277008310249307, + "grad_norm": 0.6612845659255981, + "learning_rate": 4.9448629208932446e-06, + "loss": 0.7045, + "step": 1544 + }, + { + "epoch": 0.4279778393351801, + "grad_norm": 0.7068538665771484, + "learning_rate": 4.944786809749417e-06, + "loss": 0.7549, + "step": 1545 + }, + { + "epoch": 0.42825484764542937, + "grad_norm": 0.6271569728851318, + "learning_rate": 4.944710646696522e-06, + "loss": 0.6106, + "step": 1546 + }, + { + "epoch": 0.42853185595567866, + "grad_norm": 0.6843795776367188, + "learning_rate": 4.944634431736177e-06, + "loss": 0.7204, + "step": 1547 + }, + { + "epoch": 0.42880886426592796, + "grad_norm": 0.6622493863105774, + "learning_rate": 4.9445581648700014e-06, + "loss": 0.6639, + "step": 1548 + }, + { + "epoch": 0.4290858725761773, + "grad_norm": 0.6531876921653748, + "learning_rate": 4.944481846099612e-06, + "loss": 0.6987, + "step": 1549 + }, + { + "epoch": 0.4293628808864266, + "grad_norm": 0.6773236989974976, + "learning_rate": 4.944405475426631e-06, + "loss": 0.6525, + "step": 1550 + }, + { + "epoch": 0.4296398891966759, + "grad_norm": 0.6729045510292053, + "learning_rate": 4.944329052852679e-06, + "loss": 0.6935, + "step": 1551 + }, + { + "epoch": 0.4299168975069252, + "grad_norm": 0.7236620783805847, + "learning_rate": 4.944252578379379e-06, + "loss": 0.7528, + "step": 1552 + }, + { + "epoch": 0.43019390581717454, + "grad_norm": 0.711037278175354, + "learning_rate": 4.944176052008355e-06, + "loss": 0.6764, + "step": 1553 + }, + { + "epoch": 0.43047091412742383, + "grad_norm": 0.6864672899246216, + "learning_rate": 4.944099473741232e-06, + "loss": 0.711, + "step": 1554 + }, + { + "epoch": 0.4307479224376731, + "grad_norm": 0.6570982933044434, + "learning_rate": 4.944022843579635e-06, + "loss": 0.6807, + "step": 1555 + }, + { + "epoch": 0.4310249307479224, + "grad_norm": 0.6892886757850647, + "learning_rate": 4.943946161525192e-06, + "loss": 0.6612, + "step": 1556 + }, + { + "epoch": 0.43130193905817177, + "grad_norm": 0.6675722002983093, + "learning_rate": 4.943869427579531e-06, + "loss": 0.7212, + "step": 1557 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 0.6782935261726379, + "learning_rate": 4.943792641744281e-06, + "loss": 0.6728, + "step": 1558 + }, + { + "epoch": 0.43185595567867036, + "grad_norm": 0.6436980366706848, + "learning_rate": 4.943715804021072e-06, + "loss": 0.6851, + "step": 1559 + }, + { + "epoch": 0.43213296398891965, + "grad_norm": 0.7117411494255066, + "learning_rate": 4.943638914411537e-06, + "loss": 0.7315, + "step": 1560 + }, + { + "epoch": 0.432409972299169, + "grad_norm": 0.6860139966011047, + "learning_rate": 4.943561972917307e-06, + "loss": 0.736, + "step": 1561 + }, + { + "epoch": 0.4326869806094183, + "grad_norm": 0.7034031748771667, + "learning_rate": 4.943484979540015e-06, + "loss": 0.7268, + "step": 1562 + }, + { + "epoch": 0.4329639889196676, + "grad_norm": 0.6490092873573303, + "learning_rate": 4.943407934281298e-06, + "loss": 0.6534, + "step": 1563 + }, + { + "epoch": 0.4332409972299169, + "grad_norm": 0.6792609691619873, + "learning_rate": 4.943330837142792e-06, + "loss": 0.7067, + "step": 1564 + }, + { + "epoch": 0.43351800554016623, + "grad_norm": 0.6546031832695007, + "learning_rate": 4.94325368812613e-06, + "loss": 0.6907, + "step": 1565 + }, + { + "epoch": 0.4337950138504155, + "grad_norm": 0.688127875328064, + "learning_rate": 4.943176487232955e-06, + "loss": 0.6616, + "step": 1566 + }, + { + "epoch": 0.4340720221606648, + "grad_norm": 0.6924400329589844, + "learning_rate": 4.943099234464902e-06, + "loss": 0.6945, + "step": 1567 + }, + { + "epoch": 0.4343490304709141, + "grad_norm": 0.7010718584060669, + "learning_rate": 4.943021929823615e-06, + "loss": 0.73, + "step": 1568 + }, + { + "epoch": 0.43462603878116346, + "grad_norm": 0.6934574246406555, + "learning_rate": 4.942944573310733e-06, + "loss": 0.7208, + "step": 1569 + }, + { + "epoch": 0.43490304709141275, + "grad_norm": 0.6866487860679626, + "learning_rate": 4.942867164927899e-06, + "loss": 0.6729, + "step": 1570 + }, + { + "epoch": 0.43518005540166205, + "grad_norm": 0.6813799142837524, + "learning_rate": 4.942789704676757e-06, + "loss": 0.6887, + "step": 1571 + }, + { + "epoch": 0.43545706371191134, + "grad_norm": 0.7103790640830994, + "learning_rate": 4.942712192558951e-06, + "loss": 0.684, + "step": 1572 + }, + { + "epoch": 0.4357340720221607, + "grad_norm": 0.7329322099685669, + "learning_rate": 4.942634628576127e-06, + "loss": 0.7417, + "step": 1573 + }, + { + "epoch": 0.43601108033241, + "grad_norm": 0.6444783806800842, + "learning_rate": 4.942557012729933e-06, + "loss": 0.7252, + "step": 1574 + }, + { + "epoch": 0.4362880886426593, + "grad_norm": 0.6684467196464539, + "learning_rate": 4.942479345022016e-06, + "loss": 0.6494, + "step": 1575 + }, + { + "epoch": 0.43656509695290857, + "grad_norm": 0.6751431822776794, + "learning_rate": 4.942401625454024e-06, + "loss": 0.6628, + "step": 1576 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 0.7155023217201233, + "learning_rate": 4.942323854027609e-06, + "loss": 0.7036, + "step": 1577 + }, + { + "epoch": 0.4371191135734072, + "grad_norm": 1.0883293151855469, + "learning_rate": 4.942246030744421e-06, + "loss": 0.6697, + "step": 1578 + }, + { + "epoch": 0.4373961218836565, + "grad_norm": 0.6593711972236633, + "learning_rate": 4.942168155606113e-06, + "loss": 0.6732, + "step": 1579 + }, + { + "epoch": 0.4376731301939058, + "grad_norm": 0.6591055989265442, + "learning_rate": 4.942090228614339e-06, + "loss": 0.7144, + "step": 1580 + }, + { + "epoch": 0.43795013850415515, + "grad_norm": 0.6900238990783691, + "learning_rate": 4.942012249770753e-06, + "loss": 0.7171, + "step": 1581 + }, + { + "epoch": 0.43822714681440444, + "grad_norm": 0.6809273958206177, + "learning_rate": 4.94193421907701e-06, + "loss": 0.7234, + "step": 1582 + }, + { + "epoch": 0.43850415512465374, + "grad_norm": 0.6896465420722961, + "learning_rate": 4.941856136534768e-06, + "loss": 0.7384, + "step": 1583 + }, + { + "epoch": 0.43878116343490303, + "grad_norm": 0.6935831904411316, + "learning_rate": 4.941778002145684e-06, + "loss": 0.6968, + "step": 1584 + }, + { + "epoch": 0.4390581717451524, + "grad_norm": 0.6583067178726196, + "learning_rate": 4.941699815911418e-06, + "loss": 0.6905, + "step": 1585 + }, + { + "epoch": 0.4393351800554017, + "grad_norm": 0.6565580368041992, + "learning_rate": 4.941621577833629e-06, + "loss": 0.6667, + "step": 1586 + }, + { + "epoch": 0.43961218836565097, + "grad_norm": 0.6886636018753052, + "learning_rate": 4.941543287913979e-06, + "loss": 0.7294, + "step": 1587 + }, + { + "epoch": 0.43988919667590026, + "grad_norm": 0.6666744351387024, + "learning_rate": 4.94146494615413e-06, + "loss": 0.665, + "step": 1588 + }, + { + "epoch": 0.4401662049861496, + "grad_norm": 0.6424551606178284, + "learning_rate": 4.941386552555745e-06, + "loss": 0.6942, + "step": 1589 + }, + { + "epoch": 0.4404432132963989, + "grad_norm": 0.6820762157440186, + "learning_rate": 4.941308107120489e-06, + "loss": 0.7457, + "step": 1590 + }, + { + "epoch": 0.4407202216066482, + "grad_norm": 0.6450785994529724, + "learning_rate": 4.941229609850028e-06, + "loss": 0.6497, + "step": 1591 + }, + { + "epoch": 0.4409972299168975, + "grad_norm": 0.6925249099731445, + "learning_rate": 4.941151060746028e-06, + "loss": 0.7306, + "step": 1592 + }, + { + "epoch": 0.4412742382271468, + "grad_norm": 0.6624888181686401, + "learning_rate": 4.941072459810157e-06, + "loss": 0.7172, + "step": 1593 + }, + { + "epoch": 0.44155124653739614, + "grad_norm": 0.699098527431488, + "learning_rate": 4.940993807044083e-06, + "loss": 0.6957, + "step": 1594 + }, + { + "epoch": 0.44182825484764543, + "grad_norm": 0.6670101881027222, + "learning_rate": 4.9409151024494775e-06, + "loss": 0.7194, + "step": 1595 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 0.6752918362617493, + "learning_rate": 4.940836346028011e-06, + "loss": 0.7369, + "step": 1596 + }, + { + "epoch": 0.442382271468144, + "grad_norm": 0.6847531199455261, + "learning_rate": 4.9407575377813555e-06, + "loss": 0.6431, + "step": 1597 + }, + { + "epoch": 0.44265927977839337, + "grad_norm": 0.6920828223228455, + "learning_rate": 4.9406786777111846e-06, + "loss": 0.703, + "step": 1598 + }, + { + "epoch": 0.44293628808864266, + "grad_norm": 0.6975665092468262, + "learning_rate": 4.940599765819172e-06, + "loss": 0.6772, + "step": 1599 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 0.7083530426025391, + "learning_rate": 4.940520802106994e-06, + "loss": 0.7113, + "step": 1600 + }, + { + "epoch": 0.44349030470914125, + "grad_norm": 0.6941363215446472, + "learning_rate": 4.940441786576328e-06, + "loss": 0.7125, + "step": 1601 + }, + { + "epoch": 0.4437673130193906, + "grad_norm": 0.6918179392814636, + "learning_rate": 4.94036271922885e-06, + "loss": 0.7073, + "step": 1602 + }, + { + "epoch": 0.4440443213296399, + "grad_norm": 0.7111680507659912, + "learning_rate": 4.940283600066239e-06, + "loss": 0.7133, + "step": 1603 + }, + { + "epoch": 0.4443213296398892, + "grad_norm": 0.6951150298118591, + "learning_rate": 4.9402044290901756e-06, + "loss": 0.733, + "step": 1604 + }, + { + "epoch": 0.4445983379501385, + "grad_norm": 0.7251552939414978, + "learning_rate": 4.9401252063023405e-06, + "loss": 0.6939, + "step": 1605 + }, + { + "epoch": 0.4448753462603878, + "grad_norm": 0.6868443489074707, + "learning_rate": 4.9400459317044155e-06, + "loss": 0.7334, + "step": 1606 + }, + { + "epoch": 0.4451523545706371, + "grad_norm": 0.6709305644035339, + "learning_rate": 4.939966605298085e-06, + "loss": 0.6766, + "step": 1607 + }, + { + "epoch": 0.4454293628808864, + "grad_norm": 0.7305266261100769, + "learning_rate": 4.939887227085032e-06, + "loss": 0.7549, + "step": 1608 + }, + { + "epoch": 0.4457063711911357, + "grad_norm": 0.6449017524719238, + "learning_rate": 4.939807797066943e-06, + "loss": 0.6886, + "step": 1609 + }, + { + "epoch": 0.44598337950138506, + "grad_norm": 0.6638166308403015, + "learning_rate": 4.939728315245503e-06, + "loss": 0.7256, + "step": 1610 + }, + { + "epoch": 0.44626038781163435, + "grad_norm": 0.6473711133003235, + "learning_rate": 4.9396487816224014e-06, + "loss": 0.684, + "step": 1611 + }, + { + "epoch": 0.44653739612188365, + "grad_norm": 0.6618096232414246, + "learning_rate": 4.939569196199325e-06, + "loss": 0.6679, + "step": 1612 + }, + { + "epoch": 0.44681440443213294, + "grad_norm": 0.6737478375434875, + "learning_rate": 4.939489558977965e-06, + "loss": 0.6957, + "step": 1613 + }, + { + "epoch": 0.4470914127423823, + "grad_norm": 0.7004351615905762, + "learning_rate": 4.9394098699600125e-06, + "loss": 0.7231, + "step": 1614 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 0.658447802066803, + "learning_rate": 4.939330129147158e-06, + "loss": 0.6537, + "step": 1615 + }, + { + "epoch": 0.4476454293628809, + "grad_norm": 0.6831145286560059, + "learning_rate": 4.939250336541096e-06, + "loss": 0.7153, + "step": 1616 + }, + { + "epoch": 0.44792243767313017, + "grad_norm": 0.6539595127105713, + "learning_rate": 4.939170492143521e-06, + "loss": 0.7445, + "step": 1617 + }, + { + "epoch": 0.4481994459833795, + "grad_norm": 0.682756781578064, + "learning_rate": 4.9390905959561254e-06, + "loss": 0.7261, + "step": 1618 + }, + { + "epoch": 0.4484764542936288, + "grad_norm": 0.6738179326057434, + "learning_rate": 4.939010647980609e-06, + "loss": 0.6729, + "step": 1619 + }, + { + "epoch": 0.4487534626038781, + "grad_norm": 0.6528425216674805, + "learning_rate": 4.938930648218667e-06, + "loss": 0.7388, + "step": 1620 + }, + { + "epoch": 0.4490304709141274, + "grad_norm": 0.7098863124847412, + "learning_rate": 4.9388505966720006e-06, + "loss": 0.6978, + "step": 1621 + }, + { + "epoch": 0.44930747922437675, + "grad_norm": 0.6853944659233093, + "learning_rate": 4.938770493342307e-06, + "loss": 0.7558, + "step": 1622 + }, + { + "epoch": 0.44958448753462604, + "grad_norm": 0.7014193534851074, + "learning_rate": 4.9386903382312876e-06, + "loss": 0.7404, + "step": 1623 + }, + { + "epoch": 0.44986149584487534, + "grad_norm": 0.6289151310920715, + "learning_rate": 4.9386101313406445e-06, + "loss": 0.6741, + "step": 1624 + }, + { + "epoch": 0.45013850415512463, + "grad_norm": 0.6713910698890686, + "learning_rate": 4.938529872672082e-06, + "loss": 0.6464, + "step": 1625 + }, + { + "epoch": 0.450415512465374, + "grad_norm": 0.6423241496086121, + "learning_rate": 4.938449562227302e-06, + "loss": 0.717, + "step": 1626 + }, + { + "epoch": 0.4506925207756233, + "grad_norm": 0.6668711304664612, + "learning_rate": 4.9383692000080105e-06, + "loss": 0.7101, + "step": 1627 + }, + { + "epoch": 0.45096952908587257, + "grad_norm": 0.6786472201347351, + "learning_rate": 4.9382887860159145e-06, + "loss": 0.7295, + "step": 1628 + }, + { + "epoch": 0.45124653739612186, + "grad_norm": 0.6926882863044739, + "learning_rate": 4.938208320252721e-06, + "loss": 0.7362, + "step": 1629 + }, + { + "epoch": 0.4515235457063712, + "grad_norm": 0.7050982713699341, + "learning_rate": 4.938127802720137e-06, + "loss": 0.6786, + "step": 1630 + }, + { + "epoch": 0.4518005540166205, + "grad_norm": 0.663986325263977, + "learning_rate": 4.938047233419875e-06, + "loss": 0.7133, + "step": 1631 + }, + { + "epoch": 0.4520775623268698, + "grad_norm": 0.6806838512420654, + "learning_rate": 4.937966612353644e-06, + "loss": 0.675, + "step": 1632 + }, + { + "epoch": 0.4523545706371191, + "grad_norm": 0.6535438895225525, + "learning_rate": 4.937885939523155e-06, + "loss": 0.6823, + "step": 1633 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 0.6611151695251465, + "learning_rate": 4.937805214930123e-06, + "loss": 0.6661, + "step": 1634 + }, + { + "epoch": 0.45290858725761773, + "grad_norm": 0.6430408358573914, + "learning_rate": 4.93772443857626e-06, + "loss": 0.6687, + "step": 1635 + }, + { + "epoch": 0.45318559556786703, + "grad_norm": 0.6679038405418396, + "learning_rate": 4.937643610463282e-06, + "loss": 0.7235, + "step": 1636 + }, + { + "epoch": 0.4534626038781163, + "grad_norm": 0.6662124395370483, + "learning_rate": 4.937562730592905e-06, + "loss": 0.6389, + "step": 1637 + }, + { + "epoch": 0.45373961218836567, + "grad_norm": 0.6467198729515076, + "learning_rate": 4.937481798966847e-06, + "loss": 0.6662, + "step": 1638 + }, + { + "epoch": 0.45401662049861496, + "grad_norm": 0.6903800964355469, + "learning_rate": 4.937400815586825e-06, + "loss": 0.7161, + "step": 1639 + }, + { + "epoch": 0.45429362880886426, + "grad_norm": 0.6393138766288757, + "learning_rate": 4.937319780454559e-06, + "loss": 0.6788, + "step": 1640 + }, + { + "epoch": 0.45457063711911355, + "grad_norm": 0.6818089485168457, + "learning_rate": 4.9372386935717705e-06, + "loss": 0.7274, + "step": 1641 + }, + { + "epoch": 0.4548476454293629, + "grad_norm": 0.6777812242507935, + "learning_rate": 4.937157554940181e-06, + "loss": 0.7134, + "step": 1642 + }, + { + "epoch": 0.4551246537396122, + "grad_norm": 0.6755929589271545, + "learning_rate": 4.937076364561512e-06, + "loss": 0.6949, + "step": 1643 + }, + { + "epoch": 0.4554016620498615, + "grad_norm": 0.6555960774421692, + "learning_rate": 4.936995122437488e-06, + "loss": 0.6567, + "step": 1644 + }, + { + "epoch": 0.4556786703601108, + "grad_norm": 0.6834290027618408, + "learning_rate": 4.9369138285698345e-06, + "loss": 0.6743, + "step": 1645 + }, + { + "epoch": 0.45595567867036013, + "grad_norm": 0.640485405921936, + "learning_rate": 4.936832482960278e-06, + "loss": 0.663, + "step": 1646 + }, + { + "epoch": 0.4562326869806094, + "grad_norm": 0.6723644137382507, + "learning_rate": 4.936751085610543e-06, + "loss": 0.6687, + "step": 1647 + }, + { + "epoch": 0.4565096952908587, + "grad_norm": 0.7141152620315552, + "learning_rate": 4.936669636522361e-06, + "loss": 0.6685, + "step": 1648 + }, + { + "epoch": 0.456786703601108, + "grad_norm": 0.6581059694290161, + "learning_rate": 4.93658813569746e-06, + "loss": 0.7371, + "step": 1649 + }, + { + "epoch": 0.45706371191135736, + "grad_norm": 0.6734123826026917, + "learning_rate": 4.93650658313757e-06, + "loss": 0.6696, + "step": 1650 + }, + { + "epoch": 0.45734072022160666, + "grad_norm": 0.6516708135604858, + "learning_rate": 4.936424978844424e-06, + "loss": 0.6508, + "step": 1651 + }, + { + "epoch": 0.45761772853185595, + "grad_norm": 0.68487149477005, + "learning_rate": 4.9363433228197524e-06, + "loss": 0.7158, + "step": 1652 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 0.667688250541687, + "learning_rate": 4.936261615065292e-06, + "loss": 0.679, + "step": 1653 + }, + { + "epoch": 0.4581717451523546, + "grad_norm": 0.7019890546798706, + "learning_rate": 4.9361798555827745e-06, + "loss": 0.6734, + "step": 1654 + }, + { + "epoch": 0.4584487534626039, + "grad_norm": 0.6553559303283691, + "learning_rate": 4.936098044373938e-06, + "loss": 0.7148, + "step": 1655 + }, + { + "epoch": 0.4587257617728532, + "grad_norm": 0.680182933807373, + "learning_rate": 4.936016181440518e-06, + "loss": 0.6797, + "step": 1656 + }, + { + "epoch": 0.4590027700831025, + "grad_norm": 0.6735982298851013, + "learning_rate": 4.935934266784254e-06, + "loss": 0.7265, + "step": 1657 + }, + { + "epoch": 0.4592797783933518, + "grad_norm": 0.6884268522262573, + "learning_rate": 4.935852300406886e-06, + "loss": 0.7491, + "step": 1658 + }, + { + "epoch": 0.4595567867036011, + "grad_norm": 0.7029476761817932, + "learning_rate": 4.935770282310152e-06, + "loss": 0.6959, + "step": 1659 + }, + { + "epoch": 0.4598337950138504, + "grad_norm": 0.6848331093788147, + "learning_rate": 4.935688212495795e-06, + "loss": 0.7447, + "step": 1660 + }, + { + "epoch": 0.4601108033240997, + "grad_norm": 0.6873815655708313, + "learning_rate": 4.935606090965557e-06, + "loss": 0.7022, + "step": 1661 + }, + { + "epoch": 0.46038781163434905, + "grad_norm": 0.6748994588851929, + "learning_rate": 4.935523917721182e-06, + "loss": 0.732, + "step": 1662 + }, + { + "epoch": 0.46066481994459835, + "grad_norm": 0.6672415733337402, + "learning_rate": 4.935441692764415e-06, + "loss": 0.7035, + "step": 1663 + }, + { + "epoch": 0.46094182825484764, + "grad_norm": 0.7236402034759521, + "learning_rate": 4.935359416097001e-06, + "loss": 0.7004, + "step": 1664 + }, + { + "epoch": 0.46121883656509693, + "grad_norm": 0.6981213688850403, + "learning_rate": 4.9352770877206865e-06, + "loss": 0.686, + "step": 1665 + }, + { + "epoch": 0.4614958448753463, + "grad_norm": 0.7025541067123413, + "learning_rate": 4.935194707637222e-06, + "loss": 0.7322, + "step": 1666 + }, + { + "epoch": 0.4617728531855956, + "grad_norm": 0.6887792348861694, + "learning_rate": 4.935112275848354e-06, + "loss": 0.6834, + "step": 1667 + }, + { + "epoch": 0.46204986149584487, + "grad_norm": 0.6790734529495239, + "learning_rate": 4.935029792355834e-06, + "loss": 0.6889, + "step": 1668 + }, + { + "epoch": 0.46232686980609417, + "grad_norm": 0.6590743064880371, + "learning_rate": 4.934947257161413e-06, + "loss": 0.6941, + "step": 1669 + }, + { + "epoch": 0.4626038781163435, + "grad_norm": 0.7474731802940369, + "learning_rate": 4.934864670266844e-06, + "loss": 0.7576, + "step": 1670 + }, + { + "epoch": 0.4628808864265928, + "grad_norm": 0.6757420897483826, + "learning_rate": 4.934782031673879e-06, + "loss": 0.7377, + "step": 1671 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 0.668341338634491, + "learning_rate": 4.9346993413842755e-06, + "loss": 0.7288, + "step": 1672 + }, + { + "epoch": 0.4634349030470914, + "grad_norm": 0.6508594751358032, + "learning_rate": 4.934616599399786e-06, + "loss": 0.6765, + "step": 1673 + }, + { + "epoch": 0.46371191135734074, + "grad_norm": 0.6187620759010315, + "learning_rate": 4.934533805722169e-06, + "loss": 0.6626, + "step": 1674 + }, + { + "epoch": 0.46398891966759004, + "grad_norm": 0.6785850524902344, + "learning_rate": 4.934450960353183e-06, + "loss": 0.6788, + "step": 1675 + }, + { + "epoch": 0.46426592797783933, + "grad_norm": 0.6822246313095093, + "learning_rate": 4.934368063294586e-06, + "loss": 0.7357, + "step": 1676 + }, + { + "epoch": 0.4645429362880886, + "grad_norm": 0.6765061616897583, + "learning_rate": 4.934285114548137e-06, + "loss": 0.6875, + "step": 1677 + }, + { + "epoch": 0.464819944598338, + "grad_norm": 0.713993489742279, + "learning_rate": 4.9342021141156e-06, + "loss": 0.7364, + "step": 1678 + }, + { + "epoch": 0.46509695290858727, + "grad_norm": 0.7380864024162292, + "learning_rate": 4.934119061998735e-06, + "loss": 0.6715, + "step": 1679 + }, + { + "epoch": 0.46537396121883656, + "grad_norm": 0.6553770303726196, + "learning_rate": 4.934035958199307e-06, + "loss": 0.6442, + "step": 1680 + }, + { + "epoch": 0.46565096952908586, + "grad_norm": 0.6353982090950012, + "learning_rate": 4.9339528027190795e-06, + "loss": 0.7155, + "step": 1681 + }, + { + "epoch": 0.4659279778393352, + "grad_norm": 0.669859766960144, + "learning_rate": 4.933869595559818e-06, + "loss": 0.6769, + "step": 1682 + }, + { + "epoch": 0.4662049861495845, + "grad_norm": 0.6884238719940186, + "learning_rate": 4.9337863367232896e-06, + "loss": 0.6827, + "step": 1683 + }, + { + "epoch": 0.4664819944598338, + "grad_norm": 0.6490418314933777, + "learning_rate": 4.933703026211262e-06, + "loss": 0.6591, + "step": 1684 + }, + { + "epoch": 0.4667590027700831, + "grad_norm": 0.6696502566337585, + "learning_rate": 4.933619664025505e-06, + "loss": 0.6958, + "step": 1685 + }, + { + "epoch": 0.46703601108033244, + "grad_norm": 0.6910261511802673, + "learning_rate": 4.9335362501677865e-06, + "loss": 0.7157, + "step": 1686 + }, + { + "epoch": 0.46731301939058173, + "grad_norm": 0.6383054256439209, + "learning_rate": 4.933452784639879e-06, + "loss": 0.6681, + "step": 1687 + }, + { + "epoch": 0.467590027700831, + "grad_norm": 0.7032589316368103, + "learning_rate": 4.933369267443555e-06, + "loss": 0.7133, + "step": 1688 + }, + { + "epoch": 0.4678670360110803, + "grad_norm": 0.6746159791946411, + "learning_rate": 4.933285698580588e-06, + "loss": 0.7086, + "step": 1689 + }, + { + "epoch": 0.46814404432132967, + "grad_norm": 0.6999416351318359, + "learning_rate": 4.93320207805275e-06, + "loss": 0.7308, + "step": 1690 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 0.6476268172264099, + "learning_rate": 4.93311840586182e-06, + "loss": 0.6892, + "step": 1691 + }, + { + "epoch": 0.46869806094182825, + "grad_norm": 0.6434255242347717, + "learning_rate": 4.933034682009572e-06, + "loss": 0.7122, + "step": 1692 + }, + { + "epoch": 0.46897506925207755, + "grad_norm": 0.6608331203460693, + "learning_rate": 4.932950906497784e-06, + "loss": 0.6961, + "step": 1693 + }, + { + "epoch": 0.4692520775623269, + "grad_norm": 0.6635152101516724, + "learning_rate": 4.932867079328235e-06, + "loss": 0.7058, + "step": 1694 + }, + { + "epoch": 0.4695290858725762, + "grad_norm": 0.6834961771965027, + "learning_rate": 4.932783200502705e-06, + "loss": 0.6907, + "step": 1695 + }, + { + "epoch": 0.4698060941828255, + "grad_norm": 0.6840958595275879, + "learning_rate": 4.932699270022976e-06, + "loss": 0.7066, + "step": 1696 + }, + { + "epoch": 0.4700831024930748, + "grad_norm": 0.659977912902832, + "learning_rate": 4.932615287890828e-06, + "loss": 0.6874, + "step": 1697 + }, + { + "epoch": 0.4703601108033241, + "grad_norm": 0.6732885837554932, + "learning_rate": 4.932531254108046e-06, + "loss": 0.7228, + "step": 1698 + }, + { + "epoch": 0.4706371191135734, + "grad_norm": 0.7419905662536621, + "learning_rate": 4.932447168676414e-06, + "loss": 0.6943, + "step": 1699 + }, + { + "epoch": 0.4709141274238227, + "grad_norm": 0.6802665591239929, + "learning_rate": 4.932363031597715e-06, + "loss": 0.7504, + "step": 1700 + }, + { + "epoch": 0.471191135734072, + "grad_norm": 0.6577280759811401, + "learning_rate": 4.9322788428737385e-06, + "loss": 0.7208, + "step": 1701 + }, + { + "epoch": 0.4714681440443213, + "grad_norm": 0.6920000910758972, + "learning_rate": 4.932194602506271e-06, + "loss": 0.7027, + "step": 1702 + }, + { + "epoch": 0.47174515235457065, + "grad_norm": 0.6805425882339478, + "learning_rate": 4.9321103104971e-06, + "loss": 0.7249, + "step": 1703 + }, + { + "epoch": 0.47202216066481995, + "grad_norm": 0.6763489842414856, + "learning_rate": 4.932025966848017e-06, + "loss": 0.7165, + "step": 1704 + }, + { + "epoch": 0.47229916897506924, + "grad_norm": 0.6553173661231995, + "learning_rate": 4.931941571560812e-06, + "loss": 0.6503, + "step": 1705 + }, + { + "epoch": 0.47257617728531853, + "grad_norm": 0.6607550382614136, + "learning_rate": 4.931857124637276e-06, + "loss": 0.6886, + "step": 1706 + }, + { + "epoch": 0.4728531855955679, + "grad_norm": 0.6413080096244812, + "learning_rate": 4.931772626079204e-06, + "loss": 0.6842, + "step": 1707 + }, + { + "epoch": 0.4731301939058172, + "grad_norm": 0.6337507367134094, + "learning_rate": 4.931688075888389e-06, + "loss": 0.6366, + "step": 1708 + }, + { + "epoch": 0.47340720221606647, + "grad_norm": 0.6673510074615479, + "learning_rate": 4.931603474066626e-06, + "loss": 0.6353, + "step": 1709 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.651106595993042, + "learning_rate": 4.931518820615712e-06, + "loss": 0.715, + "step": 1710 + }, + { + "epoch": 0.4739612188365651, + "grad_norm": 0.6641266942024231, + "learning_rate": 4.9314341155374425e-06, + "loss": 0.7022, + "step": 1711 + }, + { + "epoch": 0.4742382271468144, + "grad_norm": 0.6641416549682617, + "learning_rate": 4.9313493588336196e-06, + "loss": 0.6821, + "step": 1712 + }, + { + "epoch": 0.4745152354570637, + "grad_norm": 0.6898216009140015, + "learning_rate": 4.9312645505060396e-06, + "loss": 0.6994, + "step": 1713 + }, + { + "epoch": 0.474792243767313, + "grad_norm": 0.66060471534729, + "learning_rate": 4.931179690556504e-06, + "loss": 0.6867, + "step": 1714 + }, + { + "epoch": 0.47506925207756234, + "grad_norm": 0.673663318157196, + "learning_rate": 4.931094778986816e-06, + "loss": 0.737, + "step": 1715 + }, + { + "epoch": 0.47534626038781164, + "grad_norm": 0.695510745048523, + "learning_rate": 4.931009815798777e-06, + "loss": 0.7038, + "step": 1716 + }, + { + "epoch": 0.47562326869806093, + "grad_norm": 0.6945306062698364, + "learning_rate": 4.930924800994192e-06, + "loss": 0.7442, + "step": 1717 + }, + { + "epoch": 0.4759002770083102, + "grad_norm": 0.6963913440704346, + "learning_rate": 4.930839734574865e-06, + "loss": 0.6816, + "step": 1718 + }, + { + "epoch": 0.4761772853185596, + "grad_norm": 0.7170199751853943, + "learning_rate": 4.930754616542603e-06, + "loss": 0.7057, + "step": 1719 + }, + { + "epoch": 0.47645429362880887, + "grad_norm": 0.6967073082923889, + "learning_rate": 4.930669446899213e-06, + "loss": 0.7028, + "step": 1720 + }, + { + "epoch": 0.47673130193905816, + "grad_norm": 0.7398955225944519, + "learning_rate": 4.9305842256465035e-06, + "loss": 0.7457, + "step": 1721 + }, + { + "epoch": 0.47700831024930745, + "grad_norm": 0.6501532196998596, + "learning_rate": 4.930498952786284e-06, + "loss": 0.6815, + "step": 1722 + }, + { + "epoch": 0.4772853185595568, + "grad_norm": 0.6739403605461121, + "learning_rate": 4.930413628320364e-06, + "loss": 0.7195, + "step": 1723 + }, + { + "epoch": 0.4775623268698061, + "grad_norm": 0.7090854048728943, + "learning_rate": 4.930328252250557e-06, + "loss": 0.7021, + "step": 1724 + }, + { + "epoch": 0.4778393351800554, + "grad_norm": 0.6927851438522339, + "learning_rate": 4.930242824578675e-06, + "loss": 0.6947, + "step": 1725 + }, + { + "epoch": 0.4781163434903047, + "grad_norm": 0.6775885224342346, + "learning_rate": 4.930157345306531e-06, + "loss": 0.6831, + "step": 1726 + }, + { + "epoch": 0.47839335180055403, + "grad_norm": 0.6729276776313782, + "learning_rate": 4.930071814435941e-06, + "loss": 0.7166, + "step": 1727 + }, + { + "epoch": 0.47867036011080333, + "grad_norm": 0.6515985727310181, + "learning_rate": 4.92998623196872e-06, + "loss": 0.6913, + "step": 1728 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 0.6873002052307129, + "learning_rate": 4.9299005979066865e-06, + "loss": 0.7161, + "step": 1729 + }, + { + "epoch": 0.4792243767313019, + "grad_norm": 0.6676026582717896, + "learning_rate": 4.929814912251658e-06, + "loss": 0.6837, + "step": 1730 + }, + { + "epoch": 0.47950138504155126, + "grad_norm": 0.6411203742027283, + "learning_rate": 4.929729175005453e-06, + "loss": 0.6805, + "step": 1731 + }, + { + "epoch": 0.47977839335180056, + "grad_norm": 0.6711311936378479, + "learning_rate": 4.929643386169894e-06, + "loss": 0.7175, + "step": 1732 + }, + { + "epoch": 0.48005540166204985, + "grad_norm": 0.6989825963973999, + "learning_rate": 4.9295575457468e-06, + "loss": 0.7289, + "step": 1733 + }, + { + "epoch": 0.48033240997229915, + "grad_norm": 0.683743953704834, + "learning_rate": 4.9294716537379965e-06, + "loss": 0.7232, + "step": 1734 + }, + { + "epoch": 0.4806094182825485, + "grad_norm": 0.6814141273498535, + "learning_rate": 4.929385710145304e-06, + "loss": 0.7092, + "step": 1735 + }, + { + "epoch": 0.4808864265927978, + "grad_norm": 0.6862533688545227, + "learning_rate": 4.92929971497055e-06, + "loss": 0.756, + "step": 1736 + }, + { + "epoch": 0.4811634349030471, + "grad_norm": 0.7218449115753174, + "learning_rate": 4.929213668215559e-06, + "loss": 0.7135, + "step": 1737 + }, + { + "epoch": 0.4814404432132964, + "grad_norm": 0.683336615562439, + "learning_rate": 4.929127569882158e-06, + "loss": 0.7294, + "step": 1738 + }, + { + "epoch": 0.4817174515235457, + "grad_norm": 0.6259140968322754, + "learning_rate": 4.9290414199721755e-06, + "loss": 0.6527, + "step": 1739 + }, + { + "epoch": 0.481994459833795, + "grad_norm": 0.6623820662498474, + "learning_rate": 4.92895521848744e-06, + "loss": 0.6523, + "step": 1740 + }, + { + "epoch": 0.4822714681440443, + "grad_norm": 0.6865464448928833, + "learning_rate": 4.928868965429783e-06, + "loss": 0.7055, + "step": 1741 + }, + { + "epoch": 0.4825484764542936, + "grad_norm": 0.7001657485961914, + "learning_rate": 4.928782660801035e-06, + "loss": 0.7136, + "step": 1742 + }, + { + "epoch": 0.48282548476454296, + "grad_norm": 0.6874849200248718, + "learning_rate": 4.928696304603029e-06, + "loss": 0.7043, + "step": 1743 + }, + { + "epoch": 0.48310249307479225, + "grad_norm": 0.6774317026138306, + "learning_rate": 4.928609896837599e-06, + "loss": 0.6533, + "step": 1744 + }, + { + "epoch": 0.48337950138504154, + "grad_norm": 0.689127504825592, + "learning_rate": 4.928523437506577e-06, + "loss": 0.7142, + "step": 1745 + }, + { + "epoch": 0.48365650969529084, + "grad_norm": 0.6922279596328735, + "learning_rate": 4.928436926611801e-06, + "loss": 0.7517, + "step": 1746 + }, + { + "epoch": 0.4839335180055402, + "grad_norm": 0.7059768438339233, + "learning_rate": 4.928350364155109e-06, + "loss": 0.6873, + "step": 1747 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 0.6696379780769348, + "learning_rate": 4.9282637501383365e-06, + "loss": 0.6752, + "step": 1748 + }, + { + "epoch": 0.4844875346260388, + "grad_norm": 0.6618900299072266, + "learning_rate": 4.9281770845633235e-06, + "loss": 0.6876, + "step": 1749 + }, + { + "epoch": 0.48476454293628807, + "grad_norm": 0.6845422387123108, + "learning_rate": 4.92809036743191e-06, + "loss": 0.7124, + "step": 1750 + }, + { + "epoch": 0.4850415512465374, + "grad_norm": 0.6621571779251099, + "learning_rate": 4.928003598745938e-06, + "loss": 0.6814, + "step": 1751 + }, + { + "epoch": 0.4853185595567867, + "grad_norm": 0.6772110462188721, + "learning_rate": 4.927916778507248e-06, + "loss": 0.6969, + "step": 1752 + }, + { + "epoch": 0.485595567867036, + "grad_norm": 0.6477333307266235, + "learning_rate": 4.927829906717685e-06, + "loss": 0.6031, + "step": 1753 + }, + { + "epoch": 0.4858725761772853, + "grad_norm": 0.6633053421974182, + "learning_rate": 4.927742983379094e-06, + "loss": 0.7203, + "step": 1754 + }, + { + "epoch": 0.48614958448753465, + "grad_norm": 0.6834869980812073, + "learning_rate": 4.927656008493318e-06, + "loss": 0.6837, + "step": 1755 + }, + { + "epoch": 0.48642659279778394, + "grad_norm": 0.6382920742034912, + "learning_rate": 4.9275689820622065e-06, + "loss": 0.7213, + "step": 1756 + }, + { + "epoch": 0.48670360110803323, + "grad_norm": 0.6626554727554321, + "learning_rate": 4.927481904087606e-06, + "loss": 0.7088, + "step": 1757 + }, + { + "epoch": 0.48698060941828253, + "grad_norm": 0.6923210620880127, + "learning_rate": 4.927394774571366e-06, + "loss": 0.6638, + "step": 1758 + }, + { + "epoch": 0.4872576177285319, + "grad_norm": 0.6683520674705505, + "learning_rate": 4.927307593515335e-06, + "loss": 0.6765, + "step": 1759 + }, + { + "epoch": 0.48753462603878117, + "grad_norm": 0.6730638146400452, + "learning_rate": 4.927220360921366e-06, + "loss": 0.7028, + "step": 1760 + }, + { + "epoch": 0.48781163434903047, + "grad_norm": 0.6665538549423218, + "learning_rate": 4.927133076791309e-06, + "loss": 0.6821, + "step": 1761 + }, + { + "epoch": 0.48808864265927976, + "grad_norm": 0.6897082924842834, + "learning_rate": 4.9270457411270204e-06, + "loss": 0.6895, + "step": 1762 + }, + { + "epoch": 0.4883656509695291, + "grad_norm": 0.6820263862609863, + "learning_rate": 4.926958353930352e-06, + "loss": 0.6818, + "step": 1763 + }, + { + "epoch": 0.4886426592797784, + "grad_norm": 0.7254486083984375, + "learning_rate": 4.92687091520316e-06, + "loss": 0.7015, + "step": 1764 + }, + { + "epoch": 0.4889196675900277, + "grad_norm": 0.705342710018158, + "learning_rate": 4.9267834249473005e-06, + "loss": 0.6557, + "step": 1765 + }, + { + "epoch": 0.489196675900277, + "grad_norm": 0.6940473914146423, + "learning_rate": 4.926695883164632e-06, + "loss": 0.718, + "step": 1766 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 0.6987528800964355, + "learning_rate": 4.926608289857012e-06, + "loss": 0.688, + "step": 1767 + }, + { + "epoch": 0.48975069252077563, + "grad_norm": 0.7007743716239929, + "learning_rate": 4.926520645026302e-06, + "loss": 0.6859, + "step": 1768 + }, + { + "epoch": 0.4900277008310249, + "grad_norm": 0.6608262062072754, + "learning_rate": 4.9264329486743615e-06, + "loss": 0.6685, + "step": 1769 + }, + { + "epoch": 0.4903047091412742, + "grad_norm": 0.6854458451271057, + "learning_rate": 4.9263452008030535e-06, + "loss": 0.7118, + "step": 1770 + }, + { + "epoch": 0.49058171745152357, + "grad_norm": 0.6807644963264465, + "learning_rate": 4.92625740141424e-06, + "loss": 0.6473, + "step": 1771 + }, + { + "epoch": 0.49085872576177286, + "grad_norm": 0.6337978839874268, + "learning_rate": 4.926169550509787e-06, + "loss": 0.6851, + "step": 1772 + }, + { + "epoch": 0.49113573407202216, + "grad_norm": 0.672011137008667, + "learning_rate": 4.926081648091559e-06, + "loss": 0.6875, + "step": 1773 + }, + { + "epoch": 0.49141274238227145, + "grad_norm": 0.7125269770622253, + "learning_rate": 4.925993694161421e-06, + "loss": 0.6748, + "step": 1774 + }, + { + "epoch": 0.4916897506925208, + "grad_norm": 0.6711752414703369, + "learning_rate": 4.925905688721241e-06, + "loss": 0.6989, + "step": 1775 + }, + { + "epoch": 0.4919667590027701, + "grad_norm": 0.6642163991928101, + "learning_rate": 4.925817631772889e-06, + "loss": 0.6894, + "step": 1776 + }, + { + "epoch": 0.4922437673130194, + "grad_norm": 0.6630699634552002, + "learning_rate": 4.925729523318233e-06, + "loss": 0.712, + "step": 1777 + }, + { + "epoch": 0.4925207756232687, + "grad_norm": 0.6922430992126465, + "learning_rate": 4.925641363359146e-06, + "loss": 0.6625, + "step": 1778 + }, + { + "epoch": 0.49279778393351803, + "grad_norm": 0.6554149389266968, + "learning_rate": 4.925553151897497e-06, + "loss": 0.6949, + "step": 1779 + }, + { + "epoch": 0.4930747922437673, + "grad_norm": 0.6485135555267334, + "learning_rate": 4.925464888935162e-06, + "loss": 0.6487, + "step": 1780 + }, + { + "epoch": 0.4933518005540166, + "grad_norm": 0.7048740386962891, + "learning_rate": 4.925376574474013e-06, + "loss": 0.6684, + "step": 1781 + }, + { + "epoch": 0.4936288088642659, + "grad_norm": 0.6761953234672546, + "learning_rate": 4.925288208515924e-06, + "loss": 0.6866, + "step": 1782 + }, + { + "epoch": 0.49390581717451526, + "grad_norm": 0.7035673260688782, + "learning_rate": 4.925199791062774e-06, + "loss": 0.7412, + "step": 1783 + }, + { + "epoch": 0.49418282548476455, + "grad_norm": 0.6940863728523254, + "learning_rate": 4.925111322116439e-06, + "loss": 0.6624, + "step": 1784 + }, + { + "epoch": 0.49445983379501385, + "grad_norm": 0.6827403903007507, + "learning_rate": 4.925022801678798e-06, + "loss": 0.6649, + "step": 1785 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 0.7142612338066101, + "learning_rate": 4.92493422975173e-06, + "loss": 0.6943, + "step": 1786 + }, + { + "epoch": 0.4950138504155125, + "grad_norm": 0.6687315702438354, + "learning_rate": 4.924845606337115e-06, + "loss": 0.6855, + "step": 1787 + }, + { + "epoch": 0.4952908587257618, + "grad_norm": 0.6995869874954224, + "learning_rate": 4.924756931436836e-06, + "loss": 0.6714, + "step": 1788 + }, + { + "epoch": 0.4955678670360111, + "grad_norm": 0.7208627462387085, + "learning_rate": 4.924668205052775e-06, + "loss": 0.6602, + "step": 1789 + }, + { + "epoch": 0.49584487534626037, + "grad_norm": 0.7047656774520874, + "learning_rate": 4.924579427186817e-06, + "loss": 0.7161, + "step": 1790 + }, + { + "epoch": 0.4961218836565097, + "grad_norm": 0.6512801051139832, + "learning_rate": 4.924490597840844e-06, + "loss": 0.6679, + "step": 1791 + }, + { + "epoch": 0.496398891966759, + "grad_norm": 0.6830857992172241, + "learning_rate": 4.924401717016746e-06, + "loss": 0.6823, + "step": 1792 + }, + { + "epoch": 0.4966759002770083, + "grad_norm": 0.6951817870140076, + "learning_rate": 4.924312784716407e-06, + "loss": 0.6831, + "step": 1793 + }, + { + "epoch": 0.4969529085872576, + "grad_norm": 0.6855754256248474, + "learning_rate": 4.924223800941718e-06, + "loss": 0.6707, + "step": 1794 + }, + { + "epoch": 0.49722991689750695, + "grad_norm": 0.7017086744308472, + "learning_rate": 4.924134765694566e-06, + "loss": 0.658, + "step": 1795 + }, + { + "epoch": 0.49750692520775625, + "grad_norm": 0.7068955302238464, + "learning_rate": 4.924045678976842e-06, + "loss": 0.7202, + "step": 1796 + }, + { + "epoch": 0.49778393351800554, + "grad_norm": 0.6873322129249573, + "learning_rate": 4.9239565407904385e-06, + "loss": 0.7351, + "step": 1797 + }, + { + "epoch": 0.49806094182825483, + "grad_norm": 0.6638666987419128, + "learning_rate": 4.923867351137247e-06, + "loss": 0.7046, + "step": 1798 + }, + { + "epoch": 0.4983379501385042, + "grad_norm": 0.680953323841095, + "learning_rate": 4.923778110019163e-06, + "loss": 0.6932, + "step": 1799 + }, + { + "epoch": 0.4986149584487535, + "grad_norm": 0.715457022190094, + "learning_rate": 4.923688817438079e-06, + "loss": 0.7159, + "step": 1800 + }, + { + "epoch": 0.49889196675900277, + "grad_norm": 0.6973574161529541, + "learning_rate": 4.923599473395892e-06, + "loss": 0.6758, + "step": 1801 + }, + { + "epoch": 0.49916897506925206, + "grad_norm": 0.6752958297729492, + "learning_rate": 4.9235100778944985e-06, + "loss": 0.7031, + "step": 1802 + }, + { + "epoch": 0.4994459833795014, + "grad_norm": 0.6299696564674377, + "learning_rate": 4.923420630935798e-06, + "loss": 0.6299, + "step": 1803 + }, + { + "epoch": 0.4997229916897507, + "grad_norm": 0.6795063018798828, + "learning_rate": 4.923331132521689e-06, + "loss": 0.6879, + "step": 1804 + }, + { + "epoch": 0.5, + "grad_norm": 0.6836988925933838, + "learning_rate": 4.923241582654071e-06, + "loss": 0.6935, + "step": 1805 + }, + { + "epoch": 0.5002770083102493, + "grad_norm": 0.6711650490760803, + "learning_rate": 4.9231519813348464e-06, + "loss": 0.6667, + "step": 1806 + }, + { + "epoch": 0.5005540166204986, + "grad_norm": 0.6842010617256165, + "learning_rate": 4.923062328565916e-06, + "loss": 0.667, + "step": 1807 + }, + { + "epoch": 0.5008310249307479, + "grad_norm": 0.6528140902519226, + "learning_rate": 4.922972624349185e-06, + "loss": 0.6881, + "step": 1808 + }, + { + "epoch": 0.5011080332409972, + "grad_norm": 0.7012379765510559, + "learning_rate": 4.922882868686558e-06, + "loss": 0.7278, + "step": 1809 + }, + { + "epoch": 0.5013850415512465, + "grad_norm": 0.6806972622871399, + "learning_rate": 4.92279306157994e-06, + "loss": 0.7221, + "step": 1810 + }, + { + "epoch": 0.5016620498614959, + "grad_norm": 0.7014011740684509, + "learning_rate": 4.922703203031239e-06, + "loss": 0.6982, + "step": 1811 + }, + { + "epoch": 0.5019390581717451, + "grad_norm": 0.6584193706512451, + "learning_rate": 4.922613293042361e-06, + "loss": 0.6485, + "step": 1812 + }, + { + "epoch": 0.5022160664819945, + "grad_norm": 0.6598594188690186, + "learning_rate": 4.922523331615217e-06, + "loss": 0.7386, + "step": 1813 + }, + { + "epoch": 0.5024930747922438, + "grad_norm": 0.7168267965316772, + "learning_rate": 4.922433318751716e-06, + "loss": 0.7082, + "step": 1814 + }, + { + "epoch": 0.502770083102493, + "grad_norm": 0.6848852038383484, + "learning_rate": 4.922343254453769e-06, + "loss": 0.7277, + "step": 1815 + }, + { + "epoch": 0.5030470914127424, + "grad_norm": 0.683768630027771, + "learning_rate": 4.9222531387232885e-06, + "loss": 0.6798, + "step": 1816 + }, + { + "epoch": 0.5033240997229916, + "grad_norm": 0.6405003070831299, + "learning_rate": 4.922162971562189e-06, + "loss": 0.6653, + "step": 1817 + }, + { + "epoch": 0.503601108033241, + "grad_norm": 0.6444788575172424, + "learning_rate": 4.922072752972383e-06, + "loss": 0.6698, + "step": 1818 + }, + { + "epoch": 0.5038781163434903, + "grad_norm": 0.6805223822593689, + "learning_rate": 4.921982482955788e-06, + "loss": 0.7485, + "step": 1819 + }, + { + "epoch": 0.5041551246537396, + "grad_norm": 0.6668921709060669, + "learning_rate": 4.921892161514319e-06, + "loss": 0.6912, + "step": 1820 + }, + { + "epoch": 0.5044321329639889, + "grad_norm": 0.663844883441925, + "learning_rate": 4.9218017886498955e-06, + "loss": 0.681, + "step": 1821 + }, + { + "epoch": 0.5047091412742383, + "grad_norm": 0.6882054805755615, + "learning_rate": 4.921711364364435e-06, + "loss": 0.6901, + "step": 1822 + }, + { + "epoch": 0.5049861495844875, + "grad_norm": 0.6432827115058899, + "learning_rate": 4.921620888659858e-06, + "loss": 0.6772, + "step": 1823 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 0.6826911568641663, + "learning_rate": 4.921530361538086e-06, + "loss": 0.6965, + "step": 1824 + }, + { + "epoch": 0.5055401662049861, + "grad_norm": 0.6841583847999573, + "learning_rate": 4.921439783001039e-06, + "loss": 0.7128, + "step": 1825 + }, + { + "epoch": 0.5058171745152354, + "grad_norm": 0.6850943565368652, + "learning_rate": 4.921349153050643e-06, + "loss": 0.6691, + "step": 1826 + }, + { + "epoch": 0.5060941828254848, + "grad_norm": 0.679665744304657, + "learning_rate": 4.92125847168882e-06, + "loss": 0.7019, + "step": 1827 + }, + { + "epoch": 0.506371191135734, + "grad_norm": 0.6384556293487549, + "learning_rate": 4.921167738917497e-06, + "loss": 0.6531, + "step": 1828 + }, + { + "epoch": 0.5066481994459834, + "grad_norm": 0.6422626972198486, + "learning_rate": 4.9210769547386e-06, + "loss": 0.6471, + "step": 1829 + }, + { + "epoch": 0.5069252077562327, + "grad_norm": 0.6919006109237671, + "learning_rate": 4.920986119154057e-06, + "loss": 0.6955, + "step": 1830 + }, + { + "epoch": 0.507202216066482, + "grad_norm": 0.6437128782272339, + "learning_rate": 4.920895232165795e-06, + "loss": 0.6925, + "step": 1831 + }, + { + "epoch": 0.5074792243767313, + "grad_norm": 0.6609724164009094, + "learning_rate": 4.920804293775746e-06, + "loss": 0.691, + "step": 1832 + }, + { + "epoch": 0.5077562326869806, + "grad_norm": 0.6840398907661438, + "learning_rate": 4.920713303985839e-06, + "loss": 0.6822, + "step": 1833 + }, + { + "epoch": 0.5080332409972299, + "grad_norm": 0.6742506623268127, + "learning_rate": 4.920622262798007e-06, + "loss": 0.7082, + "step": 1834 + }, + { + "epoch": 0.5083102493074793, + "grad_norm": 0.6945616602897644, + "learning_rate": 4.920531170214183e-06, + "loss": 0.6722, + "step": 1835 + }, + { + "epoch": 0.5085872576177285, + "grad_norm": 1.1759380102157593, + "learning_rate": 4.920440026236301e-06, + "loss": 0.6814, + "step": 1836 + }, + { + "epoch": 0.5088642659279778, + "grad_norm": 0.72176593542099, + "learning_rate": 4.920348830866296e-06, + "loss": 0.6785, + "step": 1837 + }, + { + "epoch": 0.5091412742382272, + "grad_norm": 0.6691416501998901, + "learning_rate": 4.920257584106104e-06, + "loss": 0.7269, + "step": 1838 + }, + { + "epoch": 0.5094182825484764, + "grad_norm": 0.7100477814674377, + "learning_rate": 4.9201662859576626e-06, + "loss": 0.6908, + "step": 1839 + }, + { + "epoch": 0.5096952908587258, + "grad_norm": 0.658385694026947, + "learning_rate": 4.920074936422911e-06, + "loss": 0.6894, + "step": 1840 + }, + { + "epoch": 0.509972299168975, + "grad_norm": 0.6802435517311096, + "learning_rate": 4.9199835355037885e-06, + "loss": 0.7067, + "step": 1841 + }, + { + "epoch": 0.5102493074792244, + "grad_norm": 0.6722739338874817, + "learning_rate": 4.919892083202235e-06, + "loss": 0.689, + "step": 1842 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 0.671492338180542, + "learning_rate": 4.919800579520193e-06, + "loss": 0.6686, + "step": 1843 + }, + { + "epoch": 0.510803324099723, + "grad_norm": 0.6547128558158875, + "learning_rate": 4.919709024459605e-06, + "loss": 0.703, + "step": 1844 + }, + { + "epoch": 0.5110803324099723, + "grad_norm": 0.6681481003761292, + "learning_rate": 4.9196174180224165e-06, + "loss": 0.6717, + "step": 1845 + }, + { + "epoch": 0.5113573407202217, + "grad_norm": 0.6683051586151123, + "learning_rate": 4.91952576021057e-06, + "loss": 0.6832, + "step": 1846 + }, + { + "epoch": 0.5116343490304709, + "grad_norm": 0.6824548840522766, + "learning_rate": 4.919434051026013e-06, + "loss": 0.6598, + "step": 1847 + }, + { + "epoch": 0.5119113573407202, + "grad_norm": 0.6966956257820129, + "learning_rate": 4.919342290470692e-06, + "loss": 0.7482, + "step": 1848 + }, + { + "epoch": 0.5121883656509695, + "grad_norm": 0.7106775641441345, + "learning_rate": 4.919250478546556e-06, + "loss": 0.6644, + "step": 1849 + }, + { + "epoch": 0.5124653739612188, + "grad_norm": 0.6864263415336609, + "learning_rate": 4.9191586152555555e-06, + "loss": 0.6907, + "step": 1850 + }, + { + "epoch": 0.5127423822714682, + "grad_norm": 0.6232467889785767, + "learning_rate": 4.919066700599639e-06, + "loss": 0.6006, + "step": 1851 + }, + { + "epoch": 0.5130193905817174, + "grad_norm": 0.6870824098587036, + "learning_rate": 4.918974734580759e-06, + "loss": 0.6423, + "step": 1852 + }, + { + "epoch": 0.5132963988919668, + "grad_norm": 0.6564236879348755, + "learning_rate": 4.9188827172008674e-06, + "loss": 0.7023, + "step": 1853 + }, + { + "epoch": 0.5135734072022161, + "grad_norm": 0.6429625153541565, + "learning_rate": 4.918790648461919e-06, + "loss": 0.6377, + "step": 1854 + }, + { + "epoch": 0.5138504155124654, + "grad_norm": 0.6833292841911316, + "learning_rate": 4.918698528365868e-06, + "loss": 0.7047, + "step": 1855 + }, + { + "epoch": 0.5141274238227147, + "grad_norm": 0.6977648138999939, + "learning_rate": 4.918606356914671e-06, + "loss": 0.6748, + "step": 1856 + }, + { + "epoch": 0.5144044321329639, + "grad_norm": 0.6762489676475525, + "learning_rate": 4.918514134110284e-06, + "loss": 0.7136, + "step": 1857 + }, + { + "epoch": 0.5146814404432133, + "grad_norm": 0.719980001449585, + "learning_rate": 4.918421859954666e-06, + "loss": 0.678, + "step": 1858 + }, + { + "epoch": 0.5149584487534626, + "grad_norm": 0.6446436047554016, + "learning_rate": 4.918329534449776e-06, + "loss": 0.682, + "step": 1859 + }, + { + "epoch": 0.5152354570637119, + "grad_norm": 0.6879029273986816, + "learning_rate": 4.918237157597574e-06, + "loss": 0.6936, + "step": 1860 + }, + { + "epoch": 0.5155124653739612, + "grad_norm": 0.6882273554801941, + "learning_rate": 4.918144729400022e-06, + "loss": 0.6932, + "step": 1861 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 0.660173237323761, + "learning_rate": 4.9180522498590814e-06, + "loss": 0.6947, + "step": 1862 + }, + { + "epoch": 0.5160664819944598, + "grad_norm": 0.682055652141571, + "learning_rate": 4.917959718976717e-06, + "loss": 0.7055, + "step": 1863 + }, + { + "epoch": 0.5163434903047092, + "grad_norm": 0.7079998850822449, + "learning_rate": 4.917867136754894e-06, + "loss": 0.6849, + "step": 1864 + }, + { + "epoch": 0.5166204986149584, + "grad_norm": 0.6993677616119385, + "learning_rate": 4.917774503195576e-06, + "loss": 0.7021, + "step": 1865 + }, + { + "epoch": 0.5168975069252078, + "grad_norm": 0.6869961619377136, + "learning_rate": 4.91768181830073e-06, + "loss": 0.7318, + "step": 1866 + }, + { + "epoch": 0.5171745152354571, + "grad_norm": 0.6560007929801941, + "learning_rate": 4.917589082072327e-06, + "loss": 0.6704, + "step": 1867 + }, + { + "epoch": 0.5174515235457063, + "grad_norm": 0.6822911500930786, + "learning_rate": 4.917496294512333e-06, + "loss": 0.7208, + "step": 1868 + }, + { + "epoch": 0.5177285318559557, + "grad_norm": 0.6258894801139832, + "learning_rate": 4.917403455622718e-06, + "loss": 0.6708, + "step": 1869 + }, + { + "epoch": 0.518005540166205, + "grad_norm": 0.6495811939239502, + "learning_rate": 4.917310565405456e-06, + "loss": 0.6872, + "step": 1870 + }, + { + "epoch": 0.5182825484764543, + "grad_norm": 0.6962676644325256, + "learning_rate": 4.917217623862516e-06, + "loss": 0.7156, + "step": 1871 + }, + { + "epoch": 0.5185595567867036, + "grad_norm": 0.6337746977806091, + "learning_rate": 4.917124630995874e-06, + "loss": 0.6518, + "step": 1872 + }, + { + "epoch": 0.5188365650969529, + "grad_norm": 0.6667745113372803, + "learning_rate": 4.917031586807503e-06, + "loss": 0.7224, + "step": 1873 + }, + { + "epoch": 0.5191135734072022, + "grad_norm": 0.6814945936203003, + "learning_rate": 4.91693849129938e-06, + "loss": 0.6924, + "step": 1874 + }, + { + "epoch": 0.5193905817174516, + "grad_norm": 0.7064763903617859, + "learning_rate": 4.916845344473479e-06, + "loss": 0.6947, + "step": 1875 + }, + { + "epoch": 0.5196675900277008, + "grad_norm": 0.7004358768463135, + "learning_rate": 4.91675214633178e-06, + "loss": 0.6945, + "step": 1876 + }, + { + "epoch": 0.5199445983379501, + "grad_norm": 0.6734568476676941, + "learning_rate": 4.916658896876262e-06, + "loss": 0.7157, + "step": 1877 + }, + { + "epoch": 0.5202216066481995, + "grad_norm": 0.6627720594406128, + "learning_rate": 4.916565596108903e-06, + "loss": 0.6442, + "step": 1878 + }, + { + "epoch": 0.5204986149584487, + "grad_norm": 0.666963517665863, + "learning_rate": 4.916472244031686e-06, + "loss": 0.6916, + "step": 1879 + }, + { + "epoch": 0.5207756232686981, + "grad_norm": 0.7252178192138672, + "learning_rate": 4.916378840646592e-06, + "loss": 0.6777, + "step": 1880 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 0.6848912835121155, + "learning_rate": 4.9162853859556044e-06, + "loss": 0.6795, + "step": 1881 + }, + { + "epoch": 0.5213296398891967, + "grad_norm": 0.6617556214332581, + "learning_rate": 4.916191879960708e-06, + "loss": 0.6869, + "step": 1882 + }, + { + "epoch": 0.521606648199446, + "grad_norm": 0.6472238898277283, + "learning_rate": 4.916098322663887e-06, + "loss": 0.6688, + "step": 1883 + }, + { + "epoch": 0.5218836565096953, + "grad_norm": 0.6495205163955688, + "learning_rate": 4.916004714067128e-06, + "loss": 0.6866, + "step": 1884 + }, + { + "epoch": 0.5221606648199446, + "grad_norm": 0.7360695004463196, + "learning_rate": 4.915911054172421e-06, + "loss": 0.6491, + "step": 1885 + }, + { + "epoch": 0.522437673130194, + "grad_norm": 0.6612504720687866, + "learning_rate": 4.915817342981751e-06, + "loss": 0.7256, + "step": 1886 + }, + { + "epoch": 0.5227146814404432, + "grad_norm": 0.6689541935920715, + "learning_rate": 4.915723580497111e-06, + "loss": 0.6962, + "step": 1887 + }, + { + "epoch": 0.5229916897506925, + "grad_norm": 0.7215927243232727, + "learning_rate": 4.915629766720489e-06, + "loss": 0.684, + "step": 1888 + }, + { + "epoch": 0.5232686980609418, + "grad_norm": 0.6565717458724976, + "learning_rate": 4.915535901653879e-06, + "loss": 0.7088, + "step": 1889 + }, + { + "epoch": 0.5235457063711911, + "grad_norm": 0.6787716150283813, + "learning_rate": 4.9154419852992725e-06, + "loss": 0.6987, + "step": 1890 + }, + { + "epoch": 0.5238227146814405, + "grad_norm": 0.6738946437835693, + "learning_rate": 4.915348017658666e-06, + "loss": 0.7169, + "step": 1891 + }, + { + "epoch": 0.5240997229916897, + "grad_norm": 0.6366645693778992, + "learning_rate": 4.915253998734051e-06, + "loss": 0.7423, + "step": 1892 + }, + { + "epoch": 0.5243767313019391, + "grad_norm": 0.6482120752334595, + "learning_rate": 4.9151599285274265e-06, + "loss": 0.7022, + "step": 1893 + }, + { + "epoch": 0.5246537396121884, + "grad_norm": 0.625254213809967, + "learning_rate": 4.915065807040789e-06, + "loss": 0.6335, + "step": 1894 + }, + { + "epoch": 0.5249307479224377, + "grad_norm": 0.6547906398773193, + "learning_rate": 4.914971634276137e-06, + "loss": 0.6519, + "step": 1895 + }, + { + "epoch": 0.525207756232687, + "grad_norm": 0.6612831354141235, + "learning_rate": 4.914877410235469e-06, + "loss": 0.713, + "step": 1896 + }, + { + "epoch": 0.5254847645429362, + "grad_norm": 0.6797193884849548, + "learning_rate": 4.914783134920788e-06, + "loss": 0.6793, + "step": 1897 + }, + { + "epoch": 0.5257617728531856, + "grad_norm": 0.665197491645813, + "learning_rate": 4.914688808334094e-06, + "loss": 0.6594, + "step": 1898 + }, + { + "epoch": 0.5260387811634349, + "grad_norm": 0.6539757251739502, + "learning_rate": 4.914594430477391e-06, + "loss": 0.7063, + "step": 1899 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.6706299185752869, + "learning_rate": 4.91450000135268e-06, + "loss": 0.6764, + "step": 1900 + }, + { + "epoch": 0.5265927977839335, + "grad_norm": 0.6564507484436035, + "learning_rate": 4.91440552096197e-06, + "loss": 0.6905, + "step": 1901 + }, + { + "epoch": 0.5268698060941829, + "grad_norm": 0.6501426696777344, + "learning_rate": 4.914310989307264e-06, + "loss": 0.7434, + "step": 1902 + }, + { + "epoch": 0.5271468144044321, + "grad_norm": 0.6355409026145935, + "learning_rate": 4.914216406390571e-06, + "loss": 0.7126, + "step": 1903 + }, + { + "epoch": 0.5274238227146815, + "grad_norm": 0.7012143731117249, + "learning_rate": 4.914121772213898e-06, + "loss": 0.7368, + "step": 1904 + }, + { + "epoch": 0.5277008310249307, + "grad_norm": 0.678661584854126, + "learning_rate": 4.914027086779254e-06, + "loss": 0.6804, + "step": 1905 + }, + { + "epoch": 0.52797783933518, + "grad_norm": 0.6787761449813843, + "learning_rate": 4.913932350088652e-06, + "loss": 0.7025, + "step": 1906 + }, + { + "epoch": 0.5282548476454294, + "grad_norm": 0.6858866810798645, + "learning_rate": 4.9138375621441e-06, + "loss": 0.7017, + "step": 1907 + }, + { + "epoch": 0.5285318559556786, + "grad_norm": 0.7146446704864502, + "learning_rate": 4.913742722947613e-06, + "loss": 0.7135, + "step": 1908 + }, + { + "epoch": 0.528808864265928, + "grad_norm": 0.6727023720741272, + "learning_rate": 4.9136478325012035e-06, + "loss": 0.6683, + "step": 1909 + }, + { + "epoch": 0.5290858725761773, + "grad_norm": 0.648564338684082, + "learning_rate": 4.913552890806888e-06, + "loss": 0.7056, + "step": 1910 + }, + { + "epoch": 0.5293628808864266, + "grad_norm": 0.678922712802887, + "learning_rate": 4.9134578978666796e-06, + "loss": 0.6711, + "step": 1911 + }, + { + "epoch": 0.5296398891966759, + "grad_norm": 0.6676106452941895, + "learning_rate": 4.913362853682597e-06, + "loss": 0.6928, + "step": 1912 + }, + { + "epoch": 0.5299168975069252, + "grad_norm": 0.68855220079422, + "learning_rate": 4.913267758256659e-06, + "loss": 0.713, + "step": 1913 + }, + { + "epoch": 0.5301939058171745, + "grad_norm": 0.6664445996284485, + "learning_rate": 4.913172611590882e-06, + "loss": 0.6194, + "step": 1914 + }, + { + "epoch": 0.5304709141274239, + "grad_norm": 0.7783384919166565, + "learning_rate": 4.913077413687289e-06, + "loss": 0.7466, + "step": 1915 + }, + { + "epoch": 0.5307479224376731, + "grad_norm": 0.6518760323524475, + "learning_rate": 4.9129821645479e-06, + "loss": 0.6592, + "step": 1916 + }, + { + "epoch": 0.5310249307479225, + "grad_norm": 0.6837249398231506, + "learning_rate": 4.912886864174738e-06, + "loss": 0.6914, + "step": 1917 + }, + { + "epoch": 0.5313019390581717, + "grad_norm": 0.6892364621162415, + "learning_rate": 4.912791512569826e-06, + "loss": 0.6914, + "step": 1918 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 0.6553475856781006, + "learning_rate": 4.912696109735188e-06, + "loss": 0.6355, + "step": 1919 + }, + { + "epoch": 0.5318559556786704, + "grad_norm": 0.6912408471107483, + "learning_rate": 4.91260065567285e-06, + "loss": 0.6365, + "step": 1920 + }, + { + "epoch": 0.5321329639889196, + "grad_norm": 0.6865231394767761, + "learning_rate": 4.9125051503848395e-06, + "loss": 0.7024, + "step": 1921 + }, + { + "epoch": 0.532409972299169, + "grad_norm": 0.6905850172042847, + "learning_rate": 4.912409593873184e-06, + "loss": 0.7336, + "step": 1922 + }, + { + "epoch": 0.5326869806094183, + "grad_norm": 0.6499538421630859, + "learning_rate": 4.912313986139911e-06, + "loss": 0.7127, + "step": 1923 + }, + { + "epoch": 0.5329639889196676, + "grad_norm": 0.6666481494903564, + "learning_rate": 4.912218327187053e-06, + "loss": 0.6548, + "step": 1924 + }, + { + "epoch": 0.5332409972299169, + "grad_norm": 0.7385328412055969, + "learning_rate": 4.91212261701664e-06, + "loss": 0.7113, + "step": 1925 + }, + { + "epoch": 0.5335180055401662, + "grad_norm": 0.710387110710144, + "learning_rate": 4.912026855630703e-06, + "loss": 0.7185, + "step": 1926 + }, + { + "epoch": 0.5337950138504155, + "grad_norm": 0.6992027163505554, + "learning_rate": 4.911931043031276e-06, + "loss": 0.6737, + "step": 1927 + }, + { + "epoch": 0.5340720221606648, + "grad_norm": 0.6654460430145264, + "learning_rate": 4.9118351792203945e-06, + "loss": 0.6781, + "step": 1928 + }, + { + "epoch": 0.5343490304709141, + "grad_norm": 0.6434751749038696, + "learning_rate": 4.911739264200093e-06, + "loss": 0.7154, + "step": 1929 + }, + { + "epoch": 0.5346260387811634, + "grad_norm": 0.6851822137832642, + "learning_rate": 4.911643297972407e-06, + "loss": 0.6425, + "step": 1930 + }, + { + "epoch": 0.5349030470914128, + "grad_norm": 0.6717454195022583, + "learning_rate": 4.911547280539376e-06, + "loss": 0.7232, + "step": 1931 + }, + { + "epoch": 0.535180055401662, + "grad_norm": 0.6625047326087952, + "learning_rate": 4.9114512119030376e-06, + "loss": 0.6991, + "step": 1932 + }, + { + "epoch": 0.5354570637119114, + "grad_norm": 0.6835640072822571, + "learning_rate": 4.911355092065432e-06, + "loss": 0.6726, + "step": 1933 + }, + { + "epoch": 0.5357340720221606, + "grad_norm": 0.6565011143684387, + "learning_rate": 4.9112589210285995e-06, + "loss": 0.705, + "step": 1934 + }, + { + "epoch": 0.53601108033241, + "grad_norm": 0.6798642873764038, + "learning_rate": 4.911162698794583e-06, + "loss": 0.7011, + "step": 1935 + }, + { + "epoch": 0.5362880886426593, + "grad_norm": 0.7038365602493286, + "learning_rate": 4.9110664253654235e-06, + "loss": 0.6952, + "step": 1936 + }, + { + "epoch": 0.5365650969529085, + "grad_norm": 0.691293478012085, + "learning_rate": 4.910970100743168e-06, + "loss": 0.7138, + "step": 1937 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 0.6723795533180237, + "learning_rate": 4.91087372492986e-06, + "loss": 0.676, + "step": 1938 + }, + { + "epoch": 0.5371191135734072, + "grad_norm": 0.6425924897193909, + "learning_rate": 4.910777297927546e-06, + "loss": 0.6654, + "step": 1939 + }, + { + "epoch": 0.5373961218836565, + "grad_norm": 0.6891401410102844, + "learning_rate": 4.910680819738274e-06, + "loss": 0.7162, + "step": 1940 + }, + { + "epoch": 0.5376731301939058, + "grad_norm": 0.6883473992347717, + "learning_rate": 4.910584290364091e-06, + "loss": 0.6866, + "step": 1941 + }, + { + "epoch": 0.5379501385041551, + "grad_norm": 0.6681075096130371, + "learning_rate": 4.910487709807048e-06, + "loss": 0.6435, + "step": 1942 + }, + { + "epoch": 0.5382271468144044, + "grad_norm": 0.6832405924797058, + "learning_rate": 4.9103910780691955e-06, + "loss": 0.6804, + "step": 1943 + }, + { + "epoch": 0.5385041551246538, + "grad_norm": 0.6717491745948792, + "learning_rate": 4.910294395152585e-06, + "loss": 0.6988, + "step": 1944 + }, + { + "epoch": 0.538781163434903, + "grad_norm": 0.7094027400016785, + "learning_rate": 4.9101976610592685e-06, + "loss": 0.6964, + "step": 1945 + }, + { + "epoch": 0.5390581717451524, + "grad_norm": 0.6554908752441406, + "learning_rate": 4.910100875791301e-06, + "loss": 0.6476, + "step": 1946 + }, + { + "epoch": 0.5393351800554017, + "grad_norm": 0.6950293779373169, + "learning_rate": 4.910004039350737e-06, + "loss": 0.6839, + "step": 1947 + }, + { + "epoch": 0.539612188365651, + "grad_norm": 0.6729973554611206, + "learning_rate": 4.909907151739634e-06, + "loss": 0.6865, + "step": 1948 + }, + { + "epoch": 0.5398891966759003, + "grad_norm": 0.6982777714729309, + "learning_rate": 4.909810212960047e-06, + "loss": 0.7073, + "step": 1949 + }, + { + "epoch": 0.5401662049861495, + "grad_norm": 0.6632289886474609, + "learning_rate": 4.9097132230140345e-06, + "loss": 0.6576, + "step": 1950 + }, + { + "epoch": 0.5404432132963989, + "grad_norm": 0.7004045844078064, + "learning_rate": 4.9096161819036585e-06, + "loss": 0.6677, + "step": 1951 + }, + { + "epoch": 0.5407202216066482, + "grad_norm": 0.6294344663619995, + "learning_rate": 4.9095190896309755e-06, + "loss": 0.6924, + "step": 1952 + }, + { + "epoch": 0.5409972299168975, + "grad_norm": 0.6689262390136719, + "learning_rate": 4.90942194619805e-06, + "loss": 0.726, + "step": 1953 + }, + { + "epoch": 0.5412742382271468, + "grad_norm": 0.6792681813240051, + "learning_rate": 4.909324751606943e-06, + "loss": 0.6986, + "step": 1954 + }, + { + "epoch": 0.5415512465373962, + "grad_norm": 0.6492560505867004, + "learning_rate": 4.9092275058597195e-06, + "loss": 0.6953, + "step": 1955 + }, + { + "epoch": 0.5418282548476454, + "grad_norm": 0.6609320044517517, + "learning_rate": 4.909130208958443e-06, + "loss": 0.6908, + "step": 1956 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 0.6573171615600586, + "learning_rate": 4.909032860905179e-06, + "loss": 0.635, + "step": 1957 + }, + { + "epoch": 0.542382271468144, + "grad_norm": 0.6838486194610596, + "learning_rate": 4.9089354617019975e-06, + "loss": 0.7012, + "step": 1958 + }, + { + "epoch": 0.5426592797783933, + "grad_norm": 0.6595408916473389, + "learning_rate": 4.908838011350963e-06, + "loss": 0.6127, + "step": 1959 + }, + { + "epoch": 0.5429362880886427, + "grad_norm": 0.6571938395500183, + "learning_rate": 4.908740509854146e-06, + "loss": 0.6551, + "step": 1960 + }, + { + "epoch": 0.5432132963988919, + "grad_norm": 0.6965537071228027, + "learning_rate": 4.908642957213617e-06, + "loss": 0.6995, + "step": 1961 + }, + { + "epoch": 0.5434903047091413, + "grad_norm": 0.6637973189353943, + "learning_rate": 4.9085453534314474e-06, + "loss": 0.6816, + "step": 1962 + }, + { + "epoch": 0.5437673130193906, + "grad_norm": 0.6605237722396851, + "learning_rate": 4.908447698509709e-06, + "loss": 0.6722, + "step": 1963 + }, + { + "epoch": 0.5440443213296399, + "grad_norm": 0.6632689833641052, + "learning_rate": 4.908349992450475e-06, + "loss": 0.6715, + "step": 1964 + }, + { + "epoch": 0.5443213296398892, + "grad_norm": 0.6106252670288086, + "learning_rate": 4.9082522352558215e-06, + "loss": 0.6358, + "step": 1965 + }, + { + "epoch": 0.5445983379501385, + "grad_norm": 0.6894325613975525, + "learning_rate": 4.9081544269278225e-06, + "loss": 0.7247, + "step": 1966 + }, + { + "epoch": 0.5448753462603878, + "grad_norm": 0.6844300627708435, + "learning_rate": 4.908056567468554e-06, + "loss": 0.6752, + "step": 1967 + }, + { + "epoch": 0.5451523545706372, + "grad_norm": 0.671852171421051, + "learning_rate": 4.907958656880097e-06, + "loss": 0.6932, + "step": 1968 + }, + { + "epoch": 0.5454293628808864, + "grad_norm": 0.6739179491996765, + "learning_rate": 4.907860695164527e-06, + "loss": 0.6638, + "step": 1969 + }, + { + "epoch": 0.5457063711911357, + "grad_norm": 0.6991792321205139, + "learning_rate": 4.907762682323926e-06, + "loss": 0.6845, + "step": 1970 + }, + { + "epoch": 0.5459833795013851, + "grad_norm": 0.6290937662124634, + "learning_rate": 4.907664618360375e-06, + "loss": 0.6629, + "step": 1971 + }, + { + "epoch": 0.5462603878116343, + "grad_norm": 0.7173877954483032, + "learning_rate": 4.907566503275955e-06, + "loss": 0.6929, + "step": 1972 + }, + { + "epoch": 0.5465373961218837, + "grad_norm": 0.667913556098938, + "learning_rate": 4.90746833707275e-06, + "loss": 0.6246, + "step": 1973 + }, + { + "epoch": 0.5468144044321329, + "grad_norm": 0.6834477782249451, + "learning_rate": 4.907370119752844e-06, + "loss": 0.7205, + "step": 1974 + }, + { + "epoch": 0.5470914127423823, + "grad_norm": 0.7111289501190186, + "learning_rate": 4.9072718513183235e-06, + "loss": 0.6614, + "step": 1975 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 0.7072902917861938, + "learning_rate": 4.9071735317712735e-06, + "loss": 0.6774, + "step": 1976 + }, + { + "epoch": 0.5476454293628809, + "grad_norm": 0.6918482184410095, + "learning_rate": 4.907075161113782e-06, + "loss": 0.6708, + "step": 1977 + }, + { + "epoch": 0.5479224376731302, + "grad_norm": 0.6814166307449341, + "learning_rate": 4.906976739347938e-06, + "loss": 0.7135, + "step": 1978 + }, + { + "epoch": 0.5481994459833796, + "grad_norm": 0.6976650357246399, + "learning_rate": 4.906878266475831e-06, + "loss": 0.7145, + "step": 1979 + }, + { + "epoch": 0.5484764542936288, + "grad_norm": 0.6577694416046143, + "learning_rate": 4.906779742499551e-06, + "loss": 0.7047, + "step": 1980 + }, + { + "epoch": 0.5487534626038781, + "grad_norm": 0.642149806022644, + "learning_rate": 4.906681167421192e-06, + "loss": 0.6948, + "step": 1981 + }, + { + "epoch": 0.5490304709141274, + "grad_norm": 0.6713173985481262, + "learning_rate": 4.906582541242846e-06, + "loss": 0.6845, + "step": 1982 + }, + { + "epoch": 0.5493074792243767, + "grad_norm": 0.6460973024368286, + "learning_rate": 4.906483863966606e-06, + "loss": 0.7429, + "step": 1983 + }, + { + "epoch": 0.5495844875346261, + "grad_norm": 0.6690168380737305, + "learning_rate": 4.906385135594569e-06, + "loss": 0.6535, + "step": 1984 + }, + { + "epoch": 0.5498614958448753, + "grad_norm": 0.6556865572929382, + "learning_rate": 4.9062863561288295e-06, + "loss": 0.6635, + "step": 1985 + }, + { + "epoch": 0.5501385041551247, + "grad_norm": 0.675041675567627, + "learning_rate": 4.9061875255714864e-06, + "loss": 0.6961, + "step": 1986 + }, + { + "epoch": 0.550415512465374, + "grad_norm": 0.6664207577705383, + "learning_rate": 4.906088643924637e-06, + "loss": 0.6739, + "step": 1987 + }, + { + "epoch": 0.5506925207756233, + "grad_norm": 0.6541404128074646, + "learning_rate": 4.9059897111903806e-06, + "loss": 0.6884, + "step": 1988 + }, + { + "epoch": 0.5509695290858726, + "grad_norm": 0.7333690524101257, + "learning_rate": 4.90589072737082e-06, + "loss": 0.6715, + "step": 1989 + }, + { + "epoch": 0.5512465373961218, + "grad_norm": 0.6783778667449951, + "learning_rate": 4.905791692468054e-06, + "loss": 0.638, + "step": 1990 + }, + { + "epoch": 0.5515235457063712, + "grad_norm": 0.6893191933631897, + "learning_rate": 4.905692606484187e-06, + "loss": 0.6847, + "step": 1991 + }, + { + "epoch": 0.5518005540166205, + "grad_norm": 0.654119074344635, + "learning_rate": 4.905593469421323e-06, + "loss": 0.6741, + "step": 1992 + }, + { + "epoch": 0.5520775623268698, + "grad_norm": 0.7027085423469543, + "learning_rate": 4.905494281281565e-06, + "loss": 0.7323, + "step": 1993 + }, + { + "epoch": 0.5523545706371191, + "grad_norm": 0.6599571704864502, + "learning_rate": 4.9053950420670216e-06, + "loss": 0.7109, + "step": 1994 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.6476833820343018, + "learning_rate": 4.905295751779799e-06, + "loss": 0.6835, + "step": 1995 + }, + { + "epoch": 0.5529085872576177, + "grad_norm": 0.6545064449310303, + "learning_rate": 4.905196410422004e-06, + "loss": 0.6604, + "step": 1996 + }, + { + "epoch": 0.5531855955678671, + "grad_norm": 0.6796197295188904, + "learning_rate": 4.905097017995748e-06, + "loss": 0.6958, + "step": 1997 + }, + { + "epoch": 0.5534626038781163, + "grad_norm": 0.7135651111602783, + "learning_rate": 4.90499757450314e-06, + "loss": 0.6845, + "step": 1998 + }, + { + "epoch": 0.5537396121883656, + "grad_norm": 0.6246007680892944, + "learning_rate": 4.904898079946292e-06, + "loss": 0.65, + "step": 1999 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 0.6683261394500732, + "learning_rate": 4.9047985343273156e-06, + "loss": 0.7332, + "step": 2000 + }, + { + "epoch": 0.5542936288088642, + "grad_norm": 0.7060092687606812, + "learning_rate": 4.9046989376483255e-06, + "loss": 0.6491, + "step": 2001 + }, + { + "epoch": 0.5545706371191136, + "grad_norm": 0.6899551153182983, + "learning_rate": 4.904599289911437e-06, + "loss": 0.7351, + "step": 2002 + }, + { + "epoch": 0.5548476454293629, + "grad_norm": 0.7160254120826721, + "learning_rate": 4.9044995911187635e-06, + "loss": 0.7185, + "step": 2003 + }, + { + "epoch": 0.5551246537396122, + "grad_norm": 0.6531845927238464, + "learning_rate": 4.904399841272423e-06, + "loss": 0.6797, + "step": 2004 + }, + { + "epoch": 0.5554016620498615, + "grad_norm": 0.7408345341682434, + "learning_rate": 4.904300040374536e-06, + "loss": 0.7409, + "step": 2005 + }, + { + "epoch": 0.5556786703601108, + "grad_norm": 0.7078654170036316, + "learning_rate": 4.904200188427217e-06, + "loss": 0.6958, + "step": 2006 + }, + { + "epoch": 0.5559556786703601, + "grad_norm": 0.6584929823875427, + "learning_rate": 4.904100285432589e-06, + "loss": 0.7117, + "step": 2007 + }, + { + "epoch": 0.5562326869806095, + "grad_norm": 0.7188738584518433, + "learning_rate": 4.904000331392773e-06, + "loss": 0.6743, + "step": 2008 + }, + { + "epoch": 0.5565096952908587, + "grad_norm": 0.7072994709014893, + "learning_rate": 4.903900326309891e-06, + "loss": 0.6766, + "step": 2009 + }, + { + "epoch": 0.556786703601108, + "grad_norm": 0.6705852150917053, + "learning_rate": 4.903800270186065e-06, + "loss": 0.718, + "step": 2010 + }, + { + "epoch": 0.5570637119113574, + "grad_norm": 0.6577473282814026, + "learning_rate": 4.903700163023421e-06, + "loss": 0.6679, + "step": 2011 + }, + { + "epoch": 0.5573407202216066, + "grad_norm": 0.6974996328353882, + "learning_rate": 4.903600004824085e-06, + "loss": 0.7009, + "step": 2012 + }, + { + "epoch": 0.557617728531856, + "grad_norm": 0.6663270592689514, + "learning_rate": 4.903499795590182e-06, + "loss": 0.6532, + "step": 2013 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 0.689300537109375, + "learning_rate": 4.90339953532384e-06, + "loss": 0.6609, + "step": 2014 + }, + { + "epoch": 0.5581717451523546, + "grad_norm": 0.693435788154602, + "learning_rate": 4.90329922402719e-06, + "loss": 0.6501, + "step": 2015 + }, + { + "epoch": 0.5584487534626039, + "grad_norm": 0.662970781326294, + "learning_rate": 4.903198861702359e-06, + "loss": 0.7003, + "step": 2016 + }, + { + "epoch": 0.5587257617728532, + "grad_norm": 0.6968392133712769, + "learning_rate": 4.903098448351479e-06, + "loss": 0.7019, + "step": 2017 + }, + { + "epoch": 0.5590027700831025, + "grad_norm": 0.6442001461982727, + "learning_rate": 4.902997983976682e-06, + "loss": 0.6748, + "step": 2018 + }, + { + "epoch": 0.5592797783933517, + "grad_norm": 0.6815208196640015, + "learning_rate": 4.902897468580101e-06, + "loss": 0.6804, + "step": 2019 + }, + { + "epoch": 0.5595567867036011, + "grad_norm": 0.6363765597343445, + "learning_rate": 4.902796902163871e-06, + "loss": 0.6565, + "step": 2020 + }, + { + "epoch": 0.5598337950138504, + "grad_norm": 0.6487285494804382, + "learning_rate": 4.9026962847301265e-06, + "loss": 0.6293, + "step": 2021 + }, + { + "epoch": 0.5601108033240997, + "grad_norm": 0.6575140357017517, + "learning_rate": 4.902595616281004e-06, + "loss": 0.6957, + "step": 2022 + }, + { + "epoch": 0.560387811634349, + "grad_norm": 0.7005342245101929, + "learning_rate": 4.9024948968186414e-06, + "loss": 0.7377, + "step": 2023 + }, + { + "epoch": 0.5606648199445984, + "grad_norm": 0.6797147989273071, + "learning_rate": 4.902394126345177e-06, + "loss": 0.7089, + "step": 2024 + }, + { + "epoch": 0.5609418282548476, + "grad_norm": 0.6822004318237305, + "learning_rate": 4.9022933048627496e-06, + "loss": 0.7156, + "step": 2025 + }, + { + "epoch": 0.561218836565097, + "grad_norm": 0.6667379140853882, + "learning_rate": 4.902192432373501e-06, + "loss": 0.695, + "step": 2026 + }, + { + "epoch": 0.5614958448753462, + "grad_norm": 0.6670651435852051, + "learning_rate": 4.902091508879572e-06, + "loss": 0.6792, + "step": 2027 + }, + { + "epoch": 0.5617728531855956, + "grad_norm": 0.6695021390914917, + "learning_rate": 4.901990534383106e-06, + "loss": 0.7117, + "step": 2028 + }, + { + "epoch": 0.5620498614958449, + "grad_norm": 0.6924020051956177, + "learning_rate": 4.901889508886247e-06, + "loss": 0.6988, + "step": 2029 + }, + { + "epoch": 0.5623268698060941, + "grad_norm": 0.6992893218994141, + "learning_rate": 4.90178843239114e-06, + "loss": 0.6979, + "step": 2030 + }, + { + "epoch": 0.5626038781163435, + "grad_norm": 0.6664560437202454, + "learning_rate": 4.9016873048999315e-06, + "loss": 0.6932, + "step": 2031 + }, + { + "epoch": 0.5628808864265928, + "grad_norm": 0.6536049246788025, + "learning_rate": 4.901586126414768e-06, + "loss": 0.6256, + "step": 2032 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 0.7021706104278564, + "learning_rate": 4.901484896937798e-06, + "loss": 0.6799, + "step": 2033 + }, + { + "epoch": 0.5634349030470914, + "grad_norm": 0.6776540279388428, + "learning_rate": 4.901383616471171e-06, + "loss": 0.6923, + "step": 2034 + }, + { + "epoch": 0.5637119113573407, + "grad_norm": 0.6760473251342773, + "learning_rate": 4.901282285017037e-06, + "loss": 0.6688, + "step": 2035 + }, + { + "epoch": 0.56398891966759, + "grad_norm": 0.6964700818061829, + "learning_rate": 4.901180902577549e-06, + "loss": 0.6396, + "step": 2036 + }, + { + "epoch": 0.5642659279778394, + "grad_norm": 0.6710968613624573, + "learning_rate": 4.901079469154857e-06, + "loss": 0.6854, + "step": 2037 + }, + { + "epoch": 0.5645429362880886, + "grad_norm": 0.7143495678901672, + "learning_rate": 4.900977984751117e-06, + "loss": 0.7059, + "step": 2038 + }, + { + "epoch": 0.564819944598338, + "grad_norm": 0.6527528166770935, + "learning_rate": 4.900876449368483e-06, + "loss": 0.6767, + "step": 2039 + }, + { + "epoch": 0.5650969529085873, + "grad_norm": 0.677239716053009, + "learning_rate": 4.90077486300911e-06, + "loss": 0.6588, + "step": 2040 + }, + { + "epoch": 0.5653739612188365, + "grad_norm": 0.728654682636261, + "learning_rate": 4.900673225675157e-06, + "loss": 0.6777, + "step": 2041 + }, + { + "epoch": 0.5656509695290859, + "grad_norm": 0.6956666111946106, + "learning_rate": 4.90057153736878e-06, + "loss": 0.6898, + "step": 2042 + }, + { + "epoch": 0.5659279778393351, + "grad_norm": 0.6565061807632446, + "learning_rate": 4.900469798092139e-06, + "loss": 0.6801, + "step": 2043 + }, + { + "epoch": 0.5662049861495845, + "grad_norm": 0.6923288702964783, + "learning_rate": 4.900368007847394e-06, + "loss": 0.7155, + "step": 2044 + }, + { + "epoch": 0.5664819944598338, + "grad_norm": 0.6817392706871033, + "learning_rate": 4.900266166636707e-06, + "loss": 0.7134, + "step": 2045 + }, + { + "epoch": 0.5667590027700831, + "grad_norm": 0.6653991341590881, + "learning_rate": 4.900164274462239e-06, + "loss": 0.6989, + "step": 2046 + }, + { + "epoch": 0.5670360110803324, + "grad_norm": 0.671635091304779, + "learning_rate": 4.900062331326154e-06, + "loss": 0.6963, + "step": 2047 + }, + { + "epoch": 0.5673130193905818, + "grad_norm": 0.7176443934440613, + "learning_rate": 4.899960337230617e-06, + "loss": 0.6653, + "step": 2048 + }, + { + "epoch": 0.567590027700831, + "grad_norm": 0.6723015904426575, + "learning_rate": 4.899858292177792e-06, + "loss": 0.6992, + "step": 2049 + }, + { + "epoch": 0.5678670360110804, + "grad_norm": 0.6572104096412659, + "learning_rate": 4.899756196169849e-06, + "loss": 0.6819, + "step": 2050 + }, + { + "epoch": 0.5681440443213296, + "grad_norm": 0.6798298954963684, + "learning_rate": 4.899654049208952e-06, + "loss": 0.6943, + "step": 2051 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 0.6697311401367188, + "learning_rate": 4.8995518512972726e-06, + "loss": 0.694, + "step": 2052 + }, + { + "epoch": 0.5686980609418283, + "grad_norm": 0.6293474435806274, + "learning_rate": 4.8994496024369786e-06, + "loss": 0.6486, + "step": 2053 + }, + { + "epoch": 0.5689750692520775, + "grad_norm": 0.6766144633293152, + "learning_rate": 4.899347302630242e-06, + "loss": 0.7201, + "step": 2054 + }, + { + "epoch": 0.5692520775623269, + "grad_norm": 0.6531354784965515, + "learning_rate": 4.899244951879236e-06, + "loss": 0.6748, + "step": 2055 + }, + { + "epoch": 0.5695290858725762, + "grad_norm": 0.6813303828239441, + "learning_rate": 4.899142550186132e-06, + "loss": 0.6287, + "step": 2056 + }, + { + "epoch": 0.5698060941828255, + "grad_norm": 0.6930283904075623, + "learning_rate": 4.899040097553105e-06, + "loss": 0.7111, + "step": 2057 + }, + { + "epoch": 0.5700831024930748, + "grad_norm": 0.6502225399017334, + "learning_rate": 4.8989375939823305e-06, + "loss": 0.6785, + "step": 2058 + }, + { + "epoch": 0.570360110803324, + "grad_norm": 0.6486106514930725, + "learning_rate": 4.898835039475984e-06, + "loss": 0.6774, + "step": 2059 + }, + { + "epoch": 0.5706371191135734, + "grad_norm": 0.676793098449707, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.697, + "step": 2060 + }, + { + "epoch": 0.5709141274238227, + "grad_norm": 0.6627329587936401, + "learning_rate": 4.898629777665288e-06, + "loss": 0.6715, + "step": 2061 + }, + { + "epoch": 0.571191135734072, + "grad_norm": 0.7187315225601196, + "learning_rate": 4.898527070365297e-06, + "loss": 0.6544, + "step": 2062 + }, + { + "epoch": 0.5714681440443213, + "grad_norm": 0.6739577054977417, + "learning_rate": 4.898424312138452e-06, + "loss": 0.629, + "step": 2063 + }, + { + "epoch": 0.5717451523545707, + "grad_norm": 0.6906730532646179, + "learning_rate": 4.898321502986934e-06, + "loss": 0.706, + "step": 2064 + }, + { + "epoch": 0.5720221606648199, + "grad_norm": 0.6810632348060608, + "learning_rate": 4.898218642912924e-06, + "loss": 0.6717, + "step": 2065 + }, + { + "epoch": 0.5722991689750693, + "grad_norm": 0.6437877416610718, + "learning_rate": 4.898115731918609e-06, + "loss": 0.6883, + "step": 2066 + }, + { + "epoch": 0.5725761772853185, + "grad_norm": 0.6814095377922058, + "learning_rate": 4.898012770006173e-06, + "loss": 0.7277, + "step": 2067 + }, + { + "epoch": 0.5728531855955679, + "grad_norm": 0.6995571255683899, + "learning_rate": 4.897909757177801e-06, + "loss": 0.6707, + "step": 2068 + }, + { + "epoch": 0.5731301939058172, + "grad_norm": 0.709454357624054, + "learning_rate": 4.897806693435683e-06, + "loss": 0.6806, + "step": 2069 + }, + { + "epoch": 0.5734072022160664, + "grad_norm": 0.6949672698974609, + "learning_rate": 4.897703578782004e-06, + "loss": 0.7012, + "step": 2070 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 0.6759223341941833, + "learning_rate": 4.8976004132189555e-06, + "loss": 0.6652, + "step": 2071 + }, + { + "epoch": 0.5739612188365651, + "grad_norm": 0.710740327835083, + "learning_rate": 4.897497196748728e-06, + "loss": 0.7139, + "step": 2072 + }, + { + "epoch": 0.5742382271468144, + "grad_norm": 0.6668627858161926, + "learning_rate": 4.897393929373512e-06, + "loss": 0.7286, + "step": 2073 + }, + { + "epoch": 0.5745152354570637, + "grad_norm": 0.6559934616088867, + "learning_rate": 4.8972906110955e-06, + "loss": 0.6539, + "step": 2074 + }, + { + "epoch": 0.574792243767313, + "grad_norm": 0.6624038219451904, + "learning_rate": 4.897187241916888e-06, + "loss": 0.6742, + "step": 2075 + }, + { + "epoch": 0.5750692520775623, + "grad_norm": 0.6761890649795532, + "learning_rate": 4.897083821839868e-06, + "loss": 0.6862, + "step": 2076 + }, + { + "epoch": 0.5753462603878117, + "grad_norm": 0.7120620012283325, + "learning_rate": 4.896980350866637e-06, + "loss": 0.7066, + "step": 2077 + }, + { + "epoch": 0.5756232686980609, + "grad_norm": 0.6602346301078796, + "learning_rate": 4.896876828999391e-06, + "loss": 0.6686, + "step": 2078 + }, + { + "epoch": 0.5759002770083103, + "grad_norm": 0.6451171040534973, + "learning_rate": 4.89677325624033e-06, + "loss": 0.6283, + "step": 2079 + }, + { + "epoch": 0.5761772853185596, + "grad_norm": 0.6720165014266968, + "learning_rate": 4.896669632591652e-06, + "loss": 0.6713, + "step": 2080 + }, + { + "epoch": 0.5764542936288088, + "grad_norm": 0.6697086691856384, + "learning_rate": 4.896565958055557e-06, + "loss": 0.6797, + "step": 2081 + }, + { + "epoch": 0.5767313019390582, + "grad_norm": 0.7319167256355286, + "learning_rate": 4.896462232634247e-06, + "loss": 0.6524, + "step": 2082 + }, + { + "epoch": 0.5770083102493074, + "grad_norm": 0.6829770803451538, + "learning_rate": 4.896358456329924e-06, + "loss": 0.6859, + "step": 2083 + }, + { + "epoch": 0.5772853185595568, + "grad_norm": 0.6501836180686951, + "learning_rate": 4.896254629144791e-06, + "loss": 0.6848, + "step": 2084 + }, + { + "epoch": 0.5775623268698061, + "grad_norm": 0.6823615431785583, + "learning_rate": 4.896150751081052e-06, + "loss": 0.689, + "step": 2085 + }, + { + "epoch": 0.5778393351800554, + "grad_norm": 0.6782605648040771, + "learning_rate": 4.896046822140915e-06, + "loss": 0.6689, + "step": 2086 + }, + { + "epoch": 0.5781163434903047, + "grad_norm": 0.6973512768745422, + "learning_rate": 4.895942842326584e-06, + "loss": 0.6894, + "step": 2087 + }, + { + "epoch": 0.5783933518005541, + "grad_norm": 0.7102792859077454, + "learning_rate": 4.895838811640269e-06, + "loss": 0.6893, + "step": 2088 + }, + { + "epoch": 0.5786703601108033, + "grad_norm": 0.6860445141792297, + "learning_rate": 4.895734730084176e-06, + "loss": 0.7189, + "step": 2089 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.6870277523994446, + "learning_rate": 4.895630597660518e-06, + "loss": 0.7391, + "step": 2090 + }, + { + "epoch": 0.5792243767313019, + "grad_norm": 0.8472511768341064, + "learning_rate": 4.895526414371505e-06, + "loss": 0.6462, + "step": 2091 + }, + { + "epoch": 0.5795013850415512, + "grad_norm": 0.6843624114990234, + "learning_rate": 4.895422180219348e-06, + "loss": 0.6971, + "step": 2092 + }, + { + "epoch": 0.5797783933518006, + "grad_norm": 0.6715996861457825, + "learning_rate": 4.895317895206261e-06, + "loss": 0.6273, + "step": 2093 + }, + { + "epoch": 0.5800554016620498, + "grad_norm": 0.6269164085388184, + "learning_rate": 4.895213559334458e-06, + "loss": 0.7139, + "step": 2094 + }, + { + "epoch": 0.5803324099722992, + "grad_norm": 0.7106332182884216, + "learning_rate": 4.895109172606154e-06, + "loss": 0.7015, + "step": 2095 + }, + { + "epoch": 0.5806094182825485, + "grad_norm": 0.6532184481620789, + "learning_rate": 4.895004735023567e-06, + "loss": 0.6837, + "step": 2096 + }, + { + "epoch": 0.5808864265927978, + "grad_norm": 0.6516802310943604, + "learning_rate": 4.894900246588912e-06, + "loss": 0.6981, + "step": 2097 + }, + { + "epoch": 0.5811634349030471, + "grad_norm": 0.6725658774375916, + "learning_rate": 4.894795707304409e-06, + "loss": 0.6277, + "step": 2098 + }, + { + "epoch": 0.5814404432132964, + "grad_norm": 0.6376446485519409, + "learning_rate": 4.894691117172279e-06, + "loss": 0.6467, + "step": 2099 + }, + { + "epoch": 0.5817174515235457, + "grad_norm": 0.663567304611206, + "learning_rate": 4.894586476194739e-06, + "loss": 0.7279, + "step": 2100 + }, + { + "epoch": 0.581994459833795, + "grad_norm": 0.6665627360343933, + "learning_rate": 4.8944817843740145e-06, + "loss": 0.6799, + "step": 2101 + }, + { + "epoch": 0.5822714681440443, + "grad_norm": 0.6752210855484009, + "learning_rate": 4.894377041712327e-06, + "loss": 0.6438, + "step": 2102 + }, + { + "epoch": 0.5825484764542936, + "grad_norm": 0.6799635887145996, + "learning_rate": 4.8942722482119e-06, + "loss": 0.7247, + "step": 2103 + }, + { + "epoch": 0.582825484764543, + "grad_norm": 0.6850577592849731, + "learning_rate": 4.894167403874958e-06, + "loss": 0.6415, + "step": 2104 + }, + { + "epoch": 0.5831024930747922, + "grad_norm": 0.684273362159729, + "learning_rate": 4.894062508703729e-06, + "loss": 0.702, + "step": 2105 + }, + { + "epoch": 0.5833795013850416, + "grad_norm": 0.6766973733901978, + "learning_rate": 4.893957562700439e-06, + "loss": 0.7174, + "step": 2106 + }, + { + "epoch": 0.5836565096952908, + "grad_norm": 0.7171596884727478, + "learning_rate": 4.8938525658673165e-06, + "loss": 0.6696, + "step": 2107 + }, + { + "epoch": 0.5839335180055402, + "grad_norm": 0.6781456470489502, + "learning_rate": 4.8937475182065905e-06, + "loss": 0.731, + "step": 2108 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 0.6551393270492554, + "learning_rate": 4.893642419720491e-06, + "loss": 0.6546, + "step": 2109 + }, + { + "epoch": 0.5844875346260388, + "grad_norm": 0.6730072498321533, + "learning_rate": 4.893537270411252e-06, + "loss": 0.6606, + "step": 2110 + }, + { + "epoch": 0.5847645429362881, + "grad_norm": 0.7042357921600342, + "learning_rate": 4.893432070281102e-06, + "loss": 0.7056, + "step": 2111 + }, + { + "epoch": 0.5850415512465375, + "grad_norm": 0.6876088380813599, + "learning_rate": 4.893326819332279e-06, + "loss": 0.73, + "step": 2112 + }, + { + "epoch": 0.5853185595567867, + "grad_norm": 0.700372576713562, + "learning_rate": 4.893221517567015e-06, + "loss": 0.6916, + "step": 2113 + }, + { + "epoch": 0.585595567867036, + "grad_norm": 0.6858752965927124, + "learning_rate": 4.893116164987547e-06, + "loss": 0.6089, + "step": 2114 + }, + { + "epoch": 0.5858725761772853, + "grad_norm": 0.6903958916664124, + "learning_rate": 4.893010761596111e-06, + "loss": 0.6903, + "step": 2115 + }, + { + "epoch": 0.5861495844875346, + "grad_norm": 0.7026486396789551, + "learning_rate": 4.892905307394945e-06, + "loss": 0.7043, + "step": 2116 + }, + { + "epoch": 0.586426592797784, + "grad_norm": 0.6673232913017273, + "learning_rate": 4.89279980238629e-06, + "loss": 0.6843, + "step": 2117 + }, + { + "epoch": 0.5867036011080332, + "grad_norm": 0.6887499094009399, + "learning_rate": 4.892694246572384e-06, + "loss": 0.6362, + "step": 2118 + }, + { + "epoch": 0.5869806094182826, + "grad_norm": 0.6565698385238647, + "learning_rate": 4.892588639955468e-06, + "loss": 0.6743, + "step": 2119 + }, + { + "epoch": 0.5872576177285319, + "grad_norm": 0.6758242249488831, + "learning_rate": 4.892482982537786e-06, + "loss": 0.7125, + "step": 2120 + }, + { + "epoch": 0.5875346260387811, + "grad_norm": 0.7436144948005676, + "learning_rate": 4.892377274321581e-06, + "loss": 0.7179, + "step": 2121 + }, + { + "epoch": 0.5878116343490305, + "grad_norm": 0.6895393133163452, + "learning_rate": 4.892271515309096e-06, + "loss": 0.7159, + "step": 2122 + }, + { + "epoch": 0.5880886426592797, + "grad_norm": 0.6338022947311401, + "learning_rate": 4.892165705502578e-06, + "loss": 0.6417, + "step": 2123 + }, + { + "epoch": 0.5883656509695291, + "grad_norm": 0.6784072518348694, + "learning_rate": 4.892059844904273e-06, + "loss": 0.6617, + "step": 2124 + }, + { + "epoch": 0.5886426592797784, + "grad_norm": 0.6545912623405457, + "learning_rate": 4.891953933516429e-06, + "loss": 0.6912, + "step": 2125 + }, + { + "epoch": 0.5889196675900277, + "grad_norm": 0.6839507222175598, + "learning_rate": 4.891847971341293e-06, + "loss": 0.6548, + "step": 2126 + }, + { + "epoch": 0.589196675900277, + "grad_norm": 0.690337061882019, + "learning_rate": 4.891741958381119e-06, + "loss": 0.7042, + "step": 2127 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.6950432062149048, + "learning_rate": 4.891635894638154e-06, + "loss": 0.6529, + "step": 2128 + }, + { + "epoch": 0.5897506925207756, + "grad_norm": 0.6476900577545166, + "learning_rate": 4.8915297801146515e-06, + "loss": 0.6045, + "step": 2129 + }, + { + "epoch": 0.590027700831025, + "grad_norm": 0.7296699285507202, + "learning_rate": 4.891423614812864e-06, + "loss": 0.7426, + "step": 2130 + }, + { + "epoch": 0.5903047091412742, + "grad_norm": 0.6767834424972534, + "learning_rate": 4.8913173987350474e-06, + "loss": 0.6965, + "step": 2131 + }, + { + "epoch": 0.5905817174515235, + "grad_norm": 0.7208825349807739, + "learning_rate": 4.891211131883455e-06, + "loss": 0.7025, + "step": 2132 + }, + { + "epoch": 0.5908587257617729, + "grad_norm": 0.6839019656181335, + "learning_rate": 4.891104814260344e-06, + "loss": 0.6779, + "step": 2133 + }, + { + "epoch": 0.5911357340720221, + "grad_norm": 0.7386277318000793, + "learning_rate": 4.890998445867971e-06, + "loss": 0.7333, + "step": 2134 + }, + { + "epoch": 0.5914127423822715, + "grad_norm": 0.7242138385772705, + "learning_rate": 4.890892026708596e-06, + "loss": 0.693, + "step": 2135 + }, + { + "epoch": 0.5916897506925207, + "grad_norm": 0.7165040969848633, + "learning_rate": 4.8907855567844765e-06, + "loss": 0.7229, + "step": 2136 + }, + { + "epoch": 0.5919667590027701, + "grad_norm": 0.6969289183616638, + "learning_rate": 4.890679036097875e-06, + "loss": 0.7051, + "step": 2137 + }, + { + "epoch": 0.5922437673130194, + "grad_norm": 0.6765584349632263, + "learning_rate": 4.890572464651052e-06, + "loss": 0.6803, + "step": 2138 + }, + { + "epoch": 0.5925207756232687, + "grad_norm": 0.6918100714683533, + "learning_rate": 4.890465842446272e-06, + "loss": 0.6563, + "step": 2139 + }, + { + "epoch": 0.592797783933518, + "grad_norm": 0.6466085314750671, + "learning_rate": 4.890359169485796e-06, + "loss": 0.686, + "step": 2140 + }, + { + "epoch": 0.5930747922437674, + "grad_norm": 0.6893829107284546, + "learning_rate": 4.89025244577189e-06, + "loss": 0.7093, + "step": 2141 + }, + { + "epoch": 0.5933518005540166, + "grad_norm": 0.7149084210395813, + "learning_rate": 4.890145671306822e-06, + "loss": 0.7221, + "step": 2142 + }, + { + "epoch": 0.5936288088642659, + "grad_norm": 0.6596125960350037, + "learning_rate": 4.890038846092857e-06, + "loss": 0.677, + "step": 2143 + }, + { + "epoch": 0.5939058171745152, + "grad_norm": 0.6518341302871704, + "learning_rate": 4.889931970132264e-06, + "loss": 0.6836, + "step": 2144 + }, + { + "epoch": 0.5941828254847645, + "grad_norm": 0.6978798508644104, + "learning_rate": 4.8898250434273125e-06, + "loss": 0.6593, + "step": 2145 + }, + { + "epoch": 0.5944598337950139, + "grad_norm": 0.7356164455413818, + "learning_rate": 4.889718065980272e-06, + "loss": 0.7028, + "step": 2146 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 0.670665979385376, + "learning_rate": 4.889611037793414e-06, + "loss": 0.7081, + "step": 2147 + }, + { + "epoch": 0.5950138504155125, + "grad_norm": 0.6960236430168152, + "learning_rate": 4.8895039588690114e-06, + "loss": 0.7024, + "step": 2148 + }, + { + "epoch": 0.5952908587257618, + "grad_norm": 0.6881184577941895, + "learning_rate": 4.889396829209338e-06, + "loss": 0.7227, + "step": 2149 + }, + { + "epoch": 0.5955678670360111, + "grad_norm": 0.6917644739151001, + "learning_rate": 4.889289648816667e-06, + "loss": 0.6668, + "step": 2150 + }, + { + "epoch": 0.5958448753462604, + "grad_norm": 0.6641549468040466, + "learning_rate": 4.889182417693277e-06, + "loss": 0.7113, + "step": 2151 + }, + { + "epoch": 0.5961218836565096, + "grad_norm": 0.7198570966720581, + "learning_rate": 4.889075135841441e-06, + "loss": 0.7505, + "step": 2152 + }, + { + "epoch": 0.596398891966759, + "grad_norm": 0.6820452213287354, + "learning_rate": 4.88896780326344e-06, + "loss": 0.6594, + "step": 2153 + }, + { + "epoch": 0.5966759002770083, + "grad_norm": 0.7384587526321411, + "learning_rate": 4.888860419961552e-06, + "loss": 0.7326, + "step": 2154 + }, + { + "epoch": 0.5969529085872576, + "grad_norm": 0.6759530901908875, + "learning_rate": 4.888752985938056e-06, + "loss": 0.6678, + "step": 2155 + }, + { + "epoch": 0.5972299168975069, + "grad_norm": 0.6574460864067078, + "learning_rate": 4.888645501195234e-06, + "loss": 0.6978, + "step": 2156 + }, + { + "epoch": 0.5975069252077563, + "grad_norm": 0.6380599737167358, + "learning_rate": 4.888537965735368e-06, + "loss": 0.6162, + "step": 2157 + }, + { + "epoch": 0.5977839335180055, + "grad_norm": 0.6682248711585999, + "learning_rate": 4.8884303795607424e-06, + "loss": 0.7114, + "step": 2158 + }, + { + "epoch": 0.5980609418282549, + "grad_norm": 0.7036237120628357, + "learning_rate": 4.888322742673639e-06, + "loss": 0.745, + "step": 2159 + }, + { + "epoch": 0.5983379501385041, + "grad_norm": 0.6678205132484436, + "learning_rate": 4.888215055076346e-06, + "loss": 0.7086, + "step": 2160 + }, + { + "epoch": 0.5986149584487535, + "grad_norm": 0.6644747853279114, + "learning_rate": 4.888107316771148e-06, + "loss": 0.6699, + "step": 2161 + }, + { + "epoch": 0.5988919667590028, + "grad_norm": 0.6806349158287048, + "learning_rate": 4.8879995277603325e-06, + "loss": 0.7001, + "step": 2162 + }, + { + "epoch": 0.599168975069252, + "grad_norm": 0.6920258402824402, + "learning_rate": 4.887891688046189e-06, + "loss": 0.6539, + "step": 2163 + }, + { + "epoch": 0.5994459833795014, + "grad_norm": 0.6764971017837524, + "learning_rate": 4.887783797631007e-06, + "loss": 0.7057, + "step": 2164 + }, + { + "epoch": 0.5997229916897507, + "grad_norm": 0.6854003071784973, + "learning_rate": 4.8876758565170775e-06, + "loss": 0.7155, + "step": 2165 + }, + { + "epoch": 0.6, + "grad_norm": 0.6682254076004028, + "learning_rate": 4.887567864706693e-06, + "loss": 0.7131, + "step": 2166 + }, + { + "epoch": 0.6002770083102493, + "grad_norm": 0.7079361081123352, + "learning_rate": 4.887459822202144e-06, + "loss": 0.7172, + "step": 2167 + }, + { + "epoch": 0.6005540166204986, + "grad_norm": 0.6760583519935608, + "learning_rate": 4.8873517290057265e-06, + "loss": 0.6874, + "step": 2168 + }, + { + "epoch": 0.6008310249307479, + "grad_norm": 0.6594741940498352, + "learning_rate": 4.887243585119735e-06, + "loss": 0.7076, + "step": 2169 + }, + { + "epoch": 0.6011080332409973, + "grad_norm": 0.6586843729019165, + "learning_rate": 4.887135390546467e-06, + "loss": 0.7064, + "step": 2170 + }, + { + "epoch": 0.6013850415512465, + "grad_norm": 0.6381613612174988, + "learning_rate": 4.887027145288217e-06, + "loss": 0.6532, + "step": 2171 + }, + { + "epoch": 0.6016620498614959, + "grad_norm": 0.7136830687522888, + "learning_rate": 4.886918849347286e-06, + "loss": 0.698, + "step": 2172 + }, + { + "epoch": 0.6019390581717452, + "grad_norm": 0.6528488993644714, + "learning_rate": 4.886810502725971e-06, + "loss": 0.6945, + "step": 2173 + }, + { + "epoch": 0.6022160664819944, + "grad_norm": 0.6757421493530273, + "learning_rate": 4.886702105426575e-06, + "loss": 0.6753, + "step": 2174 + }, + { + "epoch": 0.6024930747922438, + "grad_norm": 0.7247016429901123, + "learning_rate": 4.886593657451398e-06, + "loss": 0.6999, + "step": 2175 + }, + { + "epoch": 0.602770083102493, + "grad_norm": 0.7493066191673279, + "learning_rate": 4.886485158802744e-06, + "loss": 0.6965, + "step": 2176 + }, + { + "epoch": 0.6030470914127424, + "grad_norm": 0.7063272595405579, + "learning_rate": 4.886376609482914e-06, + "loss": 0.6789, + "step": 2177 + }, + { + "epoch": 0.6033240997229917, + "grad_norm": 0.6630057096481323, + "learning_rate": 4.886268009494215e-06, + "loss": 0.6746, + "step": 2178 + }, + { + "epoch": 0.603601108033241, + "grad_norm": 0.7254530787467957, + "learning_rate": 4.886159358838952e-06, + "loss": 0.7137, + "step": 2179 + }, + { + "epoch": 0.6038781163434903, + "grad_norm": 0.7073795199394226, + "learning_rate": 4.886050657519433e-06, + "loss": 0.6509, + "step": 2180 + }, + { + "epoch": 0.6041551246537397, + "grad_norm": 0.7252312898635864, + "learning_rate": 4.885941905537965e-06, + "loss": 0.6888, + "step": 2181 + }, + { + "epoch": 0.6044321329639889, + "grad_norm": 0.6500841379165649, + "learning_rate": 4.885833102896857e-06, + "loss": 0.7004, + "step": 2182 + }, + { + "epoch": 0.6047091412742382, + "grad_norm": 0.6450377702713013, + "learning_rate": 4.885724249598419e-06, + "loss": 0.6183, + "step": 2183 + }, + { + "epoch": 0.6049861495844875, + "grad_norm": 0.7047394514083862, + "learning_rate": 4.885615345644964e-06, + "loss": 0.6698, + "step": 2184 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.7013279795646667, + "learning_rate": 4.885506391038803e-06, + "loss": 0.6704, + "step": 2185 + }, + { + "epoch": 0.6055401662049862, + "grad_norm": 0.7043224573135376, + "learning_rate": 4.885397385782248e-06, + "loss": 0.6892, + "step": 2186 + }, + { + "epoch": 0.6058171745152354, + "grad_norm": 0.6639013886451721, + "learning_rate": 4.885288329877616e-06, + "loss": 0.6849, + "step": 2187 + }, + { + "epoch": 0.6060941828254848, + "grad_norm": 0.6464067101478577, + "learning_rate": 4.885179223327221e-06, + "loss": 0.691, + "step": 2188 + }, + { + "epoch": 0.6063711911357341, + "grad_norm": 0.6812418103218079, + "learning_rate": 4.885070066133379e-06, + "loss": 0.6825, + "step": 2189 + }, + { + "epoch": 0.6066481994459834, + "grad_norm": 0.677861750125885, + "learning_rate": 4.88496085829841e-06, + "loss": 0.6562, + "step": 2190 + }, + { + "epoch": 0.6069252077562327, + "grad_norm": 0.6878042221069336, + "learning_rate": 4.884851599824631e-06, + "loss": 0.6873, + "step": 2191 + }, + { + "epoch": 0.607202216066482, + "grad_norm": 0.6419922113418579, + "learning_rate": 4.884742290714363e-06, + "loss": 0.6314, + "step": 2192 + }, + { + "epoch": 0.6074792243767313, + "grad_norm": 0.669269323348999, + "learning_rate": 4.884632930969925e-06, + "loss": 0.6585, + "step": 2193 + }, + { + "epoch": 0.6077562326869806, + "grad_norm": 0.7030165791511536, + "learning_rate": 4.8845235205936415e-06, + "loss": 0.7232, + "step": 2194 + }, + { + "epoch": 0.6080332409972299, + "grad_norm": 0.7237428426742554, + "learning_rate": 4.884414059587833e-06, + "loss": 0.6807, + "step": 2195 + }, + { + "epoch": 0.6083102493074792, + "grad_norm": 0.6722372770309448, + "learning_rate": 4.884304547954826e-06, + "loss": 0.6833, + "step": 2196 + }, + { + "epoch": 0.6085872576177286, + "grad_norm": 0.6264526844024658, + "learning_rate": 4.884194985696944e-06, + "loss": 0.6354, + "step": 2197 + }, + { + "epoch": 0.6088642659279778, + "grad_norm": 0.6962215900421143, + "learning_rate": 4.884085372816514e-06, + "loss": 0.6909, + "step": 2198 + }, + { + "epoch": 0.6091412742382272, + "grad_norm": 0.7049713134765625, + "learning_rate": 4.883975709315862e-06, + "loss": 0.6686, + "step": 2199 + }, + { + "epoch": 0.6094182825484764, + "grad_norm": 0.6710338592529297, + "learning_rate": 4.883865995197319e-06, + "loss": 0.7199, + "step": 2200 + }, + { + "epoch": 0.6096952908587258, + "grad_norm": 0.6907806396484375, + "learning_rate": 4.883756230463214e-06, + "loss": 0.7232, + "step": 2201 + }, + { + "epoch": 0.6099722991689751, + "grad_norm": 0.6837185025215149, + "learning_rate": 4.8836464151158754e-06, + "loss": 0.6801, + "step": 2202 + }, + { + "epoch": 0.6102493074792243, + "grad_norm": 0.6782760620117188, + "learning_rate": 4.883536549157637e-06, + "loss": 0.652, + "step": 2203 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 0.6475855708122253, + "learning_rate": 4.88342663259083e-06, + "loss": 0.6828, + "step": 2204 + }, + { + "epoch": 0.610803324099723, + "grad_norm": 0.6402779817581177, + "learning_rate": 4.88331666541779e-06, + "loss": 0.6555, + "step": 2205 + }, + { + "epoch": 0.6110803324099723, + "grad_norm": 0.6540744304656982, + "learning_rate": 4.883206647640849e-06, + "loss": 0.6333, + "step": 2206 + }, + { + "epoch": 0.6113573407202216, + "grad_norm": 0.6887601613998413, + "learning_rate": 4.883096579262347e-06, + "loss": 0.7423, + "step": 2207 + }, + { + "epoch": 0.6116343490304709, + "grad_norm": 0.6777442693710327, + "learning_rate": 4.8829864602846176e-06, + "loss": 0.6991, + "step": 2208 + }, + { + "epoch": 0.6119113573407202, + "grad_norm": 0.7278162240982056, + "learning_rate": 4.8828762907100004e-06, + "loss": 0.766, + "step": 2209 + }, + { + "epoch": 0.6121883656509696, + "grad_norm": 0.6916058659553528, + "learning_rate": 4.882766070540835e-06, + "loss": 0.662, + "step": 2210 + }, + { + "epoch": 0.6124653739612188, + "grad_norm": 0.6415777802467346, + "learning_rate": 4.88265579977946e-06, + "loss": 0.6727, + "step": 2211 + }, + { + "epoch": 0.6127423822714682, + "grad_norm": 0.6946539878845215, + "learning_rate": 4.882545478428219e-06, + "loss": 0.7491, + "step": 2212 + }, + { + "epoch": 0.6130193905817175, + "grad_norm": 0.7359540462493896, + "learning_rate": 4.882435106489451e-06, + "loss": 0.6904, + "step": 2213 + }, + { + "epoch": 0.6132963988919667, + "grad_norm": 0.6972795724868774, + "learning_rate": 4.882324683965504e-06, + "loss": 0.6665, + "step": 2214 + }, + { + "epoch": 0.6135734072022161, + "grad_norm": 0.683138906955719, + "learning_rate": 4.882214210858719e-06, + "loss": 0.7161, + "step": 2215 + }, + { + "epoch": 0.6138504155124653, + "grad_norm": 0.6765393018722534, + "learning_rate": 4.882103687171443e-06, + "loss": 0.6748, + "step": 2216 + }, + { + "epoch": 0.6141274238227147, + "grad_norm": 0.6710306406021118, + "learning_rate": 4.881993112906022e-06, + "loss": 0.6821, + "step": 2217 + }, + { + "epoch": 0.614404432132964, + "grad_norm": 0.6652889251708984, + "learning_rate": 4.881882488064805e-06, + "loss": 0.6898, + "step": 2218 + }, + { + "epoch": 0.6146814404432133, + "grad_norm": 0.6607372164726257, + "learning_rate": 4.8817718126501395e-06, + "loss": 0.6551, + "step": 2219 + }, + { + "epoch": 0.6149584487534626, + "grad_norm": 0.6769833564758301, + "learning_rate": 4.881661086664377e-06, + "loss": 0.6738, + "step": 2220 + }, + { + "epoch": 0.615235457063712, + "grad_norm": 0.7280565500259399, + "learning_rate": 4.881550310109866e-06, + "loss": 0.6383, + "step": 2221 + }, + { + "epoch": 0.6155124653739612, + "grad_norm": 0.7089138627052307, + "learning_rate": 4.881439482988961e-06, + "loss": 0.6745, + "step": 2222 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 0.6932617425918579, + "learning_rate": 4.881328605304014e-06, + "loss": 0.6704, + "step": 2223 + }, + { + "epoch": 0.6160664819944598, + "grad_norm": 0.6502192616462708, + "learning_rate": 4.881217677057379e-06, + "loss": 0.7143, + "step": 2224 + }, + { + "epoch": 0.6163434903047091, + "grad_norm": 0.7033457159996033, + "learning_rate": 4.881106698251413e-06, + "loss": 0.7383, + "step": 2225 + }, + { + "epoch": 0.6166204986149585, + "grad_norm": 0.704187273979187, + "learning_rate": 4.880995668888471e-06, + "loss": 0.6947, + "step": 2226 + }, + { + "epoch": 0.6168975069252077, + "grad_norm": 0.7004435658454895, + "learning_rate": 4.88088458897091e-06, + "loss": 0.6534, + "step": 2227 + }, + { + "epoch": 0.6171745152354571, + "grad_norm": 0.6873608827590942, + "learning_rate": 4.8807734585010894e-06, + "loss": 0.695, + "step": 2228 + }, + { + "epoch": 0.6174515235457064, + "grad_norm": 0.6766569018363953, + "learning_rate": 4.880662277481367e-06, + "loss": 0.6757, + "step": 2229 + }, + { + "epoch": 0.6177285318559557, + "grad_norm": 0.6786469221115112, + "learning_rate": 4.880551045914107e-06, + "loss": 0.6997, + "step": 2230 + }, + { + "epoch": 0.618005540166205, + "grad_norm": 0.678867757320404, + "learning_rate": 4.880439763801668e-06, + "loss": 0.6518, + "step": 2231 + }, + { + "epoch": 0.6182825484764543, + "grad_norm": 0.7730464339256287, + "learning_rate": 4.880328431146414e-06, + "loss": 0.7593, + "step": 2232 + }, + { + "epoch": 0.6185595567867036, + "grad_norm": 0.6655864715576172, + "learning_rate": 4.880217047950708e-06, + "loss": 0.6665, + "step": 2233 + }, + { + "epoch": 0.618836565096953, + "grad_norm": 0.6689289212226868, + "learning_rate": 4.880105614216917e-06, + "loss": 0.7367, + "step": 2234 + }, + { + "epoch": 0.6191135734072022, + "grad_norm": 0.6539269089698792, + "learning_rate": 4.879994129947404e-06, + "loss": 0.664, + "step": 2235 + }, + { + "epoch": 0.6193905817174515, + "grad_norm": 0.6679507493972778, + "learning_rate": 4.8798825951445395e-06, + "loss": 0.6525, + "step": 2236 + }, + { + "epoch": 0.6196675900277008, + "grad_norm": 0.6500898599624634, + "learning_rate": 4.879771009810689e-06, + "loss": 0.6509, + "step": 2237 + }, + { + "epoch": 0.6199445983379501, + "grad_norm": 0.7124160528182983, + "learning_rate": 4.879659373948223e-06, + "loss": 0.7638, + "step": 2238 + }, + { + "epoch": 0.6202216066481995, + "grad_norm": 0.6627144813537598, + "learning_rate": 4.879547687559511e-06, + "loss": 0.6739, + "step": 2239 + }, + { + "epoch": 0.6204986149584487, + "grad_norm": 0.705913782119751, + "learning_rate": 4.879435950646926e-06, + "loss": 0.6733, + "step": 2240 + }, + { + "epoch": 0.6207756232686981, + "grad_norm": 0.7119175791740417, + "learning_rate": 4.879324163212838e-06, + "loss": 0.6927, + "step": 2241 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 0.739231526851654, + "learning_rate": 4.879212325259623e-06, + "loss": 0.7096, + "step": 2242 + }, + { + "epoch": 0.6213296398891966, + "grad_norm": 0.7063774466514587, + "learning_rate": 4.879100436789653e-06, + "loss": 0.7557, + "step": 2243 + }, + { + "epoch": 0.621606648199446, + "grad_norm": 0.6932379603385925, + "learning_rate": 4.878988497805307e-06, + "loss": 0.6585, + "step": 2244 + }, + { + "epoch": 0.6218836565096952, + "grad_norm": 0.6909125447273254, + "learning_rate": 4.8788765083089586e-06, + "loss": 0.7249, + "step": 2245 + }, + { + "epoch": 0.6221606648199446, + "grad_norm": 0.7003384232521057, + "learning_rate": 4.878764468302988e-06, + "loss": 0.673, + "step": 2246 + }, + { + "epoch": 0.6224376731301939, + "grad_norm": 0.6720861792564392, + "learning_rate": 4.878652377789772e-06, + "loss": 0.6629, + "step": 2247 + }, + { + "epoch": 0.6227146814404432, + "grad_norm": 0.6857220530509949, + "learning_rate": 4.8785402367716926e-06, + "loss": 0.7017, + "step": 2248 + }, + { + "epoch": 0.6229916897506925, + "grad_norm": 0.674379289150238, + "learning_rate": 4.878428045251129e-06, + "loss": 0.65, + "step": 2249 + }, + { + "epoch": 0.6232686980609419, + "grad_norm": 0.6857823729515076, + "learning_rate": 4.878315803230464e-06, + "loss": 0.6992, + "step": 2250 + }, + { + "epoch": 0.6235457063711911, + "grad_norm": 0.6758940815925598, + "learning_rate": 4.8782035107120815e-06, + "loss": 0.6714, + "step": 2251 + }, + { + "epoch": 0.6238227146814405, + "grad_norm": 0.7062450051307678, + "learning_rate": 4.878091167698366e-06, + "loss": 0.7084, + "step": 2252 + }, + { + "epoch": 0.6240997229916897, + "grad_norm": 0.7142696380615234, + "learning_rate": 4.8779787741917005e-06, + "loss": 0.6842, + "step": 2253 + }, + { + "epoch": 0.624376731301939, + "grad_norm": 0.6867758631706238, + "learning_rate": 4.877866330194474e-06, + "loss": 0.7025, + "step": 2254 + }, + { + "epoch": 0.6246537396121884, + "grad_norm": 0.6744982004165649, + "learning_rate": 4.877753835709072e-06, + "loss": 0.6642, + "step": 2255 + }, + { + "epoch": 0.6249307479224376, + "grad_norm": 0.700770914554596, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6839, + "step": 2256 + }, + { + "epoch": 0.625207756232687, + "grad_norm": 0.6821143627166748, + "learning_rate": 4.8775286952833e-06, + "loss": 0.6533, + "step": 2257 + }, + { + "epoch": 0.6254847645429363, + "grad_norm": 0.7303215265274048, + "learning_rate": 4.877416049347711e-06, + "loss": 0.6424, + "step": 2258 + }, + { + "epoch": 0.6257617728531856, + "grad_norm": 0.669938862323761, + "learning_rate": 4.877303352933507e-06, + "loss": 0.6531, + "step": 2259 + }, + { + "epoch": 0.6260387811634349, + "grad_norm": 0.753316342830658, + "learning_rate": 4.8771906060430815e-06, + "loss": 0.6983, + "step": 2260 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 0.7299981713294983, + "learning_rate": 4.877077808678829e-06, + "loss": 0.6826, + "step": 2261 + }, + { + "epoch": 0.6265927977839335, + "grad_norm": 0.6956072449684143, + "learning_rate": 4.876964960843145e-06, + "loss": 0.704, + "step": 2262 + }, + { + "epoch": 0.6268698060941829, + "grad_norm": 0.6745579242706299, + "learning_rate": 4.876852062538424e-06, + "loss": 0.675, + "step": 2263 + }, + { + "epoch": 0.6271468144044321, + "grad_norm": 0.649700403213501, + "learning_rate": 4.876739113767065e-06, + "loss": 0.6664, + "step": 2264 + }, + { + "epoch": 0.6274238227146814, + "grad_norm": 0.711959719657898, + "learning_rate": 4.8766261145314646e-06, + "loss": 0.6927, + "step": 2265 + }, + { + "epoch": 0.6277008310249308, + "grad_norm": 0.6987378597259521, + "learning_rate": 4.876513064834022e-06, + "loss": 0.7043, + "step": 2266 + }, + { + "epoch": 0.62797783933518, + "grad_norm": 0.7144526839256287, + "learning_rate": 4.876399964677138e-06, + "loss": 0.6768, + "step": 2267 + }, + { + "epoch": 0.6282548476454294, + "grad_norm": 0.6852195262908936, + "learning_rate": 4.876286814063215e-06, + "loss": 0.7018, + "step": 2268 + }, + { + "epoch": 0.6285318559556786, + "grad_norm": 0.6548054814338684, + "learning_rate": 4.876173612994653e-06, + "loss": 0.6419, + "step": 2269 + }, + { + "epoch": 0.628808864265928, + "grad_norm": 0.6945639848709106, + "learning_rate": 4.8760603614738585e-06, + "loss": 0.6724, + "step": 2270 + }, + { + "epoch": 0.6290858725761773, + "grad_norm": 0.7267112135887146, + "learning_rate": 4.875947059503234e-06, + "loss": 0.6318, + "step": 2271 + }, + { + "epoch": 0.6293628808864266, + "grad_norm": 0.6593161225318909, + "learning_rate": 4.8758337070851865e-06, + "loss": 0.6544, + "step": 2272 + }, + { + "epoch": 0.6296398891966759, + "grad_norm": 0.6698119044303894, + "learning_rate": 4.875720304222122e-06, + "loss": 0.6867, + "step": 2273 + }, + { + "epoch": 0.6299168975069253, + "grad_norm": 0.7131072878837585, + "learning_rate": 4.875606850916449e-06, + "loss": 0.6929, + "step": 2274 + }, + { + "epoch": 0.6301939058171745, + "grad_norm": 0.6932727098464966, + "learning_rate": 4.8754933471705745e-06, + "loss": 0.6804, + "step": 2275 + }, + { + "epoch": 0.6304709141274238, + "grad_norm": 0.6713318228721619, + "learning_rate": 4.8753797929869115e-06, + "loss": 0.6545, + "step": 2276 + }, + { + "epoch": 0.6307479224376731, + "grad_norm": 0.6698991656303406, + "learning_rate": 4.875266188367868e-06, + "loss": 0.7334, + "step": 2277 + }, + { + "epoch": 0.6310249307479224, + "grad_norm": 0.6613983511924744, + "learning_rate": 4.875152533315859e-06, + "loss": 0.6761, + "step": 2278 + }, + { + "epoch": 0.6313019390581718, + "grad_norm": 0.7016041874885559, + "learning_rate": 4.875038827833295e-06, + "loss": 0.7175, + "step": 2279 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.7024201154708862, + "learning_rate": 4.874925071922591e-06, + "loss": 0.6813, + "step": 2280 + }, + { + "epoch": 0.6318559556786704, + "grad_norm": 0.6844956278800964, + "learning_rate": 4.874811265586165e-06, + "loss": 0.6588, + "step": 2281 + }, + { + "epoch": 0.6321329639889197, + "grad_norm": 0.7185789942741394, + "learning_rate": 4.87469740882643e-06, + "loss": 0.6906, + "step": 2282 + }, + { + "epoch": 0.632409972299169, + "grad_norm": 0.6658942699432373, + "learning_rate": 4.874583501645804e-06, + "loss": 0.6628, + "step": 2283 + }, + { + "epoch": 0.6326869806094183, + "grad_norm": 0.7003322839736938, + "learning_rate": 4.874469544046707e-06, + "loss": 0.6811, + "step": 2284 + }, + { + "epoch": 0.6329639889196675, + "grad_norm": 0.6516456007957458, + "learning_rate": 4.874355536031558e-06, + "loss": 0.6617, + "step": 2285 + }, + { + "epoch": 0.6332409972299169, + "grad_norm": 0.6722559928894043, + "learning_rate": 4.874241477602778e-06, + "loss": 0.6499, + "step": 2286 + }, + { + "epoch": 0.6335180055401662, + "grad_norm": 0.6841036677360535, + "learning_rate": 4.874127368762787e-06, + "loss": 0.6857, + "step": 2287 + }, + { + "epoch": 0.6337950138504155, + "grad_norm": 0.7111048102378845, + "learning_rate": 4.8740132095140095e-06, + "loss": 0.647, + "step": 2288 + }, + { + "epoch": 0.6340720221606648, + "grad_norm": 0.7113420963287354, + "learning_rate": 4.873898999858869e-06, + "loss": 0.6205, + "step": 2289 + }, + { + "epoch": 0.6343490304709142, + "grad_norm": 0.709007978439331, + "learning_rate": 4.873784739799791e-06, + "loss": 0.7045, + "step": 2290 + }, + { + "epoch": 0.6346260387811634, + "grad_norm": 0.7362955808639526, + "learning_rate": 4.873670429339201e-06, + "loss": 0.6675, + "step": 2291 + }, + { + "epoch": 0.6349030470914128, + "grad_norm": 0.6905544400215149, + "learning_rate": 4.873556068479525e-06, + "loss": 0.6069, + "step": 2292 + }, + { + "epoch": 0.635180055401662, + "grad_norm": 0.6953214406967163, + "learning_rate": 4.873441657223194e-06, + "loss": 0.6839, + "step": 2293 + }, + { + "epoch": 0.6354570637119114, + "grad_norm": 0.6331837773323059, + "learning_rate": 4.873327195572633e-06, + "loss": 0.6406, + "step": 2294 + }, + { + "epoch": 0.6357340720221607, + "grad_norm": 0.7017216086387634, + "learning_rate": 4.873212683530277e-06, + "loss": 0.6999, + "step": 2295 + }, + { + "epoch": 0.6360110803324099, + "grad_norm": 0.6879724860191345, + "learning_rate": 4.873098121098555e-06, + "loss": 0.711, + "step": 2296 + }, + { + "epoch": 0.6362880886426593, + "grad_norm": 0.6714237332344055, + "learning_rate": 4.872983508279899e-06, + "loss": 0.7054, + "step": 2297 + }, + { + "epoch": 0.6365650969529086, + "grad_norm": 0.7007396817207336, + "learning_rate": 4.872868845076744e-06, + "loss": 0.6493, + "step": 2298 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 0.67763352394104, + "learning_rate": 4.872754131491523e-06, + "loss": 0.6845, + "step": 2299 + }, + { + "epoch": 0.6371191135734072, + "grad_norm": 0.6635951995849609, + "learning_rate": 4.872639367526672e-06, + "loss": 0.7008, + "step": 2300 + }, + { + "epoch": 0.6373961218836565, + "grad_norm": 0.6842290163040161, + "learning_rate": 4.8725245531846286e-06, + "loss": 0.6962, + "step": 2301 + }, + { + "epoch": 0.6376731301939058, + "grad_norm": 0.7155497670173645, + "learning_rate": 4.87240968846783e-06, + "loss": 0.7084, + "step": 2302 + }, + { + "epoch": 0.6379501385041552, + "grad_norm": 0.6613172888755798, + "learning_rate": 4.872294773378716e-06, + "loss": 0.6343, + "step": 2303 + }, + { + "epoch": 0.6382271468144044, + "grad_norm": 0.6752813458442688, + "learning_rate": 4.872179807919726e-06, + "loss": 0.7075, + "step": 2304 + }, + { + "epoch": 0.6385041551246537, + "grad_norm": 0.6734436750411987, + "learning_rate": 4.8720647920932995e-06, + "loss": 0.7254, + "step": 2305 + }, + { + "epoch": 0.6387811634349031, + "grad_norm": 0.6765357255935669, + "learning_rate": 4.871949725901881e-06, + "loss": 0.6971, + "step": 2306 + }, + { + "epoch": 0.6390581717451523, + "grad_norm": 0.6636792421340942, + "learning_rate": 4.871834609347912e-06, + "loss": 0.6661, + "step": 2307 + }, + { + "epoch": 0.6393351800554017, + "grad_norm": 0.7011874914169312, + "learning_rate": 4.871719442433836e-06, + "loss": 0.7426, + "step": 2308 + }, + { + "epoch": 0.6396121883656509, + "grad_norm": 0.6820942759513855, + "learning_rate": 4.871604225162101e-06, + "loss": 0.6943, + "step": 2309 + }, + { + "epoch": 0.6398891966759003, + "grad_norm": 0.6526979804039001, + "learning_rate": 4.871488957535152e-06, + "loss": 0.6081, + "step": 2310 + }, + { + "epoch": 0.6401662049861496, + "grad_norm": 0.6738025546073914, + "learning_rate": 4.871373639555436e-06, + "loss": 0.7388, + "step": 2311 + }, + { + "epoch": 0.6404432132963989, + "grad_norm": 0.7214510440826416, + "learning_rate": 4.871258271225402e-06, + "loss": 0.6754, + "step": 2312 + }, + { + "epoch": 0.6407202216066482, + "grad_norm": 0.6859409213066101, + "learning_rate": 4.871142852547499e-06, + "loss": 0.6421, + "step": 2313 + }, + { + "epoch": 0.6409972299168976, + "grad_norm": 0.6700871586799622, + "learning_rate": 4.871027383524178e-06, + "loss": 0.61, + "step": 2314 + }, + { + "epoch": 0.6412742382271468, + "grad_norm": 0.6625704765319824, + "learning_rate": 4.870911864157891e-06, + "loss": 0.7213, + "step": 2315 + }, + { + "epoch": 0.6415512465373961, + "grad_norm": 0.697500467300415, + "learning_rate": 4.87079629445109e-06, + "loss": 0.7131, + "step": 2316 + }, + { + "epoch": 0.6418282548476454, + "grad_norm": 0.6746952533721924, + "learning_rate": 4.87068067440623e-06, + "loss": 0.6821, + "step": 2317 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 0.6551344990730286, + "learning_rate": 4.870565004025764e-06, + "loss": 0.6542, + "step": 2318 + }, + { + "epoch": 0.6423822714681441, + "grad_norm": 0.6762755513191223, + "learning_rate": 4.87044928331215e-06, + "loss": 0.7175, + "step": 2319 + }, + { + "epoch": 0.6426592797783933, + "grad_norm": 0.664469838142395, + "learning_rate": 4.870333512267846e-06, + "loss": 0.6826, + "step": 2320 + }, + { + "epoch": 0.6429362880886427, + "grad_norm": 0.6875748038291931, + "learning_rate": 4.870217690895306e-06, + "loss": 0.6722, + "step": 2321 + }, + { + "epoch": 0.643213296398892, + "grad_norm": 0.6557368040084839, + "learning_rate": 4.870101819196992e-06, + "loss": 0.689, + "step": 2322 + }, + { + "epoch": 0.6434903047091413, + "grad_norm": 0.6870899796485901, + "learning_rate": 4.8699858971753646e-06, + "loss": 0.6555, + "step": 2323 + }, + { + "epoch": 0.6437673130193906, + "grad_norm": 0.7284519672393799, + "learning_rate": 4.869869924832883e-06, + "loss": 0.7128, + "step": 2324 + }, + { + "epoch": 0.6440443213296398, + "grad_norm": 0.7355677485466003, + "learning_rate": 4.869753902172012e-06, + "loss": 0.744, + "step": 2325 + }, + { + "epoch": 0.6443213296398892, + "grad_norm": 0.6616363525390625, + "learning_rate": 4.869637829195214e-06, + "loss": 0.6767, + "step": 2326 + }, + { + "epoch": 0.6445983379501385, + "grad_norm": 0.6916202306747437, + "learning_rate": 4.869521705904953e-06, + "loss": 0.7144, + "step": 2327 + }, + { + "epoch": 0.6448753462603878, + "grad_norm": 0.7016916275024414, + "learning_rate": 4.869405532303695e-06, + "loss": 0.7178, + "step": 2328 + }, + { + "epoch": 0.6451523545706371, + "grad_norm": 0.7312451601028442, + "learning_rate": 4.869289308393907e-06, + "loss": 0.6602, + "step": 2329 + }, + { + "epoch": 0.6454293628808865, + "grad_norm": 0.6760799884796143, + "learning_rate": 4.869173034178057e-06, + "loss": 0.6401, + "step": 2330 + }, + { + "epoch": 0.6457063711911357, + "grad_norm": 0.6891919374465942, + "learning_rate": 4.869056709658611e-06, + "loss": 0.6833, + "step": 2331 + }, + { + "epoch": 0.6459833795013851, + "grad_norm": 0.694268524646759, + "learning_rate": 4.8689403348380435e-06, + "loss": 0.7102, + "step": 2332 + }, + { + "epoch": 0.6462603878116343, + "grad_norm": 0.6928057074546814, + "learning_rate": 4.868823909718823e-06, + "loss": 0.7166, + "step": 2333 + }, + { + "epoch": 0.6465373961218837, + "grad_norm": 0.6451415419578552, + "learning_rate": 4.86870743430342e-06, + "loss": 0.7169, + "step": 2334 + }, + { + "epoch": 0.646814404432133, + "grad_norm": 0.703238308429718, + "learning_rate": 4.86859090859431e-06, + "loss": 0.6977, + "step": 2335 + }, + { + "epoch": 0.6470914127423822, + "grad_norm": 0.6713940501213074, + "learning_rate": 4.868474332593966e-06, + "loss": 0.7237, + "step": 2336 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 0.6840459108352661, + "learning_rate": 4.8683577063048645e-06, + "loss": 0.6935, + "step": 2337 + }, + { + "epoch": 0.6476454293628808, + "grad_norm": 0.7354936003684998, + "learning_rate": 4.86824102972948e-06, + "loss": 0.7613, + "step": 2338 + }, + { + "epoch": 0.6479224376731302, + "grad_norm": 0.705204427242279, + "learning_rate": 4.868124302870291e-06, + "loss": 0.7165, + "step": 2339 + }, + { + "epoch": 0.6481994459833795, + "grad_norm": 0.7116757035255432, + "learning_rate": 4.8680075257297755e-06, + "loss": 0.6692, + "step": 2340 + }, + { + "epoch": 0.6484764542936288, + "grad_norm": 0.6490796804428101, + "learning_rate": 4.8678906983104125e-06, + "loss": 0.6556, + "step": 2341 + }, + { + "epoch": 0.6487534626038781, + "grad_norm": 0.66712886095047, + "learning_rate": 4.867773820614684e-06, + "loss": 0.6004, + "step": 2342 + }, + { + "epoch": 0.6490304709141275, + "grad_norm": 0.6557923555374146, + "learning_rate": 4.86765689264507e-06, + "loss": 0.6802, + "step": 2343 + }, + { + "epoch": 0.6493074792243767, + "grad_norm": 0.6623290777206421, + "learning_rate": 4.8675399144040535e-06, + "loss": 0.6551, + "step": 2344 + }, + { + "epoch": 0.649584487534626, + "grad_norm": 0.6883432865142822, + "learning_rate": 4.867422885894119e-06, + "loss": 0.7095, + "step": 2345 + }, + { + "epoch": 0.6498614958448753, + "grad_norm": 0.7088051438331604, + "learning_rate": 4.867305807117751e-06, + "loss": 0.7272, + "step": 2346 + }, + { + "epoch": 0.6501385041551246, + "grad_norm": 0.6738824248313904, + "learning_rate": 4.8671886780774345e-06, + "loss": 0.6953, + "step": 2347 + }, + { + "epoch": 0.650415512465374, + "grad_norm": 0.7325379848480225, + "learning_rate": 4.867071498775659e-06, + "loss": 0.6544, + "step": 2348 + }, + { + "epoch": 0.6506925207756232, + "grad_norm": 0.6509625911712646, + "learning_rate": 4.8669542692149085e-06, + "loss": 0.6904, + "step": 2349 + }, + { + "epoch": 0.6509695290858726, + "grad_norm": 0.647982120513916, + "learning_rate": 4.866836989397675e-06, + "loss": 0.6693, + "step": 2350 + }, + { + "epoch": 0.6512465373961219, + "grad_norm": 0.6922804713249207, + "learning_rate": 4.866719659326449e-06, + "loss": 0.6671, + "step": 2351 + }, + { + "epoch": 0.6515235457063712, + "grad_norm": 0.6838349103927612, + "learning_rate": 4.86660227900372e-06, + "loss": 0.7061, + "step": 2352 + }, + { + "epoch": 0.6518005540166205, + "grad_norm": 0.7052571773529053, + "learning_rate": 4.866484848431981e-06, + "loss": 0.7264, + "step": 2353 + }, + { + "epoch": 0.6520775623268698, + "grad_norm": 0.6909821033477783, + "learning_rate": 4.866367367613725e-06, + "loss": 0.7305, + "step": 2354 + }, + { + "epoch": 0.6523545706371191, + "grad_norm": 0.6939254403114319, + "learning_rate": 4.866249836551447e-06, + "loss": 0.6665, + "step": 2355 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 0.6795610189437866, + "learning_rate": 4.8661322552476425e-06, + "loss": 0.6743, + "step": 2356 + }, + { + "epoch": 0.6529085872576177, + "grad_norm": 0.6629143357276917, + "learning_rate": 4.8660146237048075e-06, + "loss": 0.6936, + "step": 2357 + }, + { + "epoch": 0.653185595567867, + "grad_norm": 0.7031983733177185, + "learning_rate": 4.8658969419254395e-06, + "loss": 0.6712, + "step": 2358 + }, + { + "epoch": 0.6534626038781164, + "grad_norm": 0.6823177337646484, + "learning_rate": 4.865779209912039e-06, + "loss": 0.6988, + "step": 2359 + }, + { + "epoch": 0.6537396121883656, + "grad_norm": 0.7229796648025513, + "learning_rate": 4.8656614276671035e-06, + "loss": 0.6827, + "step": 2360 + }, + { + "epoch": 0.654016620498615, + "grad_norm": 0.6489887237548828, + "learning_rate": 4.865543595193134e-06, + "loss": 0.6489, + "step": 2361 + }, + { + "epoch": 0.6542936288088642, + "grad_norm": 0.6998574733734131, + "learning_rate": 4.865425712492634e-06, + "loss": 0.718, + "step": 2362 + }, + { + "epoch": 0.6545706371191136, + "grad_norm": 0.6842721700668335, + "learning_rate": 4.865307779568104e-06, + "loss": 0.6843, + "step": 2363 + }, + { + "epoch": 0.6548476454293629, + "grad_norm": 0.6763157248497009, + "learning_rate": 4.865189796422051e-06, + "loss": 0.6713, + "step": 2364 + }, + { + "epoch": 0.6551246537396122, + "grad_norm": 0.7252478003501892, + "learning_rate": 4.865071763056979e-06, + "loss": 0.6857, + "step": 2365 + }, + { + "epoch": 0.6554016620498615, + "grad_norm": 0.6811568140983582, + "learning_rate": 4.864953679475392e-06, + "loss": 0.6572, + "step": 2366 + }, + { + "epoch": 0.6556786703601108, + "grad_norm": 0.7102620005607605, + "learning_rate": 4.8648355456798e-06, + "loss": 0.6759, + "step": 2367 + }, + { + "epoch": 0.6559556786703601, + "grad_norm": 0.6796259880065918, + "learning_rate": 4.86471736167271e-06, + "loss": 0.6906, + "step": 2368 + }, + { + "epoch": 0.6562326869806094, + "grad_norm": 0.6587184071540833, + "learning_rate": 4.864599127456632e-06, + "loss": 0.6048, + "step": 2369 + }, + { + "epoch": 0.6565096952908587, + "grad_norm": 0.6844584345817566, + "learning_rate": 4.864480843034075e-06, + "loss": 0.7046, + "step": 2370 + }, + { + "epoch": 0.656786703601108, + "grad_norm": 0.721359133720398, + "learning_rate": 4.864362508407553e-06, + "loss": 0.6874, + "step": 2371 + }, + { + "epoch": 0.6570637119113574, + "grad_norm": 0.6863132119178772, + "learning_rate": 4.8642441235795755e-06, + "loss": 0.7056, + "step": 2372 + }, + { + "epoch": 0.6573407202216066, + "grad_norm": 0.6410539746284485, + "learning_rate": 4.864125688552659e-06, + "loss": 0.6306, + "step": 2373 + }, + { + "epoch": 0.657617728531856, + "grad_norm": 0.6737743616104126, + "learning_rate": 4.864007203329316e-06, + "loss": 0.6999, + "step": 2374 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.699963390827179, + "learning_rate": 4.863888667912062e-06, + "loss": 0.6792, + "step": 2375 + }, + { + "epoch": 0.6581717451523545, + "grad_norm": 0.7324498891830444, + "learning_rate": 4.863770082303417e-06, + "loss": 0.6925, + "step": 2376 + }, + { + "epoch": 0.6584487534626039, + "grad_norm": 0.6902961730957031, + "learning_rate": 4.8636514465058955e-06, + "loss": 0.6798, + "step": 2377 + }, + { + "epoch": 0.6587257617728531, + "grad_norm": 0.6362594962120056, + "learning_rate": 4.8635327605220174e-06, + "loss": 0.6752, + "step": 2378 + }, + { + "epoch": 0.6590027700831025, + "grad_norm": 0.6623821258544922, + "learning_rate": 4.863414024354304e-06, + "loss": 0.6557, + "step": 2379 + }, + { + "epoch": 0.6592797783933518, + "grad_norm": 0.6951994299888611, + "learning_rate": 4.863295238005275e-06, + "loss": 0.7541, + "step": 2380 + }, + { + "epoch": 0.6595567867036011, + "grad_norm": 0.714439332485199, + "learning_rate": 4.863176401477453e-06, + "loss": 0.7237, + "step": 2381 + }, + { + "epoch": 0.6598337950138504, + "grad_norm": 0.6886929869651794, + "learning_rate": 4.8630575147733605e-06, + "loss": 0.6761, + "step": 2382 + }, + { + "epoch": 0.6601108033240998, + "grad_norm": 0.6800076365470886, + "learning_rate": 4.862938577895523e-06, + "loss": 0.6398, + "step": 2383 + }, + { + "epoch": 0.660387811634349, + "grad_norm": 0.6788169145584106, + "learning_rate": 4.862819590846465e-06, + "loss": 0.682, + "step": 2384 + }, + { + "epoch": 0.6606648199445984, + "grad_norm": 0.6839731335639954, + "learning_rate": 4.862700553628713e-06, + "loss": 0.6691, + "step": 2385 + }, + { + "epoch": 0.6609418282548476, + "grad_norm": 0.7404640913009644, + "learning_rate": 4.862581466244794e-06, + "loss": 0.6836, + "step": 2386 + }, + { + "epoch": 0.661218836565097, + "grad_norm": 0.6789003610610962, + "learning_rate": 4.862462328697239e-06, + "loss": 0.6926, + "step": 2387 + }, + { + "epoch": 0.6614958448753463, + "grad_norm": 0.6176121830940247, + "learning_rate": 4.862343140988573e-06, + "loss": 0.6704, + "step": 2388 + }, + { + "epoch": 0.6617728531855955, + "grad_norm": 0.699988067150116, + "learning_rate": 4.862223903121331e-06, + "loss": 0.6668, + "step": 2389 + }, + { + "epoch": 0.6620498614958449, + "grad_norm": 0.7106815576553345, + "learning_rate": 4.862104615098043e-06, + "loss": 0.7187, + "step": 2390 + }, + { + "epoch": 0.6623268698060942, + "grad_norm": 0.758069634437561, + "learning_rate": 4.861985276921242e-06, + "loss": 0.6749, + "step": 2391 + }, + { + "epoch": 0.6626038781163435, + "grad_norm": 0.6663122177124023, + "learning_rate": 4.861865888593461e-06, + "loss": 0.691, + "step": 2392 + }, + { + "epoch": 0.6628808864265928, + "grad_norm": 0.697738766670227, + "learning_rate": 4.861746450117236e-06, + "loss": 0.7034, + "step": 2393 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 0.6918672323226929, + "learning_rate": 4.861626961495103e-06, + "loss": 0.694, + "step": 2394 + }, + { + "epoch": 0.6634349030470914, + "grad_norm": 0.6631799340248108, + "learning_rate": 4.861507422729599e-06, + "loss": 0.7189, + "step": 2395 + }, + { + "epoch": 0.6637119113573408, + "grad_norm": 0.6965389847755432, + "learning_rate": 4.86138783382326e-06, + "loss": 0.6919, + "step": 2396 + }, + { + "epoch": 0.66398891966759, + "grad_norm": 0.6833652257919312, + "learning_rate": 4.861268194778629e-06, + "loss": 0.6663, + "step": 2397 + }, + { + "epoch": 0.6642659279778393, + "grad_norm": 0.6862128973007202, + "learning_rate": 4.8611485055982424e-06, + "loss": 0.686, + "step": 2398 + }, + { + "epoch": 0.6645429362880887, + "grad_norm": 0.6753442883491516, + "learning_rate": 4.861028766284645e-06, + "loss": 0.6683, + "step": 2399 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 0.7209076285362244, + "learning_rate": 4.860908976840376e-06, + "loss": 0.6523, + "step": 2400 + }, + { + "epoch": 0.6650969529085873, + "grad_norm": 0.6486983895301819, + "learning_rate": 4.860789137267982e-06, + "loss": 0.6799, + "step": 2401 + }, + { + "epoch": 0.6653739612188365, + "grad_norm": 0.6575525999069214, + "learning_rate": 4.860669247570004e-06, + "loss": 0.695, + "step": 2402 + }, + { + "epoch": 0.6656509695290859, + "grad_norm": 0.7097014784812927, + "learning_rate": 4.86054930774899e-06, + "loss": 0.6985, + "step": 2403 + }, + { + "epoch": 0.6659279778393352, + "grad_norm": 0.7121009826660156, + "learning_rate": 4.860429317807487e-06, + "loss": 0.6775, + "step": 2404 + }, + { + "epoch": 0.6662049861495845, + "grad_norm": 0.6753017902374268, + "learning_rate": 4.860309277748041e-06, + "loss": 0.6775, + "step": 2405 + }, + { + "epoch": 0.6664819944598338, + "grad_norm": 0.7212374806404114, + "learning_rate": 4.860189187573201e-06, + "loss": 0.7011, + "step": 2406 + }, + { + "epoch": 0.6667590027700832, + "grad_norm": 0.6950245499610901, + "learning_rate": 4.860069047285517e-06, + "loss": 0.7065, + "step": 2407 + }, + { + "epoch": 0.6670360110803324, + "grad_norm": 0.6570954918861389, + "learning_rate": 4.8599488568875415e-06, + "loss": 0.6897, + "step": 2408 + }, + { + "epoch": 0.6673130193905817, + "grad_norm": 0.7085040807723999, + "learning_rate": 4.859828616381824e-06, + "loss": 0.7055, + "step": 2409 + }, + { + "epoch": 0.667590027700831, + "grad_norm": 0.6904665231704712, + "learning_rate": 4.859708325770919e-06, + "loss": 0.6502, + "step": 2410 + }, + { + "epoch": 0.6678670360110803, + "grad_norm": 0.6556096076965332, + "learning_rate": 4.8595879850573805e-06, + "loss": 0.7084, + "step": 2411 + }, + { + "epoch": 0.6681440443213297, + "grad_norm": 0.6814101934432983, + "learning_rate": 4.859467594243763e-06, + "loss": 0.6719, + "step": 2412 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 0.6942593455314636, + "learning_rate": 4.859347153332623e-06, + "loss": 0.7164, + "step": 2413 + }, + { + "epoch": 0.6686980609418283, + "grad_norm": 0.7435287237167358, + "learning_rate": 4.859226662326518e-06, + "loss": 0.6917, + "step": 2414 + }, + { + "epoch": 0.6689750692520776, + "grad_norm": 0.6552598476409912, + "learning_rate": 4.859106121228006e-06, + "loss": 0.6754, + "step": 2415 + }, + { + "epoch": 0.6692520775623269, + "grad_norm": 0.748866856098175, + "learning_rate": 4.858985530039647e-06, + "loss": 0.7039, + "step": 2416 + }, + { + "epoch": 0.6695290858725762, + "grad_norm": 0.6530470848083496, + "learning_rate": 4.858864888764001e-06, + "loss": 0.7223, + "step": 2417 + }, + { + "epoch": 0.6698060941828254, + "grad_norm": 0.6766080856323242, + "learning_rate": 4.858744197403629e-06, + "loss": 0.6461, + "step": 2418 + }, + { + "epoch": 0.6700831024930748, + "grad_norm": 0.6829447746276855, + "learning_rate": 4.858623455961095e-06, + "loss": 0.7021, + "step": 2419 + }, + { + "epoch": 0.6703601108033241, + "grad_norm": 0.6978498101234436, + "learning_rate": 4.85850266443896e-06, + "loss": 0.7025, + "step": 2420 + }, + { + "epoch": 0.6706371191135734, + "grad_norm": 0.6810044050216675, + "learning_rate": 4.858381822839792e-06, + "loss": 0.7148, + "step": 2421 + }, + { + "epoch": 0.6709141274238227, + "grad_norm": 0.6638117432594299, + "learning_rate": 4.858260931166155e-06, + "loss": 0.6814, + "step": 2422 + }, + { + "epoch": 0.6711911357340721, + "grad_norm": 0.6710538864135742, + "learning_rate": 4.858139989420616e-06, + "loss": 0.6737, + "step": 2423 + }, + { + "epoch": 0.6714681440443213, + "grad_norm": 0.6491722464561462, + "learning_rate": 4.858018997605742e-06, + "loss": 0.6752, + "step": 2424 + }, + { + "epoch": 0.6717451523545707, + "grad_norm": 0.6852057576179504, + "learning_rate": 4.857897955724105e-06, + "loss": 0.6988, + "step": 2425 + }, + { + "epoch": 0.6720221606648199, + "grad_norm": 0.6496206521987915, + "learning_rate": 4.857776863778272e-06, + "loss": 0.6858, + "step": 2426 + }, + { + "epoch": 0.6722991689750693, + "grad_norm": 0.6961385011672974, + "learning_rate": 4.857655721770814e-06, + "loss": 0.6716, + "step": 2427 + }, + { + "epoch": 0.6725761772853186, + "grad_norm": 0.6473982930183411, + "learning_rate": 4.857534529704305e-06, + "loss": 0.6943, + "step": 2428 + }, + { + "epoch": 0.6728531855955678, + "grad_norm": 0.6823958158493042, + "learning_rate": 4.857413287581319e-06, + "loss": 0.6508, + "step": 2429 + }, + { + "epoch": 0.6731301939058172, + "grad_norm": 0.645851194858551, + "learning_rate": 4.857291995404426e-06, + "loss": 0.647, + "step": 2430 + }, + { + "epoch": 0.6734072022160665, + "grad_norm": 0.6593954563140869, + "learning_rate": 4.857170653176206e-06, + "loss": 0.6857, + "step": 2431 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 0.7132719159126282, + "learning_rate": 4.857049260899233e-06, + "loss": 0.7005, + "step": 2432 + }, + { + "epoch": 0.6739612188365651, + "grad_norm": 0.6565352082252502, + "learning_rate": 4.856927818576084e-06, + "loss": 0.6505, + "step": 2433 + }, + { + "epoch": 0.6742382271468144, + "grad_norm": 0.7270442843437195, + "learning_rate": 4.85680632620934e-06, + "loss": 0.7416, + "step": 2434 + }, + { + "epoch": 0.6745152354570637, + "grad_norm": 0.6352047920227051, + "learning_rate": 4.856684783801579e-06, + "loss": 0.656, + "step": 2435 + }, + { + "epoch": 0.6747922437673131, + "grad_norm": 0.6524407863616943, + "learning_rate": 4.8565631913553805e-06, + "loss": 0.6412, + "step": 2436 + }, + { + "epoch": 0.6750692520775623, + "grad_norm": 0.6914234161376953, + "learning_rate": 4.8564415488733275e-06, + "loss": 0.7128, + "step": 2437 + }, + { + "epoch": 0.6753462603878116, + "grad_norm": 0.6721922755241394, + "learning_rate": 4.856319856358004e-06, + "loss": 0.6491, + "step": 2438 + }, + { + "epoch": 0.675623268698061, + "grad_norm": 0.6697131395339966, + "learning_rate": 4.856198113811992e-06, + "loss": 0.7048, + "step": 2439 + }, + { + "epoch": 0.6759002770083102, + "grad_norm": 0.6895703077316284, + "learning_rate": 4.856076321237876e-06, + "loss": 0.6941, + "step": 2440 + }, + { + "epoch": 0.6761772853185596, + "grad_norm": 0.7259820103645325, + "learning_rate": 4.8559544786382436e-06, + "loss": 0.6767, + "step": 2441 + }, + { + "epoch": 0.6764542936288088, + "grad_norm": 0.6969472169876099, + "learning_rate": 4.8558325860156815e-06, + "loss": 0.7194, + "step": 2442 + }, + { + "epoch": 0.6767313019390582, + "grad_norm": 0.7093849778175354, + "learning_rate": 4.855710643372777e-06, + "loss": 0.7205, + "step": 2443 + }, + { + "epoch": 0.6770083102493075, + "grad_norm": 0.6748213768005371, + "learning_rate": 4.85558865071212e-06, + "loss": 0.6662, + "step": 2444 + }, + { + "epoch": 0.6772853185595568, + "grad_norm": 0.7034565806388855, + "learning_rate": 4.855466608036301e-06, + "loss": 0.7279, + "step": 2445 + }, + { + "epoch": 0.6775623268698061, + "grad_norm": 0.6898751854896545, + "learning_rate": 4.855344515347909e-06, + "loss": 0.6723, + "step": 2446 + }, + { + "epoch": 0.6778393351800553, + "grad_norm": 0.6853146553039551, + "learning_rate": 4.8552223726495395e-06, + "loss": 0.6965, + "step": 2447 + }, + { + "epoch": 0.6781163434903047, + "grad_norm": 0.6978676319122314, + "learning_rate": 4.855100179943784e-06, + "loss": 0.6788, + "step": 2448 + }, + { + "epoch": 0.678393351800554, + "grad_norm": 0.7059216499328613, + "learning_rate": 4.854977937233238e-06, + "loss": 0.6664, + "step": 2449 + }, + { + "epoch": 0.6786703601108033, + "grad_norm": 0.735381007194519, + "learning_rate": 4.854855644520495e-06, + "loss": 0.675, + "step": 2450 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 0.6741946339607239, + "learning_rate": 4.854733301808154e-06, + "loss": 0.7043, + "step": 2451 + }, + { + "epoch": 0.679224376731302, + "grad_norm": 0.6989836096763611, + "learning_rate": 4.854610909098813e-06, + "loss": 0.6898, + "step": 2452 + }, + { + "epoch": 0.6795013850415512, + "grad_norm": 0.7232776284217834, + "learning_rate": 4.854488466395067e-06, + "loss": 0.7202, + "step": 2453 + }, + { + "epoch": 0.6797783933518006, + "grad_norm": 0.6528100967407227, + "learning_rate": 4.854365973699519e-06, + "loss": 0.6509, + "step": 2454 + }, + { + "epoch": 0.6800554016620498, + "grad_norm": 0.703453540802002, + "learning_rate": 4.8542434310147694e-06, + "loss": 0.7101, + "step": 2455 + }, + { + "epoch": 0.6803324099722992, + "grad_norm": 0.6413300037384033, + "learning_rate": 4.854120838343419e-06, + "loss": 0.6905, + "step": 2456 + }, + { + "epoch": 0.6806094182825485, + "grad_norm": 0.6912885904312134, + "learning_rate": 4.853998195688072e-06, + "loss": 0.6881, + "step": 2457 + }, + { + "epoch": 0.6808864265927977, + "grad_norm": 0.6981949210166931, + "learning_rate": 4.853875503051332e-06, + "loss": 0.674, + "step": 2458 + }, + { + "epoch": 0.6811634349030471, + "grad_norm": 0.6651879549026489, + "learning_rate": 4.853752760435803e-06, + "loss": 0.6696, + "step": 2459 + }, + { + "epoch": 0.6814404432132964, + "grad_norm": 0.6497887969017029, + "learning_rate": 4.853629967844093e-06, + "loss": 0.639, + "step": 2460 + }, + { + "epoch": 0.6817174515235457, + "grad_norm": 0.6909531950950623, + "learning_rate": 4.853507125278808e-06, + "loss": 0.7339, + "step": 2461 + }, + { + "epoch": 0.681994459833795, + "grad_norm": 0.6893798112869263, + "learning_rate": 4.853384232742556e-06, + "loss": 0.677, + "step": 2462 + }, + { + "epoch": 0.6822714681440443, + "grad_norm": 0.6748727560043335, + "learning_rate": 4.853261290237947e-06, + "loss": 0.6947, + "step": 2463 + }, + { + "epoch": 0.6825484764542936, + "grad_norm": 0.6517544984817505, + "learning_rate": 4.853138297767592e-06, + "loss": 0.652, + "step": 2464 + }, + { + "epoch": 0.682825484764543, + "grad_norm": 0.6731075644493103, + "learning_rate": 4.853015255334101e-06, + "loss": 0.7087, + "step": 2465 + }, + { + "epoch": 0.6831024930747922, + "grad_norm": 0.6525775194168091, + "learning_rate": 4.852892162940088e-06, + "loss": 0.6077, + "step": 2466 + }, + { + "epoch": 0.6833795013850416, + "grad_norm": 0.6631868481636047, + "learning_rate": 4.852769020588165e-06, + "loss": 0.7125, + "step": 2467 + }, + { + "epoch": 0.6836565096952909, + "grad_norm": 0.6692011952400208, + "learning_rate": 4.852645828280948e-06, + "loss": 0.6796, + "step": 2468 + }, + { + "epoch": 0.6839335180055401, + "grad_norm": 0.6942884922027588, + "learning_rate": 4.852522586021051e-06, + "loss": 0.6865, + "step": 2469 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.6816681027412415, + "learning_rate": 4.852399293811093e-06, + "loss": 0.6727, + "step": 2470 + }, + { + "epoch": 0.6844875346260387, + "grad_norm": 0.7101230025291443, + "learning_rate": 4.85227595165369e-06, + "loss": 0.7104, + "step": 2471 + }, + { + "epoch": 0.6847645429362881, + "grad_norm": 0.6951206922531128, + "learning_rate": 4.852152559551461e-06, + "loss": 0.6908, + "step": 2472 + }, + { + "epoch": 0.6850415512465374, + "grad_norm": 0.7145783305168152, + "learning_rate": 4.852029117507027e-06, + "loss": 0.6956, + "step": 2473 + }, + { + "epoch": 0.6853185595567867, + "grad_norm": 0.6997228860855103, + "learning_rate": 4.851905625523009e-06, + "loss": 0.6977, + "step": 2474 + }, + { + "epoch": 0.685595567867036, + "grad_norm": 0.6776831150054932, + "learning_rate": 4.851782083602027e-06, + "loss": 0.6397, + "step": 2475 + }, + { + "epoch": 0.6858725761772854, + "grad_norm": 0.6669095754623413, + "learning_rate": 4.851658491746707e-06, + "loss": 0.6706, + "step": 2476 + }, + { + "epoch": 0.6861495844875346, + "grad_norm": 0.7028627991676331, + "learning_rate": 4.851534849959671e-06, + "loss": 0.6527, + "step": 2477 + }, + { + "epoch": 0.686426592797784, + "grad_norm": 0.6905294060707092, + "learning_rate": 4.8514111582435456e-06, + "loss": 0.7172, + "step": 2478 + }, + { + "epoch": 0.6867036011080332, + "grad_norm": 0.6870238184928894, + "learning_rate": 4.851287416600955e-06, + "loss": 0.6673, + "step": 2479 + }, + { + "epoch": 0.6869806094182825, + "grad_norm": 0.6995295286178589, + "learning_rate": 4.851163625034529e-06, + "loss": 0.6624, + "step": 2480 + }, + { + "epoch": 0.6872576177285319, + "grad_norm": 0.6662357449531555, + "learning_rate": 4.851039783546895e-06, + "loss": 0.6174, + "step": 2481 + }, + { + "epoch": 0.6875346260387811, + "grad_norm": 0.620506763458252, + "learning_rate": 4.850915892140683e-06, + "loss": 0.6799, + "step": 2482 + }, + { + "epoch": 0.6878116343490305, + "grad_norm": 0.6330281496047974, + "learning_rate": 4.850791950818522e-06, + "loss": 0.6338, + "step": 2483 + }, + { + "epoch": 0.6880886426592798, + "grad_norm": 0.6967810988426208, + "learning_rate": 4.850667959583045e-06, + "loss": 0.7018, + "step": 2484 + }, + { + "epoch": 0.6883656509695291, + "grad_norm": 0.6551015377044678, + "learning_rate": 4.850543918436885e-06, + "loss": 0.6489, + "step": 2485 + }, + { + "epoch": 0.6886426592797784, + "grad_norm": 0.6983478665351868, + "learning_rate": 4.8504198273826745e-06, + "loss": 0.7017, + "step": 2486 + }, + { + "epoch": 0.6889196675900277, + "grad_norm": 0.6824020743370056, + "learning_rate": 4.850295686423048e-06, + "loss": 0.7076, + "step": 2487 + }, + { + "epoch": 0.689196675900277, + "grad_norm": 0.6972485780715942, + "learning_rate": 4.850171495560643e-06, + "loss": 0.683, + "step": 2488 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 0.684066653251648, + "learning_rate": 4.850047254798095e-06, + "loss": 0.6491, + "step": 2489 + }, + { + "epoch": 0.6897506925207756, + "grad_norm": 0.6561121940612793, + "learning_rate": 4.8499229641380425e-06, + "loss": 0.644, + "step": 2490 + }, + { + "epoch": 0.6900277008310249, + "grad_norm": 0.6939969062805176, + "learning_rate": 4.849798623583124e-06, + "loss": 0.6638, + "step": 2491 + }, + { + "epoch": 0.6903047091412743, + "grad_norm": 0.6910930871963501, + "learning_rate": 4.8496742331359805e-06, + "loss": 0.6759, + "step": 2492 + }, + { + "epoch": 0.6905817174515235, + "grad_norm": 0.7051935791969299, + "learning_rate": 4.849549792799253e-06, + "loss": 0.6873, + "step": 2493 + }, + { + "epoch": 0.6908587257617729, + "grad_norm": 0.678946316242218, + "learning_rate": 4.849425302575583e-06, + "loss": 0.6753, + "step": 2494 + }, + { + "epoch": 0.6911357340720221, + "grad_norm": 0.6918194890022278, + "learning_rate": 4.849300762467613e-06, + "loss": 0.7027, + "step": 2495 + }, + { + "epoch": 0.6914127423822715, + "grad_norm": 0.7290768623352051, + "learning_rate": 4.8491761724779905e-06, + "loss": 0.7097, + "step": 2496 + }, + { + "epoch": 0.6916897506925208, + "grad_norm": 0.6682461500167847, + "learning_rate": 4.849051532609357e-06, + "loss": 0.6562, + "step": 2497 + }, + { + "epoch": 0.69196675900277, + "grad_norm": 0.7157434225082397, + "learning_rate": 4.848926842864361e-06, + "loss": 0.71, + "step": 2498 + }, + { + "epoch": 0.6922437673130194, + "grad_norm": 0.6673200130462646, + "learning_rate": 4.84880210324565e-06, + "loss": 0.6792, + "step": 2499 + }, + { + "epoch": 0.6925207756232687, + "grad_norm": 0.7229270935058594, + "learning_rate": 4.848677313755872e-06, + "loss": 0.6921, + "step": 2500 + }, + { + "epoch": 0.692797783933518, + "grad_norm": 0.6955553889274597, + "learning_rate": 4.848552474397677e-06, + "loss": 0.7078, + "step": 2501 + }, + { + "epoch": 0.6930747922437673, + "grad_norm": 0.6488661170005798, + "learning_rate": 4.848427585173715e-06, + "loss": 0.6502, + "step": 2502 + }, + { + "epoch": 0.6933518005540166, + "grad_norm": 0.6416786313056946, + "learning_rate": 4.848302646086638e-06, + "loss": 0.641, + "step": 2503 + }, + { + "epoch": 0.6936288088642659, + "grad_norm": 0.6498820781707764, + "learning_rate": 4.848177657139099e-06, + "loss": 0.6882, + "step": 2504 + }, + { + "epoch": 0.6939058171745153, + "grad_norm": 0.6757896542549133, + "learning_rate": 4.848052618333751e-06, + "loss": 0.6767, + "step": 2505 + }, + { + "epoch": 0.6941828254847645, + "grad_norm": 0.669786274433136, + "learning_rate": 4.8479275296732514e-06, + "loss": 0.6998, + "step": 2506 + }, + { + "epoch": 0.6944598337950139, + "grad_norm": 0.6754154562950134, + "learning_rate": 4.847802391160253e-06, + "loss": 0.6024, + "step": 2507 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 0.7129898071289062, + "learning_rate": 4.847677202797415e-06, + "loss": 0.6552, + "step": 2508 + }, + { + "epoch": 0.6950138504155124, + "grad_norm": 0.6702117323875427, + "learning_rate": 4.847551964587395e-06, + "loss": 0.6768, + "step": 2509 + }, + { + "epoch": 0.6952908587257618, + "grad_norm": 0.6439821124076843, + "learning_rate": 4.847426676532851e-06, + "loss": 0.6517, + "step": 2510 + }, + { + "epoch": 0.695567867036011, + "grad_norm": 0.6356789469718933, + "learning_rate": 4.847301338636444e-06, + "loss": 0.6824, + "step": 2511 + }, + { + "epoch": 0.6958448753462604, + "grad_norm": 0.7396525144577026, + "learning_rate": 4.847175950900836e-06, + "loss": 0.6814, + "step": 2512 + }, + { + "epoch": 0.6961218836565097, + "grad_norm": 0.7004340887069702, + "learning_rate": 4.847050513328687e-06, + "loss": 0.6993, + "step": 2513 + }, + { + "epoch": 0.696398891966759, + "grad_norm": 0.6686164736747742, + "learning_rate": 4.846925025922663e-06, + "loss": 0.6608, + "step": 2514 + }, + { + "epoch": 0.6966759002770083, + "grad_norm": 0.6834436058998108, + "learning_rate": 4.846799488685427e-06, + "loss": 0.6856, + "step": 2515 + }, + { + "epoch": 0.6969529085872577, + "grad_norm": 0.6966507434844971, + "learning_rate": 4.846673901619645e-06, + "loss": 0.6928, + "step": 2516 + }, + { + "epoch": 0.6972299168975069, + "grad_norm": 0.6435982584953308, + "learning_rate": 4.846548264727984e-06, + "loss": 0.6524, + "step": 2517 + }, + { + "epoch": 0.6975069252077563, + "grad_norm": 0.6690963506698608, + "learning_rate": 4.84642257801311e-06, + "loss": 0.6787, + "step": 2518 + }, + { + "epoch": 0.6977839335180055, + "grad_norm": 0.6870619058609009, + "learning_rate": 4.846296841477692e-06, + "loss": 0.6838, + "step": 2519 + }, + { + "epoch": 0.6980609418282548, + "grad_norm": 0.6921945810317993, + "learning_rate": 4.846171055124401e-06, + "loss": 0.6786, + "step": 2520 + }, + { + "epoch": 0.6983379501385042, + "grad_norm": 0.6810411810874939, + "learning_rate": 4.846045218955907e-06, + "loss": 0.6997, + "step": 2521 + }, + { + "epoch": 0.6986149584487534, + "grad_norm": 0.7256765961647034, + "learning_rate": 4.845919332974882e-06, + "loss": 0.6654, + "step": 2522 + }, + { + "epoch": 0.6988919667590028, + "grad_norm": 0.706616222858429, + "learning_rate": 4.845793397183999e-06, + "loss": 0.6626, + "step": 2523 + }, + { + "epoch": 0.6991689750692521, + "grad_norm": 0.68105548620224, + "learning_rate": 4.845667411585931e-06, + "loss": 0.6739, + "step": 2524 + }, + { + "epoch": 0.6994459833795014, + "grad_norm": 0.6390605568885803, + "learning_rate": 4.845541376183354e-06, + "loss": 0.6434, + "step": 2525 + }, + { + "epoch": 0.6997229916897507, + "grad_norm": 0.6960382461547852, + "learning_rate": 4.8454152909789435e-06, + "loss": 0.6883, + "step": 2526 + }, + { + "epoch": 0.7, + "grad_norm": 0.6883596181869507, + "learning_rate": 4.845289155975378e-06, + "loss": 0.7118, + "step": 2527 + }, + { + "epoch": 0.7002770083102493, + "grad_norm": 0.6748420596122742, + "learning_rate": 4.845162971175333e-06, + "loss": 0.6885, + "step": 2528 + }, + { + "epoch": 0.7005540166204987, + "grad_norm": 0.6539015173912048, + "learning_rate": 4.8450367365814894e-06, + "loss": 0.661, + "step": 2529 + }, + { + "epoch": 0.7008310249307479, + "grad_norm": 0.6839054822921753, + "learning_rate": 4.844910452196527e-06, + "loss": 0.683, + "step": 2530 + }, + { + "epoch": 0.7011080332409972, + "grad_norm": 0.6770256757736206, + "learning_rate": 4.844784118023129e-06, + "loss": 0.6966, + "step": 2531 + }, + { + "epoch": 0.7013850415512466, + "grad_norm": 0.6649383902549744, + "learning_rate": 4.844657734063975e-06, + "loss": 0.7109, + "step": 2532 + }, + { + "epoch": 0.7016620498614958, + "grad_norm": 0.7795406579971313, + "learning_rate": 4.84453130032175e-06, + "loss": 0.7088, + "step": 2533 + }, + { + "epoch": 0.7019390581717452, + "grad_norm": 0.65948486328125, + "learning_rate": 4.8444048167991386e-06, + "loss": 0.6543, + "step": 2534 + }, + { + "epoch": 0.7022160664819944, + "grad_norm": 0.7084441781044006, + "learning_rate": 4.844278283498826e-06, + "loss": 0.6912, + "step": 2535 + }, + { + "epoch": 0.7024930747922438, + "grad_norm": 0.6589716672897339, + "learning_rate": 4.844151700423498e-06, + "loss": 0.6528, + "step": 2536 + }, + { + "epoch": 0.7027700831024931, + "grad_norm": 0.6785886883735657, + "learning_rate": 4.844025067575844e-06, + "loss": 0.6702, + "step": 2537 + }, + { + "epoch": 0.7030470914127424, + "grad_norm": 0.6644973754882812, + "learning_rate": 4.843898384958551e-06, + "loss": 0.6606, + "step": 2538 + }, + { + "epoch": 0.7033240997229917, + "grad_norm": 0.6794224381446838, + "learning_rate": 4.84377165257431e-06, + "loss": 0.6894, + "step": 2539 + }, + { + "epoch": 0.703601108033241, + "grad_norm": 0.6537707448005676, + "learning_rate": 4.843644870425811e-06, + "loss": 0.646, + "step": 2540 + }, + { + "epoch": 0.7038781163434903, + "grad_norm": 0.662074625492096, + "learning_rate": 4.843518038515747e-06, + "loss": 0.6465, + "step": 2541 + }, + { + "epoch": 0.7041551246537396, + "grad_norm": 0.6878114342689514, + "learning_rate": 4.843391156846811e-06, + "loss": 0.6566, + "step": 2542 + }, + { + "epoch": 0.7044321329639889, + "grad_norm": 0.666167676448822, + "learning_rate": 4.8432642254216945e-06, + "loss": 0.6513, + "step": 2543 + }, + { + "epoch": 0.7047091412742382, + "grad_norm": 0.6801060438156128, + "learning_rate": 4.843137244243095e-06, + "loss": 0.6763, + "step": 2544 + }, + { + "epoch": 0.7049861495844876, + "grad_norm": 0.7075893878936768, + "learning_rate": 4.843010213313709e-06, + "loss": 0.7008, + "step": 2545 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 0.6655442714691162, + "learning_rate": 4.842883132636233e-06, + "loss": 0.6849, + "step": 2546 + }, + { + "epoch": 0.7055401662049862, + "grad_norm": 0.6841294765472412, + "learning_rate": 4.842756002213364e-06, + "loss": 0.6742, + "step": 2547 + }, + { + "epoch": 0.7058171745152355, + "grad_norm": 0.6728150248527527, + "learning_rate": 4.842628822047803e-06, + "loss": 0.7086, + "step": 2548 + }, + { + "epoch": 0.7060941828254848, + "grad_norm": 0.7174102663993835, + "learning_rate": 4.8425015921422495e-06, + "loss": 0.7403, + "step": 2549 + }, + { + "epoch": 0.7063711911357341, + "grad_norm": 0.6728398203849792, + "learning_rate": 4.842374312499405e-06, + "loss": 0.6833, + "step": 2550 + }, + { + "epoch": 0.7066481994459833, + "grad_norm": 0.6851624250411987, + "learning_rate": 4.842246983121972e-06, + "loss": 0.6733, + "step": 2551 + }, + { + "epoch": 0.7069252077562327, + "grad_norm": 0.7019652724266052, + "learning_rate": 4.842119604012654e-06, + "loss": 0.618, + "step": 2552 + }, + { + "epoch": 0.707202216066482, + "grad_norm": 0.6876543164253235, + "learning_rate": 4.841992175174156e-06, + "loss": 0.6982, + "step": 2553 + }, + { + "epoch": 0.7074792243767313, + "grad_norm": 0.6728835701942444, + "learning_rate": 4.841864696609183e-06, + "loss": 0.6736, + "step": 2554 + }, + { + "epoch": 0.7077562326869806, + "grad_norm": 0.7015979886054993, + "learning_rate": 4.841737168320443e-06, + "loss": 0.6532, + "step": 2555 + }, + { + "epoch": 0.7080332409972299, + "grad_norm": 0.6520795226097107, + "learning_rate": 4.8416095903106415e-06, + "loss": 0.6563, + "step": 2556 + }, + { + "epoch": 0.7083102493074792, + "grad_norm": 0.6712133884429932, + "learning_rate": 4.841481962582489e-06, + "loss": 0.712, + "step": 2557 + }, + { + "epoch": 0.7085872576177286, + "grad_norm": 0.7220582365989685, + "learning_rate": 4.841354285138695e-06, + "loss": 0.6635, + "step": 2558 + }, + { + "epoch": 0.7088642659279778, + "grad_norm": 0.6810282468795776, + "learning_rate": 4.84122655798197e-06, + "loss": 0.6837, + "step": 2559 + }, + { + "epoch": 0.7091412742382271, + "grad_norm": 0.6945074200630188, + "learning_rate": 4.841098781115027e-06, + "loss": 0.6894, + "step": 2560 + }, + { + "epoch": 0.7094182825484765, + "grad_norm": 0.6712693572044373, + "learning_rate": 4.840970954540578e-06, + "loss": 0.6993, + "step": 2561 + }, + { + "epoch": 0.7096952908587257, + "grad_norm": 0.6899521946907043, + "learning_rate": 4.840843078261338e-06, + "loss": 0.6969, + "step": 2562 + }, + { + "epoch": 0.7099722991689751, + "grad_norm": 0.6725625991821289, + "learning_rate": 4.840715152280021e-06, + "loss": 0.7176, + "step": 2563 + }, + { + "epoch": 0.7102493074792243, + "grad_norm": 0.7018441557884216, + "learning_rate": 4.8405871765993435e-06, + "loss": 0.7118, + "step": 2564 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.6593537330627441, + "learning_rate": 4.840459151222024e-06, + "loss": 0.6727, + "step": 2565 + }, + { + "epoch": 0.710803324099723, + "grad_norm": 0.6770919561386108, + "learning_rate": 4.840331076150778e-06, + "loss": 0.68, + "step": 2566 + }, + { + "epoch": 0.7110803324099723, + "grad_norm": 0.6419566869735718, + "learning_rate": 4.840202951388328e-06, + "loss": 0.6911, + "step": 2567 + }, + { + "epoch": 0.7113573407202216, + "grad_norm": 0.6920489072799683, + "learning_rate": 4.840074776937393e-06, + "loss": 0.6668, + "step": 2568 + }, + { + "epoch": 0.711634349030471, + "grad_norm": 0.6773355603218079, + "learning_rate": 4.839946552800695e-06, + "loss": 0.6783, + "step": 2569 + }, + { + "epoch": 0.7119113573407202, + "grad_norm": 0.6632847189903259, + "learning_rate": 4.839818278980956e-06, + "loss": 0.7495, + "step": 2570 + }, + { + "epoch": 0.7121883656509695, + "grad_norm": 0.697297215461731, + "learning_rate": 4.839689955480899e-06, + "loss": 0.6777, + "step": 2571 + }, + { + "epoch": 0.7124653739612188, + "grad_norm": 0.6623373627662659, + "learning_rate": 4.8395615823032505e-06, + "loss": 0.6826, + "step": 2572 + }, + { + "epoch": 0.7127423822714681, + "grad_norm": 0.6580797433853149, + "learning_rate": 4.839433159450734e-06, + "loss": 0.6558, + "step": 2573 + }, + { + "epoch": 0.7130193905817175, + "grad_norm": 0.6638851761817932, + "learning_rate": 4.839304686926077e-06, + "loss": 0.6596, + "step": 2574 + }, + { + "epoch": 0.7132963988919667, + "grad_norm": 0.7214973568916321, + "learning_rate": 4.839176164732009e-06, + "loss": 0.6816, + "step": 2575 + }, + { + "epoch": 0.7135734072022161, + "grad_norm": 0.6427276730537415, + "learning_rate": 4.839047592871257e-06, + "loss": 0.6325, + "step": 2576 + }, + { + "epoch": 0.7138504155124654, + "grad_norm": 0.7028458714485168, + "learning_rate": 4.83891897134655e-06, + "loss": 0.6696, + "step": 2577 + }, + { + "epoch": 0.7141274238227147, + "grad_norm": 0.6528476476669312, + "learning_rate": 4.838790300160621e-06, + "loss": 0.6547, + "step": 2578 + }, + { + "epoch": 0.714404432132964, + "grad_norm": 0.7301563024520874, + "learning_rate": 4.838661579316202e-06, + "loss": 0.7337, + "step": 2579 + }, + { + "epoch": 0.7146814404432132, + "grad_norm": 0.6242020726203918, + "learning_rate": 4.838532808816025e-06, + "loss": 0.5942, + "step": 2580 + }, + { + "epoch": 0.7149584487534626, + "grad_norm": 0.6679527759552002, + "learning_rate": 4.838403988662824e-06, + "loss": 0.6418, + "step": 2581 + }, + { + "epoch": 0.7152354570637119, + "grad_norm": 0.6938562393188477, + "learning_rate": 4.838275118859335e-06, + "loss": 0.6301, + "step": 2582 + }, + { + "epoch": 0.7155124653739612, + "grad_norm": 0.6652330756187439, + "learning_rate": 4.838146199408294e-06, + "loss": 0.6818, + "step": 2583 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 0.6956022381782532, + "learning_rate": 4.838017230312438e-06, + "loss": 0.6836, + "step": 2584 + }, + { + "epoch": 0.7160664819944599, + "grad_norm": 0.714000403881073, + "learning_rate": 4.8378882115745064e-06, + "loss": 0.7088, + "step": 2585 + }, + { + "epoch": 0.7163434903047091, + "grad_norm": 0.7130530476570129, + "learning_rate": 4.837759143197237e-06, + "loss": 0.7168, + "step": 2586 + }, + { + "epoch": 0.7166204986149585, + "grad_norm": 0.6532400846481323, + "learning_rate": 4.837630025183371e-06, + "loss": 0.6817, + "step": 2587 + }, + { + "epoch": 0.7168975069252077, + "grad_norm": 0.6796839833259583, + "learning_rate": 4.83750085753565e-06, + "loss": 0.7166, + "step": 2588 + }, + { + "epoch": 0.717174515235457, + "grad_norm": 0.6681724786758423, + "learning_rate": 4.837371640256816e-06, + "loss": 0.6278, + "step": 2589 + }, + { + "epoch": 0.7174515235457064, + "grad_norm": 0.6967169642448425, + "learning_rate": 4.837242373349614e-06, + "loss": 0.6693, + "step": 2590 + }, + { + "epoch": 0.7177285318559556, + "grad_norm": 0.6756342053413391, + "learning_rate": 4.837113056816787e-06, + "loss": 0.7269, + "step": 2591 + }, + { + "epoch": 0.718005540166205, + "grad_norm": 0.6517120599746704, + "learning_rate": 4.836983690661082e-06, + "loss": 0.6512, + "step": 2592 + }, + { + "epoch": 0.7182825484764543, + "grad_norm": 0.6642926931381226, + "learning_rate": 4.8368542748852445e-06, + "loss": 0.6615, + "step": 2593 + }, + { + "epoch": 0.7185595567867036, + "grad_norm": 0.7000516653060913, + "learning_rate": 4.836724809492024e-06, + "loss": 0.6891, + "step": 2594 + }, + { + "epoch": 0.7188365650969529, + "grad_norm": 0.6529144644737244, + "learning_rate": 4.836595294484168e-06, + "loss": 0.6853, + "step": 2595 + }, + { + "epoch": 0.7191135734072022, + "grad_norm": 0.7132996320724487, + "learning_rate": 4.836465729864427e-06, + "loss": 0.6745, + "step": 2596 + }, + { + "epoch": 0.7193905817174515, + "grad_norm": 0.6974446177482605, + "learning_rate": 4.836336115635551e-06, + "loss": 0.6257, + "step": 2597 + }, + { + "epoch": 0.7196675900277009, + "grad_norm": 0.6566895842552185, + "learning_rate": 4.836206451800294e-06, + "loss": 0.6664, + "step": 2598 + }, + { + "epoch": 0.7199445983379501, + "grad_norm": 0.7557151913642883, + "learning_rate": 4.836076738361408e-06, + "loss": 0.7274, + "step": 2599 + }, + { + "epoch": 0.7202216066481995, + "grad_norm": 0.6727636456489563, + "learning_rate": 4.835946975321647e-06, + "loss": 0.6644, + "step": 2600 + }, + { + "epoch": 0.7204986149584488, + "grad_norm": 0.6884294748306274, + "learning_rate": 4.835817162683765e-06, + "loss": 0.6728, + "step": 2601 + }, + { + "epoch": 0.720775623268698, + "grad_norm": 0.6941688060760498, + "learning_rate": 4.835687300450522e-06, + "loss": 0.6552, + "step": 2602 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 0.6695191860198975, + "learning_rate": 4.8355573886246715e-06, + "loss": 0.7092, + "step": 2603 + }, + { + "epoch": 0.7213296398891966, + "grad_norm": 0.7561391592025757, + "learning_rate": 4.835427427208973e-06, + "loss": 0.6844, + "step": 2604 + }, + { + "epoch": 0.721606648199446, + "grad_norm": 0.7127019166946411, + "learning_rate": 4.835297416206187e-06, + "loss": 0.6553, + "step": 2605 + }, + { + "epoch": 0.7218836565096953, + "grad_norm": 0.6892712116241455, + "learning_rate": 4.835167355619073e-06, + "loss": 0.7134, + "step": 2606 + }, + { + "epoch": 0.7221606648199446, + "grad_norm": 0.6294607520103455, + "learning_rate": 4.835037245450393e-06, + "loss": 0.6248, + "step": 2607 + }, + { + "epoch": 0.7224376731301939, + "grad_norm": 0.7269555926322937, + "learning_rate": 4.834907085702909e-06, + "loss": 0.6899, + "step": 2608 + }, + { + "epoch": 0.7227146814404433, + "grad_norm": 0.6997284889221191, + "learning_rate": 4.834776876379384e-06, + "loss": 0.6865, + "step": 2609 + }, + { + "epoch": 0.7229916897506925, + "grad_norm": 0.6564635634422302, + "learning_rate": 4.834646617482585e-06, + "loss": 0.6942, + "step": 2610 + }, + { + "epoch": 0.7232686980609419, + "grad_norm": 0.6637692451477051, + "learning_rate": 4.834516309015274e-06, + "loss": 0.6534, + "step": 2611 + }, + { + "epoch": 0.7235457063711911, + "grad_norm": 0.6564186215400696, + "learning_rate": 4.834385950980222e-06, + "loss": 0.6673, + "step": 2612 + }, + { + "epoch": 0.7238227146814404, + "grad_norm": 0.669204592704773, + "learning_rate": 4.834255543380195e-06, + "loss": 0.6218, + "step": 2613 + }, + { + "epoch": 0.7240997229916898, + "grad_norm": 0.6596257090568542, + "learning_rate": 4.834125086217961e-06, + "loss": 0.6568, + "step": 2614 + }, + { + "epoch": 0.724376731301939, + "grad_norm": 0.6701518893241882, + "learning_rate": 4.833994579496291e-06, + "loss": 0.7237, + "step": 2615 + }, + { + "epoch": 0.7246537396121884, + "grad_norm": 0.6752590537071228, + "learning_rate": 4.833864023217955e-06, + "loss": 0.7239, + "step": 2616 + }, + { + "epoch": 0.7249307479224377, + "grad_norm": 0.7187966108322144, + "learning_rate": 4.8337334173857265e-06, + "loss": 0.7146, + "step": 2617 + }, + { + "epoch": 0.725207756232687, + "grad_norm": 0.740940511226654, + "learning_rate": 4.833602762002378e-06, + "loss": 0.735, + "step": 2618 + }, + { + "epoch": 0.7254847645429363, + "grad_norm": 0.7270171642303467, + "learning_rate": 4.8334720570706825e-06, + "loss": 0.6751, + "step": 2619 + }, + { + "epoch": 0.7257617728531855, + "grad_norm": 0.6728403568267822, + "learning_rate": 4.833341302593417e-06, + "loss": 0.6642, + "step": 2620 + }, + { + "epoch": 0.7260387811634349, + "grad_norm": 0.6736679077148438, + "learning_rate": 4.833210498573358e-06, + "loss": 0.7014, + "step": 2621 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 0.6557801365852356, + "learning_rate": 4.83307964501328e-06, + "loss": 0.6956, + "step": 2622 + }, + { + "epoch": 0.7265927977839335, + "grad_norm": 0.6692063212394714, + "learning_rate": 4.832948741915963e-06, + "loss": 0.7072, + "step": 2623 + }, + { + "epoch": 0.7268698060941828, + "grad_norm": 0.6571874022483826, + "learning_rate": 4.832817789284188e-06, + "loss": 0.6842, + "step": 2624 + }, + { + "epoch": 0.7271468144044322, + "grad_norm": 0.6895356178283691, + "learning_rate": 4.832686787120733e-06, + "loss": 0.7092, + "step": 2625 + }, + { + "epoch": 0.7274238227146814, + "grad_norm": 0.6702733635902405, + "learning_rate": 4.83255573542838e-06, + "loss": 0.6036, + "step": 2626 + }, + { + "epoch": 0.7277008310249308, + "grad_norm": 0.7040365934371948, + "learning_rate": 4.832424634209913e-06, + "loss": 0.6881, + "step": 2627 + }, + { + "epoch": 0.72797783933518, + "grad_norm": 0.6673402786254883, + "learning_rate": 4.832293483468115e-06, + "loss": 0.6887, + "step": 2628 + }, + { + "epoch": 0.7282548476454294, + "grad_norm": 0.6921663284301758, + "learning_rate": 4.8321622832057695e-06, + "loss": 0.6798, + "step": 2629 + }, + { + "epoch": 0.7285318559556787, + "grad_norm": 0.6102813482284546, + "learning_rate": 4.832031033425663e-06, + "loss": 0.622, + "step": 2630 + }, + { + "epoch": 0.728808864265928, + "grad_norm": 0.6588747501373291, + "learning_rate": 4.8318997341305815e-06, + "loss": 0.6443, + "step": 2631 + }, + { + "epoch": 0.7290858725761773, + "grad_norm": 0.6669639945030212, + "learning_rate": 4.831768385323315e-06, + "loss": 0.6138, + "step": 2632 + }, + { + "epoch": 0.7293628808864266, + "grad_norm": 0.6380589008331299, + "learning_rate": 4.83163698700665e-06, + "loss": 0.6112, + "step": 2633 + }, + { + "epoch": 0.7296398891966759, + "grad_norm": 0.7220094203948975, + "learning_rate": 4.8315055391833785e-06, + "loss": 0.6781, + "step": 2634 + }, + { + "epoch": 0.7299168975069252, + "grad_norm": 0.6523056030273438, + "learning_rate": 4.8313740418562895e-06, + "loss": 0.656, + "step": 2635 + }, + { + "epoch": 0.7301939058171745, + "grad_norm": 0.679682195186615, + "learning_rate": 4.831242495028177e-06, + "loss": 0.6812, + "step": 2636 + }, + { + "epoch": 0.7304709141274238, + "grad_norm": 0.6778846979141235, + "learning_rate": 4.831110898701832e-06, + "loss": 0.658, + "step": 2637 + }, + { + "epoch": 0.7307479224376732, + "grad_norm": 0.6646358370780945, + "learning_rate": 4.83097925288005e-06, + "loss": 0.7349, + "step": 2638 + }, + { + "epoch": 0.7310249307479224, + "grad_norm": 0.6693096160888672, + "learning_rate": 4.830847557565626e-06, + "loss": 0.686, + "step": 2639 + }, + { + "epoch": 0.7313019390581718, + "grad_norm": 0.6840837597846985, + "learning_rate": 4.830715812761355e-06, + "loss": 0.6764, + "step": 2640 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 0.6790053844451904, + "learning_rate": 4.830584018470036e-06, + "loss": 0.6649, + "step": 2641 + }, + { + "epoch": 0.7318559556786703, + "grad_norm": 0.7182514667510986, + "learning_rate": 4.830452174694466e-06, + "loss": 0.6707, + "step": 2642 + }, + { + "epoch": 0.7321329639889197, + "grad_norm": 0.6911314129829407, + "learning_rate": 4.830320281437446e-06, + "loss": 0.6779, + "step": 2643 + }, + { + "epoch": 0.7324099722991689, + "grad_norm": 0.6859692335128784, + "learning_rate": 4.830188338701776e-06, + "loss": 0.7284, + "step": 2644 + }, + { + "epoch": 0.7326869806094183, + "grad_norm": 0.7071799635887146, + "learning_rate": 4.830056346490255e-06, + "loss": 0.7353, + "step": 2645 + }, + { + "epoch": 0.7329639889196676, + "grad_norm": 0.6324630975723267, + "learning_rate": 4.829924304805689e-06, + "loss": 0.6674, + "step": 2646 + }, + { + "epoch": 0.7332409972299169, + "grad_norm": 0.654015839099884, + "learning_rate": 4.829792213650879e-06, + "loss": 0.6058, + "step": 2647 + }, + { + "epoch": 0.7335180055401662, + "grad_norm": 0.708918035030365, + "learning_rate": 4.829660073028631e-06, + "loss": 0.677, + "step": 2648 + }, + { + "epoch": 0.7337950138504156, + "grad_norm": 0.6400759220123291, + "learning_rate": 4.82952788294175e-06, + "loss": 0.6864, + "step": 2649 + }, + { + "epoch": 0.7340720221606648, + "grad_norm": 0.6708654761314392, + "learning_rate": 4.829395643393045e-06, + "loss": 0.7214, + "step": 2650 + }, + { + "epoch": 0.7343490304709142, + "grad_norm": 0.7093385457992554, + "learning_rate": 4.82926335438532e-06, + "loss": 0.7227, + "step": 2651 + }, + { + "epoch": 0.7346260387811634, + "grad_norm": 0.688284695148468, + "learning_rate": 4.829131015921386e-06, + "loss": 0.6762, + "step": 2652 + }, + { + "epoch": 0.7349030470914127, + "grad_norm": 0.6453286409378052, + "learning_rate": 4.828998628004053e-06, + "loss": 0.6432, + "step": 2653 + }, + { + "epoch": 0.7351800554016621, + "grad_norm": 0.7219470143318176, + "learning_rate": 4.82886619063613e-06, + "loss": 0.6863, + "step": 2654 + }, + { + "epoch": 0.7354570637119113, + "grad_norm": 0.7020809054374695, + "learning_rate": 4.828733703820432e-06, + "loss": 0.6498, + "step": 2655 + }, + { + "epoch": 0.7357340720221607, + "grad_norm": 0.6752793788909912, + "learning_rate": 4.8286011675597705e-06, + "loss": 0.6503, + "step": 2656 + }, + { + "epoch": 0.7360110803324099, + "grad_norm": 0.6839720010757446, + "learning_rate": 4.828468581856959e-06, + "loss": 0.6151, + "step": 2657 + }, + { + "epoch": 0.7362880886426593, + "grad_norm": 0.6902003288269043, + "learning_rate": 4.828335946714814e-06, + "loss": 0.6585, + "step": 2658 + }, + { + "epoch": 0.7365650969529086, + "grad_norm": 0.6736903190612793, + "learning_rate": 4.82820326213615e-06, + "loss": 0.6276, + "step": 2659 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.654117226600647, + "learning_rate": 4.8280705281237865e-06, + "loss": 0.6586, + "step": 2660 + }, + { + "epoch": 0.7371191135734072, + "grad_norm": 0.6765998601913452, + "learning_rate": 4.827937744680539e-06, + "loss": 0.668, + "step": 2661 + }, + { + "epoch": 0.7373961218836566, + "grad_norm": 0.6439967155456543, + "learning_rate": 4.8278049118092285e-06, + "loss": 0.6644, + "step": 2662 + }, + { + "epoch": 0.7376731301939058, + "grad_norm": 0.6422175168991089, + "learning_rate": 4.827672029512676e-06, + "loss": 0.6662, + "step": 2663 + }, + { + "epoch": 0.7379501385041551, + "grad_norm": 0.636263370513916, + "learning_rate": 4.827539097793702e-06, + "loss": 0.6252, + "step": 2664 + }, + { + "epoch": 0.7382271468144044, + "grad_norm": 0.695054829120636, + "learning_rate": 4.827406116655128e-06, + "loss": 0.6781, + "step": 2665 + }, + { + "epoch": 0.7385041551246537, + "grad_norm": 0.6738481521606445, + "learning_rate": 4.82727308609978e-06, + "loss": 0.6835, + "step": 2666 + }, + { + "epoch": 0.7387811634349031, + "grad_norm": 0.656569242477417, + "learning_rate": 4.82714000613048e-06, + "loss": 0.6515, + "step": 2667 + }, + { + "epoch": 0.7390581717451523, + "grad_norm": 0.7025095820426941, + "learning_rate": 4.827006876750056e-06, + "loss": 0.7286, + "step": 2668 + }, + { + "epoch": 0.7393351800554017, + "grad_norm": 0.6530948281288147, + "learning_rate": 4.826873697961332e-06, + "loss": 0.6761, + "step": 2669 + }, + { + "epoch": 0.739612188365651, + "grad_norm": 0.6586973071098328, + "learning_rate": 4.826740469767138e-06, + "loss": 0.6785, + "step": 2670 + }, + { + "epoch": 0.7398891966759003, + "grad_norm": 0.661683976650238, + "learning_rate": 4.826607192170301e-06, + "loss": 0.7015, + "step": 2671 + }, + { + "epoch": 0.7401662049861496, + "grad_norm": 0.6833496689796448, + "learning_rate": 4.8264738651736534e-06, + "loss": 0.6494, + "step": 2672 + }, + { + "epoch": 0.7404432132963988, + "grad_norm": 0.7184470891952515, + "learning_rate": 4.8263404887800235e-06, + "loss": 0.7068, + "step": 2673 + }, + { + "epoch": 0.7407202216066482, + "grad_norm": 0.7301485538482666, + "learning_rate": 4.826207062992245e-06, + "loss": 0.7386, + "step": 2674 + }, + { + "epoch": 0.7409972299168975, + "grad_norm": 0.6598369479179382, + "learning_rate": 4.826073587813149e-06, + "loss": 0.6654, + "step": 2675 + }, + { + "epoch": 0.7412742382271468, + "grad_norm": 0.6928764581680298, + "learning_rate": 4.825940063245572e-06, + "loss": 0.6326, + "step": 2676 + }, + { + "epoch": 0.7415512465373961, + "grad_norm": 0.668339192867279, + "learning_rate": 4.825806489292346e-06, + "loss": 0.643, + "step": 2677 + }, + { + "epoch": 0.7418282548476455, + "grad_norm": 0.6804963946342468, + "learning_rate": 4.82567286595631e-06, + "loss": 0.7222, + "step": 2678 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 0.6823146939277649, + "learning_rate": 4.8255391932403e-06, + "loss": 0.6546, + "step": 2679 + }, + { + "epoch": 0.7423822714681441, + "grad_norm": 0.7106555700302124, + "learning_rate": 4.825405471147153e-06, + "loss": 0.7349, + "step": 2680 + }, + { + "epoch": 0.7426592797783933, + "grad_norm": 0.6523182988166809, + "learning_rate": 4.82527169967971e-06, + "loss": 0.6886, + "step": 2681 + }, + { + "epoch": 0.7429362880886426, + "grad_norm": 0.6475448608398438, + "learning_rate": 4.825137878840811e-06, + "loss": 0.693, + "step": 2682 + }, + { + "epoch": 0.743213296398892, + "grad_norm": 0.7019377946853638, + "learning_rate": 4.825004008633297e-06, + "loss": 0.6831, + "step": 2683 + }, + { + "epoch": 0.7434903047091412, + "grad_norm": 0.6636565327644348, + "learning_rate": 4.82487008906001e-06, + "loss": 0.6647, + "step": 2684 + }, + { + "epoch": 0.7437673130193906, + "grad_norm": 0.6838463544845581, + "learning_rate": 4.824736120123794e-06, + "loss": 0.7015, + "step": 2685 + }, + { + "epoch": 0.7440443213296399, + "grad_norm": 0.6885688900947571, + "learning_rate": 4.824602101827494e-06, + "loss": 0.6308, + "step": 2686 + }, + { + "epoch": 0.7443213296398892, + "grad_norm": 0.7183178663253784, + "learning_rate": 4.824468034173954e-06, + "loss": 0.686, + "step": 2687 + }, + { + "epoch": 0.7445983379501385, + "grad_norm": 0.6711190938949585, + "learning_rate": 4.824333917166023e-06, + "loss": 0.6677, + "step": 2688 + }, + { + "epoch": 0.7448753462603878, + "grad_norm": 0.6922284364700317, + "learning_rate": 4.824199750806547e-06, + "loss": 0.6963, + "step": 2689 + }, + { + "epoch": 0.7451523545706371, + "grad_norm": 0.6533357501029968, + "learning_rate": 4.8240655350983745e-06, + "loss": 0.627, + "step": 2690 + }, + { + "epoch": 0.7454293628808865, + "grad_norm": 0.7120330929756165, + "learning_rate": 4.8239312700443555e-06, + "loss": 0.6529, + "step": 2691 + }, + { + "epoch": 0.7457063711911357, + "grad_norm": 0.6339964866638184, + "learning_rate": 4.823796955647341e-06, + "loss": 0.6094, + "step": 2692 + }, + { + "epoch": 0.745983379501385, + "grad_norm": 0.6632034778594971, + "learning_rate": 4.823662591910183e-06, + "loss": 0.7077, + "step": 2693 + }, + { + "epoch": 0.7462603878116344, + "grad_norm": 0.7076951265335083, + "learning_rate": 4.823528178835734e-06, + "loss": 0.6762, + "step": 2694 + }, + { + "epoch": 0.7465373961218836, + "grad_norm": 0.6844574809074402, + "learning_rate": 4.823393716426849e-06, + "loss": 0.7062, + "step": 2695 + }, + { + "epoch": 0.746814404432133, + "grad_norm": 0.6797798871994019, + "learning_rate": 4.82325920468638e-06, + "loss": 0.6544, + "step": 2696 + }, + { + "epoch": 0.7470914127423822, + "grad_norm": 0.6964753866195679, + "learning_rate": 4.823124643617187e-06, + "loss": 0.7259, + "step": 2697 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 0.6596205830574036, + "learning_rate": 4.822990033222125e-06, + "loss": 0.6765, + "step": 2698 + }, + { + "epoch": 0.7476454293628809, + "grad_norm": 0.7021011710166931, + "learning_rate": 4.822855373504051e-06, + "loss": 0.6504, + "step": 2699 + }, + { + "epoch": 0.7479224376731302, + "grad_norm": 0.6649656295776367, + "learning_rate": 4.8227206644658275e-06, + "loss": 0.7189, + "step": 2700 + }, + { + "epoch": 0.7481994459833795, + "grad_norm": 0.6813759803771973, + "learning_rate": 4.822585906110311e-06, + "loss": 0.6507, + "step": 2701 + }, + { + "epoch": 0.7484764542936289, + "grad_norm": 0.6968197822570801, + "learning_rate": 4.822451098440366e-06, + "loss": 0.6694, + "step": 2702 + }, + { + "epoch": 0.7487534626038781, + "grad_norm": 0.6730479598045349, + "learning_rate": 4.822316241458852e-06, + "loss": 0.6031, + "step": 2703 + }, + { + "epoch": 0.7490304709141274, + "grad_norm": 0.6745450496673584, + "learning_rate": 4.822181335168634e-06, + "loss": 0.6585, + "step": 2704 + }, + { + "epoch": 0.7493074792243767, + "grad_norm": 0.7031593322753906, + "learning_rate": 4.822046379572576e-06, + "loss": 0.7156, + "step": 2705 + }, + { + "epoch": 0.749584487534626, + "grad_norm": 0.6351406574249268, + "learning_rate": 4.821911374673543e-06, + "loss": 0.6556, + "step": 2706 + }, + { + "epoch": 0.7498614958448754, + "grad_norm": 0.7087862491607666, + "learning_rate": 4.8217763204744035e-06, + "loss": 0.7392, + "step": 2707 + }, + { + "epoch": 0.7501385041551246, + "grad_norm": 0.7211248278617859, + "learning_rate": 4.8216412169780225e-06, + "loss": 0.6403, + "step": 2708 + }, + { + "epoch": 0.750415512465374, + "grad_norm": 0.6689242124557495, + "learning_rate": 4.82150606418727e-06, + "loss": 0.6803, + "step": 2709 + }, + { + "epoch": 0.7506925207756233, + "grad_norm": 0.6601364016532898, + "learning_rate": 4.821370862105015e-06, + "loss": 0.6967, + "step": 2710 + }, + { + "epoch": 0.7509695290858726, + "grad_norm": 0.7165829539299011, + "learning_rate": 4.8212356107341284e-06, + "loss": 0.7043, + "step": 2711 + }, + { + "epoch": 0.7512465373961219, + "grad_norm": 0.6883214712142944, + "learning_rate": 4.821100310077482e-06, + "loss": 0.6122, + "step": 2712 + }, + { + "epoch": 0.7515235457063711, + "grad_norm": 0.6225855946540833, + "learning_rate": 4.8209649601379495e-06, + "loss": 0.6616, + "step": 2713 + }, + { + "epoch": 0.7518005540166205, + "grad_norm": 0.6740206480026245, + "learning_rate": 4.820829560918402e-06, + "loss": 0.6976, + "step": 2714 + }, + { + "epoch": 0.7520775623268698, + "grad_norm": 0.7862870693206787, + "learning_rate": 4.8206941124217175e-06, + "loss": 0.6914, + "step": 2715 + }, + { + "epoch": 0.7523545706371191, + "grad_norm": 0.6579274535179138, + "learning_rate": 4.82055861465077e-06, + "loss": 0.6876, + "step": 2716 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 0.6604716181755066, + "learning_rate": 4.820423067608438e-06, + "loss": 0.6536, + "step": 2717 + }, + { + "epoch": 0.7529085872576178, + "grad_norm": 0.6885457634925842, + "learning_rate": 4.820287471297598e-06, + "loss": 0.6206, + "step": 2718 + }, + { + "epoch": 0.753185595567867, + "grad_norm": 0.6242653131484985, + "learning_rate": 4.820151825721129e-06, + "loss": 0.6144, + "step": 2719 + }, + { + "epoch": 0.7534626038781164, + "grad_norm": 0.6885089874267578, + "learning_rate": 4.820016130881912e-06, + "loss": 0.6358, + "step": 2720 + }, + { + "epoch": 0.7537396121883656, + "grad_norm": 0.7248243093490601, + "learning_rate": 4.819880386782829e-06, + "loss": 0.7197, + "step": 2721 + }, + { + "epoch": 0.754016620498615, + "grad_norm": 0.6652242541313171, + "learning_rate": 4.81974459342676e-06, + "loss": 0.6642, + "step": 2722 + }, + { + "epoch": 0.7542936288088643, + "grad_norm": 0.688584566116333, + "learning_rate": 4.819608750816589e-06, + "loss": 0.6648, + "step": 2723 + }, + { + "epoch": 0.7545706371191135, + "grad_norm": 0.6547428965568542, + "learning_rate": 4.8194728589552005e-06, + "loss": 0.6109, + "step": 2724 + }, + { + "epoch": 0.7548476454293629, + "grad_norm": 0.6694716811180115, + "learning_rate": 4.819336917845481e-06, + "loss": 0.665, + "step": 2725 + }, + { + "epoch": 0.7551246537396122, + "grad_norm": 0.6976423263549805, + "learning_rate": 4.819200927490315e-06, + "loss": 0.735, + "step": 2726 + }, + { + "epoch": 0.7554016620498615, + "grad_norm": 0.6669568419456482, + "learning_rate": 4.819064887892592e-06, + "loss": 0.6577, + "step": 2727 + }, + { + "epoch": 0.7556786703601108, + "grad_norm": 0.6943166255950928, + "learning_rate": 4.8189287990551974e-06, + "loss": 0.6681, + "step": 2728 + }, + { + "epoch": 0.7559556786703601, + "grad_norm": 0.6642052531242371, + "learning_rate": 4.818792660981023e-06, + "loss": 0.68, + "step": 2729 + }, + { + "epoch": 0.7562326869806094, + "grad_norm": 0.7273021936416626, + "learning_rate": 4.818656473672959e-06, + "loss": 0.7097, + "step": 2730 + }, + { + "epoch": 0.7565096952908588, + "grad_norm": 0.6770200729370117, + "learning_rate": 4.818520237133897e-06, + "loss": 0.6669, + "step": 2731 + }, + { + "epoch": 0.756786703601108, + "grad_norm": 0.6642410755157471, + "learning_rate": 4.81838395136673e-06, + "loss": 0.667, + "step": 2732 + }, + { + "epoch": 0.7570637119113574, + "grad_norm": 0.7103769183158875, + "learning_rate": 4.818247616374351e-06, + "loss": 0.6894, + "step": 2733 + }, + { + "epoch": 0.7573407202216067, + "grad_norm": 0.652321994304657, + "learning_rate": 4.8181112321596544e-06, + "loss": 0.6889, + "step": 2734 + }, + { + "epoch": 0.7576177285318559, + "grad_norm": 0.684417724609375, + "learning_rate": 4.817974798725537e-06, + "loss": 0.686, + "step": 2735 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 0.6455621123313904, + "learning_rate": 4.817838316074894e-06, + "loss": 0.6586, + "step": 2736 + }, + { + "epoch": 0.7581717451523545, + "grad_norm": 0.7123239636421204, + "learning_rate": 4.817701784210626e-06, + "loss": 0.6715, + "step": 2737 + }, + { + "epoch": 0.7584487534626039, + "grad_norm": 0.6630301475524902, + "learning_rate": 4.817565203135629e-06, + "loss": 0.6998, + "step": 2738 + }, + { + "epoch": 0.7587257617728532, + "grad_norm": 0.6584340333938599, + "learning_rate": 4.817428572852806e-06, + "loss": 0.6578, + "step": 2739 + }, + { + "epoch": 0.7590027700831025, + "grad_norm": 0.6827883720397949, + "learning_rate": 4.817291893365055e-06, + "loss": 0.6981, + "step": 2740 + }, + { + "epoch": 0.7592797783933518, + "grad_norm": 0.7373248338699341, + "learning_rate": 4.81715516467528e-06, + "loss": 0.6469, + "step": 2741 + }, + { + "epoch": 0.7595567867036012, + "grad_norm": 0.6962078213691711, + "learning_rate": 4.817018386786383e-06, + "loss": 0.6335, + "step": 2742 + }, + { + "epoch": 0.7598337950138504, + "grad_norm": 0.6791117787361145, + "learning_rate": 4.816881559701269e-06, + "loss": 0.6387, + "step": 2743 + }, + { + "epoch": 0.7601108033240997, + "grad_norm": 0.732049822807312, + "learning_rate": 4.816744683422843e-06, + "loss": 0.6952, + "step": 2744 + }, + { + "epoch": 0.760387811634349, + "grad_norm": 0.6718223690986633, + "learning_rate": 4.81660775795401e-06, + "loss": 0.6392, + "step": 2745 + }, + { + "epoch": 0.7606648199445983, + "grad_norm": 0.7064507007598877, + "learning_rate": 4.816470783297679e-06, + "loss": 0.7156, + "step": 2746 + }, + { + "epoch": 0.7609418282548477, + "grad_norm": 0.6774894595146179, + "learning_rate": 4.8163337594567576e-06, + "loss": 0.6582, + "step": 2747 + }, + { + "epoch": 0.7612188365650969, + "grad_norm": 0.6640307307243347, + "learning_rate": 4.816196686434156e-06, + "loss": 0.6511, + "step": 2748 + }, + { + "epoch": 0.7614958448753463, + "grad_norm": 0.6646149754524231, + "learning_rate": 4.816059564232782e-06, + "loss": 0.6468, + "step": 2749 + }, + { + "epoch": 0.7617728531855956, + "grad_norm": 0.6548853516578674, + "learning_rate": 4.815922392855551e-06, + "loss": 0.6684, + "step": 2750 + }, + { + "epoch": 0.7620498614958449, + "grad_norm": 0.6696813702583313, + "learning_rate": 4.815785172305372e-06, + "loss": 0.6903, + "step": 2751 + }, + { + "epoch": 0.7623268698060942, + "grad_norm": 0.6660913825035095, + "learning_rate": 4.8156479025851604e-06, + "loss": 0.6537, + "step": 2752 + }, + { + "epoch": 0.7626038781163434, + "grad_norm": 0.6767622828483582, + "learning_rate": 4.81551058369783e-06, + "loss": 0.6948, + "step": 2753 + }, + { + "epoch": 0.7628808864265928, + "grad_norm": 0.6668721437454224, + "learning_rate": 4.815373215646298e-06, + "loss": 0.675, + "step": 2754 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.7125071883201599, + "learning_rate": 4.815235798433478e-06, + "loss": 0.6843, + "step": 2755 + }, + { + "epoch": 0.7634349030470914, + "grad_norm": 0.6485822796821594, + "learning_rate": 4.81509833206229e-06, + "loss": 0.6778, + "step": 2756 + }, + { + "epoch": 0.7637119113573407, + "grad_norm": 0.6997203230857849, + "learning_rate": 4.814960816535653e-06, + "loss": 0.7199, + "step": 2757 + }, + { + "epoch": 0.7639889196675901, + "grad_norm": 0.6658564805984497, + "learning_rate": 4.814823251856486e-06, + "loss": 0.6775, + "step": 2758 + }, + { + "epoch": 0.7642659279778393, + "grad_norm": 0.6849539875984192, + "learning_rate": 4.8146856380277094e-06, + "loss": 0.6719, + "step": 2759 + }, + { + "epoch": 0.7645429362880887, + "grad_norm": 0.6896138191223145, + "learning_rate": 4.814547975052245e-06, + "loss": 0.7024, + "step": 2760 + }, + { + "epoch": 0.7648199445983379, + "grad_norm": 0.6258552670478821, + "learning_rate": 4.814410262933018e-06, + "loss": 0.6067, + "step": 2761 + }, + { + "epoch": 0.7650969529085873, + "grad_norm": 0.6760597229003906, + "learning_rate": 4.81427250167295e-06, + "loss": 0.6688, + "step": 2762 + }, + { + "epoch": 0.7653739612188366, + "grad_norm": 0.651985228061676, + "learning_rate": 4.814134691274966e-06, + "loss": 0.6739, + "step": 2763 + }, + { + "epoch": 0.7656509695290858, + "grad_norm": 0.7685214281082153, + "learning_rate": 4.813996831741994e-06, + "loss": 0.6574, + "step": 2764 + }, + { + "epoch": 0.7659279778393352, + "grad_norm": 0.7026417851448059, + "learning_rate": 4.81385892307696e-06, + "loss": 0.6929, + "step": 2765 + }, + { + "epoch": 0.7662049861495844, + "grad_norm": 0.6741276383399963, + "learning_rate": 4.813720965282792e-06, + "loss": 0.6752, + "step": 2766 + }, + { + "epoch": 0.7664819944598338, + "grad_norm": 0.6811288595199585, + "learning_rate": 4.8135829583624185e-06, + "loss": 0.6843, + "step": 2767 + }, + { + "epoch": 0.7667590027700831, + "grad_norm": 0.6758236289024353, + "learning_rate": 4.8134449023187715e-06, + "loss": 0.6943, + "step": 2768 + }, + { + "epoch": 0.7670360110803324, + "grad_norm": 0.6365170478820801, + "learning_rate": 4.813306797154781e-06, + "loss": 0.619, + "step": 2769 + }, + { + "epoch": 0.7673130193905817, + "grad_norm": 0.6581631898880005, + "learning_rate": 4.813168642873379e-06, + "loss": 0.6761, + "step": 2770 + }, + { + "epoch": 0.7675900277008311, + "grad_norm": 0.6589594483375549, + "learning_rate": 4.813030439477501e-06, + "loss": 0.7126, + "step": 2771 + }, + { + "epoch": 0.7678670360110803, + "grad_norm": 0.6597155332565308, + "learning_rate": 4.812892186970078e-06, + "loss": 0.7007, + "step": 2772 + }, + { + "epoch": 0.7681440443213297, + "grad_norm": 0.677655816078186, + "learning_rate": 4.812753885354049e-06, + "loss": 0.6439, + "step": 2773 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 0.6718606948852539, + "learning_rate": 4.8126155346323475e-06, + "loss": 0.7329, + "step": 2774 + }, + { + "epoch": 0.7686980609418282, + "grad_norm": 0.6654607057571411, + "learning_rate": 4.812477134807914e-06, + "loss": 0.6597, + "step": 2775 + }, + { + "epoch": 0.7689750692520776, + "grad_norm": 0.6524559855461121, + "learning_rate": 4.812338685883684e-06, + "loss": 0.6147, + "step": 2776 + }, + { + "epoch": 0.7692520775623268, + "grad_norm": 0.6646273732185364, + "learning_rate": 4.8122001878626e-06, + "loss": 0.669, + "step": 2777 + }, + { + "epoch": 0.7695290858725762, + "grad_norm": 0.6726834774017334, + "learning_rate": 4.8120616407476e-06, + "loss": 0.672, + "step": 2778 + }, + { + "epoch": 0.7698060941828255, + "grad_norm": 0.652396559715271, + "learning_rate": 4.811923044541628e-06, + "loss": 0.6632, + "step": 2779 + }, + { + "epoch": 0.7700831024930748, + "grad_norm": 0.6736995577812195, + "learning_rate": 4.811784399247625e-06, + "loss": 0.629, + "step": 2780 + }, + { + "epoch": 0.7703601108033241, + "grad_norm": 0.6741900444030762, + "learning_rate": 4.811645704868537e-06, + "loss": 0.6602, + "step": 2781 + }, + { + "epoch": 0.7706371191135734, + "grad_norm": 0.6603039503097534, + "learning_rate": 4.8115069614073065e-06, + "loss": 0.7292, + "step": 2782 + }, + { + "epoch": 0.7709141274238227, + "grad_norm": 0.6762208938598633, + "learning_rate": 4.81136816886688e-06, + "loss": 0.6798, + "step": 2783 + }, + { + "epoch": 0.771191135734072, + "grad_norm": 0.668865978717804, + "learning_rate": 4.811229327250204e-06, + "loss": 0.6508, + "step": 2784 + }, + { + "epoch": 0.7714681440443213, + "grad_norm": 0.7083577513694763, + "learning_rate": 4.8110904365602285e-06, + "loss": 0.6987, + "step": 2785 + }, + { + "epoch": 0.7717451523545706, + "grad_norm": 0.67978835105896, + "learning_rate": 4.8109514967999e-06, + "loss": 0.6525, + "step": 2786 + }, + { + "epoch": 0.77202216066482, + "grad_norm": 0.6726179718971252, + "learning_rate": 4.8108125079721705e-06, + "loss": 0.6151, + "step": 2787 + }, + { + "epoch": 0.7722991689750692, + "grad_norm": 0.6633230447769165, + "learning_rate": 4.8106734700799905e-06, + "loss": 0.7054, + "step": 2788 + }, + { + "epoch": 0.7725761772853186, + "grad_norm": 0.6689402461051941, + "learning_rate": 4.810534383126311e-06, + "loss": 0.6577, + "step": 2789 + }, + { + "epoch": 0.7728531855955678, + "grad_norm": 0.706071674823761, + "learning_rate": 4.810395247114087e-06, + "loss": 0.7239, + "step": 2790 + }, + { + "epoch": 0.7731301939058172, + "grad_norm": 0.7108891606330872, + "learning_rate": 4.810256062046271e-06, + "loss": 0.6916, + "step": 2791 + }, + { + "epoch": 0.7734072022160665, + "grad_norm": 0.6585329174995422, + "learning_rate": 4.810116827925819e-06, + "loss": 0.6815, + "step": 2792 + }, + { + "epoch": 0.7736842105263158, + "grad_norm": 0.6893103718757629, + "learning_rate": 4.8099775447556885e-06, + "loss": 0.7041, + "step": 2793 + }, + { + "epoch": 0.7739612188365651, + "grad_norm": 0.604040801525116, + "learning_rate": 4.809838212538835e-06, + "loss": 0.5979, + "step": 2794 + }, + { + "epoch": 0.7742382271468145, + "grad_norm": 0.7108220458030701, + "learning_rate": 4.809698831278217e-06, + "loss": 0.6336, + "step": 2795 + }, + { + "epoch": 0.7745152354570637, + "grad_norm": 0.6532512903213501, + "learning_rate": 4.809559400976796e-06, + "loss": 0.6453, + "step": 2796 + }, + { + "epoch": 0.774792243767313, + "grad_norm": 0.6732306480407715, + "learning_rate": 4.809419921637529e-06, + "loss": 0.697, + "step": 2797 + }, + { + "epoch": 0.7750692520775623, + "grad_norm": 0.7318335175514221, + "learning_rate": 4.80928039326338e-06, + "loss": 0.6928, + "step": 2798 + }, + { + "epoch": 0.7753462603878116, + "grad_norm": 0.6802420020103455, + "learning_rate": 4.8091408158573115e-06, + "loss": 0.6836, + "step": 2799 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 0.7560245394706726, + "learning_rate": 4.8090011894222874e-06, + "loss": 0.7218, + "step": 2800 + }, + { + "epoch": 0.7759002770083102, + "grad_norm": 0.6963402628898621, + "learning_rate": 4.80886151396127e-06, + "loss": 0.6777, + "step": 2801 + }, + { + "epoch": 0.7761772853185596, + "grad_norm": 0.6747308969497681, + "learning_rate": 4.808721789477227e-06, + "loss": 0.6525, + "step": 2802 + }, + { + "epoch": 0.7764542936288089, + "grad_norm": 0.6558110117912292, + "learning_rate": 4.8085820159731245e-06, + "loss": 0.6419, + "step": 2803 + }, + { + "epoch": 0.7767313019390581, + "grad_norm": 0.6836314797401428, + "learning_rate": 4.808442193451931e-06, + "loss": 0.6685, + "step": 2804 + }, + { + "epoch": 0.7770083102493075, + "grad_norm": 0.7180214524269104, + "learning_rate": 4.808302321916614e-06, + "loss": 0.7183, + "step": 2805 + }, + { + "epoch": 0.7772853185595567, + "grad_norm": 0.6658193469047546, + "learning_rate": 4.8081624013701435e-06, + "loss": 0.6348, + "step": 2806 + }, + { + "epoch": 0.7775623268698061, + "grad_norm": 0.7152186036109924, + "learning_rate": 4.808022431815491e-06, + "loss": 0.7257, + "step": 2807 + }, + { + "epoch": 0.7778393351800554, + "grad_norm": 0.7137231230735779, + "learning_rate": 4.807882413255629e-06, + "loss": 0.7042, + "step": 2808 + }, + { + "epoch": 0.7781163434903047, + "grad_norm": 0.6712000370025635, + "learning_rate": 4.807742345693529e-06, + "loss": 0.7017, + "step": 2809 + }, + { + "epoch": 0.778393351800554, + "grad_norm": 0.6861433386802673, + "learning_rate": 4.807602229132166e-06, + "loss": 0.6771, + "step": 2810 + }, + { + "epoch": 0.7786703601108034, + "grad_norm": 0.6944916248321533, + "learning_rate": 4.807462063574515e-06, + "loss": 0.6572, + "step": 2811 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 0.6742745041847229, + "learning_rate": 4.8073218490235506e-06, + "loss": 0.6802, + "step": 2812 + }, + { + "epoch": 0.779224376731302, + "grad_norm": 0.672527551651001, + "learning_rate": 4.807181585482252e-06, + "loss": 0.6926, + "step": 2813 + }, + { + "epoch": 0.7795013850415512, + "grad_norm": 0.7015137672424316, + "learning_rate": 4.807041272953596e-06, + "loss": 0.7219, + "step": 2814 + }, + { + "epoch": 0.7797783933518005, + "grad_norm": 0.6598617434501648, + "learning_rate": 4.806900911440563e-06, + "loss": 0.695, + "step": 2815 + }, + { + "epoch": 0.7800554016620499, + "grad_norm": 0.6474493145942688, + "learning_rate": 4.806760500946132e-06, + "loss": 0.6592, + "step": 2816 + }, + { + "epoch": 0.7803324099722991, + "grad_norm": 0.673628032207489, + "learning_rate": 4.806620041473285e-06, + "loss": 0.6625, + "step": 2817 + }, + { + "epoch": 0.7806094182825485, + "grad_norm": 0.6895366311073303, + "learning_rate": 4.806479533025003e-06, + "loss": 0.7107, + "step": 2818 + }, + { + "epoch": 0.7808864265927978, + "grad_norm": 0.6482427716255188, + "learning_rate": 4.806338975604271e-06, + "loss": 0.6109, + "step": 2819 + }, + { + "epoch": 0.7811634349030471, + "grad_norm": 0.6845487356185913, + "learning_rate": 4.806198369214073e-06, + "loss": 0.7063, + "step": 2820 + }, + { + "epoch": 0.7814404432132964, + "grad_norm": 0.6725820302963257, + "learning_rate": 4.8060577138573945e-06, + "loss": 0.6446, + "step": 2821 + }, + { + "epoch": 0.7817174515235457, + "grad_norm": 0.6608311533927917, + "learning_rate": 4.805917009537222e-06, + "loss": 0.6662, + "step": 2822 + }, + { + "epoch": 0.781994459833795, + "grad_norm": 0.677912175655365, + "learning_rate": 4.805776256256543e-06, + "loss": 0.6848, + "step": 2823 + }, + { + "epoch": 0.7822714681440444, + "grad_norm": 0.6723567843437195, + "learning_rate": 4.805635454018344e-06, + "loss": 0.7034, + "step": 2824 + }, + { + "epoch": 0.7825484764542936, + "grad_norm": 0.7071090936660767, + "learning_rate": 4.805494602825617e-06, + "loss": 0.6575, + "step": 2825 + }, + { + "epoch": 0.782825484764543, + "grad_norm": 0.6822702288627625, + "learning_rate": 4.805353702681352e-06, + "loss": 0.6527, + "step": 2826 + }, + { + "epoch": 0.7831024930747923, + "grad_norm": 0.7353903651237488, + "learning_rate": 4.8052127535885415e-06, + "loss": 0.6578, + "step": 2827 + }, + { + "epoch": 0.7833795013850415, + "grad_norm": 0.6553813815116882, + "learning_rate": 4.805071755550177e-06, + "loss": 0.6559, + "step": 2828 + }, + { + "epoch": 0.7836565096952909, + "grad_norm": 0.6671435236930847, + "learning_rate": 4.804930708569252e-06, + "loss": 0.7025, + "step": 2829 + }, + { + "epoch": 0.7839335180055401, + "grad_norm": 0.6840083003044128, + "learning_rate": 4.804789612648762e-06, + "loss": 0.7181, + "step": 2830 + }, + { + "epoch": 0.7842105263157895, + "grad_norm": 0.6222642660140991, + "learning_rate": 4.804648467791704e-06, + "loss": 0.5592, + "step": 2831 + }, + { + "epoch": 0.7844875346260388, + "grad_norm": 0.6630566120147705, + "learning_rate": 4.804507274001073e-06, + "loss": 0.68, + "step": 2832 + }, + { + "epoch": 0.7847645429362881, + "grad_norm": 0.7513610124588013, + "learning_rate": 4.804366031279866e-06, + "loss": 0.7193, + "step": 2833 + }, + { + "epoch": 0.7850415512465374, + "grad_norm": 0.6853703856468201, + "learning_rate": 4.804224739631085e-06, + "loss": 0.665, + "step": 2834 + }, + { + "epoch": 0.7853185595567868, + "grad_norm": 0.6761021614074707, + "learning_rate": 4.804083399057728e-06, + "loss": 0.6699, + "step": 2835 + }, + { + "epoch": 0.785595567867036, + "grad_norm": 0.6687069535255432, + "learning_rate": 4.803942009562797e-06, + "loss": 0.6945, + "step": 2836 + }, + { + "epoch": 0.7858725761772853, + "grad_norm": 0.6809954643249512, + "learning_rate": 4.803800571149292e-06, + "loss": 0.6902, + "step": 2837 + }, + { + "epoch": 0.7861495844875346, + "grad_norm": 0.7132784128189087, + "learning_rate": 4.8036590838202195e-06, + "loss": 0.7272, + "step": 2838 + }, + { + "epoch": 0.7864265927977839, + "grad_norm": 0.6614826321601868, + "learning_rate": 4.80351754757858e-06, + "loss": 0.6934, + "step": 2839 + }, + { + "epoch": 0.7867036011080333, + "grad_norm": 0.6766554713249207, + "learning_rate": 4.803375962427381e-06, + "loss": 0.6621, + "step": 2840 + }, + { + "epoch": 0.7869806094182825, + "grad_norm": 0.6399201154708862, + "learning_rate": 4.803234328369629e-06, + "loss": 0.715, + "step": 2841 + }, + { + "epoch": 0.7872576177285319, + "grad_norm": 0.68129962682724, + "learning_rate": 4.80309264540833e-06, + "loss": 0.6761, + "step": 2842 + }, + { + "epoch": 0.7875346260387812, + "grad_norm": 0.6482844352722168, + "learning_rate": 4.802950913546493e-06, + "loss": 0.6969, + "step": 2843 + }, + { + "epoch": 0.7878116343490305, + "grad_norm": 0.6589528322219849, + "learning_rate": 4.802809132787125e-06, + "loss": 0.6856, + "step": 2844 + }, + { + "epoch": 0.7880886426592798, + "grad_norm": 0.6505930423736572, + "learning_rate": 4.80266730313324e-06, + "loss": 0.6447, + "step": 2845 + }, + { + "epoch": 0.788365650969529, + "grad_norm": 0.6826377511024475, + "learning_rate": 4.8025254245878486e-06, + "loss": 0.73, + "step": 2846 + }, + { + "epoch": 0.7886426592797784, + "grad_norm": 0.6710218787193298, + "learning_rate": 4.802383497153961e-06, + "loss": 0.6912, + "step": 2847 + }, + { + "epoch": 0.7889196675900277, + "grad_norm": 0.6851953268051147, + "learning_rate": 4.802241520834593e-06, + "loss": 0.6747, + "step": 2848 + }, + { + "epoch": 0.789196675900277, + "grad_norm": 0.6651461124420166, + "learning_rate": 4.802099495632757e-06, + "loss": 0.6338, + "step": 2849 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.6330905556678772, + "learning_rate": 4.8019574215514705e-06, + "loss": 0.6573, + "step": 2850 + }, + { + "epoch": 0.7897506925207757, + "grad_norm": 0.7091717720031738, + "learning_rate": 4.80181529859375e-06, + "loss": 0.678, + "step": 2851 + }, + { + "epoch": 0.7900277008310249, + "grad_norm": 0.6825574040412903, + "learning_rate": 4.801673126762612e-06, + "loss": 0.6308, + "step": 2852 + }, + { + "epoch": 0.7903047091412743, + "grad_norm": 0.6608375310897827, + "learning_rate": 4.801530906061076e-06, + "loss": 0.6497, + "step": 2853 + }, + { + "epoch": 0.7905817174515235, + "grad_norm": 0.641620397567749, + "learning_rate": 4.801388636492161e-06, + "loss": 0.678, + "step": 2854 + }, + { + "epoch": 0.7908587257617729, + "grad_norm": 0.6613548994064331, + "learning_rate": 4.801246318058888e-06, + "loss": 0.668, + "step": 2855 + }, + { + "epoch": 0.7911357340720222, + "grad_norm": 0.6885948181152344, + "learning_rate": 4.8011039507642785e-06, + "loss": 0.6979, + "step": 2856 + }, + { + "epoch": 0.7914127423822714, + "grad_norm": 0.67684006690979, + "learning_rate": 4.800961534611357e-06, + "loss": 0.7128, + "step": 2857 + }, + { + "epoch": 0.7916897506925208, + "grad_norm": 0.6780455112457275, + "learning_rate": 4.800819069603145e-06, + "loss": 0.6661, + "step": 2858 + }, + { + "epoch": 0.7919667590027701, + "grad_norm": 0.6764765977859497, + "learning_rate": 4.800676555742669e-06, + "loss": 0.6854, + "step": 2859 + }, + { + "epoch": 0.7922437673130194, + "grad_norm": 0.6664876341819763, + "learning_rate": 4.800533993032955e-06, + "loss": 0.6581, + "step": 2860 + }, + { + "epoch": 0.7925207756232687, + "grad_norm": 0.6832924485206604, + "learning_rate": 4.800391381477028e-06, + "loss": 0.6748, + "step": 2861 + }, + { + "epoch": 0.792797783933518, + "grad_norm": 0.6635904312133789, + "learning_rate": 4.800248721077919e-06, + "loss": 0.6682, + "step": 2862 + }, + { + "epoch": 0.7930747922437673, + "grad_norm": 0.6542471647262573, + "learning_rate": 4.800106011838655e-06, + "loss": 0.6875, + "step": 2863 + }, + { + "epoch": 0.7933518005540167, + "grad_norm": 0.6659771203994751, + "learning_rate": 4.7999632537622655e-06, + "loss": 0.6572, + "step": 2864 + }, + { + "epoch": 0.7936288088642659, + "grad_norm": 0.6873894929885864, + "learning_rate": 4.799820446851784e-06, + "loss": 0.6411, + "step": 2865 + }, + { + "epoch": 0.7939058171745152, + "grad_norm": 0.6617361903190613, + "learning_rate": 4.79967759111024e-06, + "loss": 0.6834, + "step": 2866 + }, + { + "epoch": 0.7941828254847645, + "grad_norm": 0.6959488391876221, + "learning_rate": 4.799534686540669e-06, + "loss": 0.7283, + "step": 2867 + }, + { + "epoch": 0.7944598337950138, + "grad_norm": 0.7266706824302673, + "learning_rate": 4.799391733146104e-06, + "loss": 0.6708, + "step": 2868 + }, + { + "epoch": 0.7947368421052632, + "grad_norm": 0.6400945782661438, + "learning_rate": 4.799248730929581e-06, + "loss": 0.6214, + "step": 2869 + }, + { + "epoch": 0.7950138504155124, + "grad_norm": 0.6700130105018616, + "learning_rate": 4.799105679894135e-06, + "loss": 0.6563, + "step": 2870 + }, + { + "epoch": 0.7952908587257618, + "grad_norm": 0.6552063226699829, + "learning_rate": 4.798962580042804e-06, + "loss": 0.6555, + "step": 2871 + }, + { + "epoch": 0.7955678670360111, + "grad_norm": 0.6592832803726196, + "learning_rate": 4.7988194313786275e-06, + "loss": 0.6347, + "step": 2872 + }, + { + "epoch": 0.7958448753462604, + "grad_norm": 0.7056299448013306, + "learning_rate": 4.798676233904643e-06, + "loss": 0.7304, + "step": 2873 + }, + { + "epoch": 0.7961218836565097, + "grad_norm": 0.6806883811950684, + "learning_rate": 4.798532987623892e-06, + "loss": 0.7033, + "step": 2874 + }, + { + "epoch": 0.796398891966759, + "grad_norm": 0.6627018451690674, + "learning_rate": 4.798389692539416e-06, + "loss": 0.7396, + "step": 2875 + }, + { + "epoch": 0.7966759002770083, + "grad_norm": 0.6570676565170288, + "learning_rate": 4.798246348654258e-06, + "loss": 0.6506, + "step": 2876 + }, + { + "epoch": 0.7969529085872576, + "grad_norm": 0.6514227390289307, + "learning_rate": 4.79810295597146e-06, + "loss": 0.6356, + "step": 2877 + }, + { + "epoch": 0.7972299168975069, + "grad_norm": 0.6728456020355225, + "learning_rate": 4.7979595144940675e-06, + "loss": 0.6399, + "step": 2878 + }, + { + "epoch": 0.7975069252077562, + "grad_norm": 0.6666335463523865, + "learning_rate": 4.797816024225127e-06, + "loss": 0.6662, + "step": 2879 + }, + { + "epoch": 0.7977839335180056, + "grad_norm": 0.6706849932670593, + "learning_rate": 4.797672485167684e-06, + "loss": 0.6629, + "step": 2880 + }, + { + "epoch": 0.7980609418282548, + "grad_norm": 0.6484897136688232, + "learning_rate": 4.797528897324786e-06, + "loss": 0.6704, + "step": 2881 + }, + { + "epoch": 0.7983379501385042, + "grad_norm": 0.6639708876609802, + "learning_rate": 4.797385260699483e-06, + "loss": 0.708, + "step": 2882 + }, + { + "epoch": 0.7986149584487534, + "grad_norm": 0.7080711722373962, + "learning_rate": 4.7972415752948235e-06, + "loss": 0.7269, + "step": 2883 + }, + { + "epoch": 0.7988919667590028, + "grad_norm": 0.6324862837791443, + "learning_rate": 4.797097841113859e-06, + "loss": 0.6438, + "step": 2884 + }, + { + "epoch": 0.7991689750692521, + "grad_norm": 0.6741070747375488, + "learning_rate": 4.796954058159641e-06, + "loss": 0.6608, + "step": 2885 + }, + { + "epoch": 0.7994459833795013, + "grad_norm": 0.65198814868927, + "learning_rate": 4.796810226435223e-06, + "loss": 0.6623, + "step": 2886 + }, + { + "epoch": 0.7997229916897507, + "grad_norm": 0.6975104212760925, + "learning_rate": 4.796666345943658e-06, + "loss": 0.7031, + "step": 2887 + }, + { + "epoch": 0.8, + "grad_norm": 0.7220117449760437, + "learning_rate": 4.796522416688002e-06, + "loss": 0.7107, + "step": 2888 + }, + { + "epoch": 0.8002770083102493, + "grad_norm": 0.7077227234840393, + "learning_rate": 4.7963784386713095e-06, + "loss": 0.6777, + "step": 2889 + }, + { + "epoch": 0.8005540166204986, + "grad_norm": 0.6867684125900269, + "learning_rate": 4.79623441189664e-06, + "loss": 0.6972, + "step": 2890 + }, + { + "epoch": 0.8008310249307479, + "grad_norm": 0.6424522399902344, + "learning_rate": 4.796090336367048e-06, + "loss": 0.6423, + "step": 2891 + }, + { + "epoch": 0.8011080332409972, + "grad_norm": 0.7177652716636658, + "learning_rate": 4.795946212085596e-06, + "loss": 0.6744, + "step": 2892 + }, + { + "epoch": 0.8013850415512466, + "grad_norm": 0.6668875813484192, + "learning_rate": 4.795802039055343e-06, + "loss": 0.6788, + "step": 2893 + }, + { + "epoch": 0.8016620498614958, + "grad_norm": 0.6542249321937561, + "learning_rate": 4.795657817279349e-06, + "loss": 0.7118, + "step": 2894 + }, + { + "epoch": 0.8019390581717452, + "grad_norm": 0.6446555256843567, + "learning_rate": 4.795513546760677e-06, + "loss": 0.6142, + "step": 2895 + }, + { + "epoch": 0.8022160664819945, + "grad_norm": 0.6913520693778992, + "learning_rate": 4.7953692275023915e-06, + "loss": 0.6781, + "step": 2896 + }, + { + "epoch": 0.8024930747922437, + "grad_norm": 0.6808624863624573, + "learning_rate": 4.795224859507553e-06, + "loss": 0.7138, + "step": 2897 + }, + { + "epoch": 0.8027700831024931, + "grad_norm": 0.6847794651985168, + "learning_rate": 4.795080442779232e-06, + "loss": 0.7004, + "step": 2898 + }, + { + "epoch": 0.8030470914127423, + "grad_norm": 0.7110046148300171, + "learning_rate": 4.794935977320491e-06, + "loss": 0.6779, + "step": 2899 + }, + { + "epoch": 0.8033240997229917, + "grad_norm": 0.6769573092460632, + "learning_rate": 4.794791463134399e-06, + "loss": 0.6365, + "step": 2900 + }, + { + "epoch": 0.803601108033241, + "grad_norm": 0.6371225118637085, + "learning_rate": 4.794646900224023e-06, + "loss": 0.6444, + "step": 2901 + }, + { + "epoch": 0.8038781163434903, + "grad_norm": 0.7346711754798889, + "learning_rate": 4.794502288592435e-06, + "loss": 0.6966, + "step": 2902 + }, + { + "epoch": 0.8041551246537396, + "grad_norm": 0.6535019278526306, + "learning_rate": 4.794357628242703e-06, + "loss": 0.6423, + "step": 2903 + }, + { + "epoch": 0.804432132963989, + "grad_norm": 0.6877285242080688, + "learning_rate": 4.7942129191778995e-06, + "loss": 0.6663, + "step": 2904 + }, + { + "epoch": 0.8047091412742382, + "grad_norm": 0.6775194406509399, + "learning_rate": 4.794068161401097e-06, + "loss": 0.7229, + "step": 2905 + }, + { + "epoch": 0.8049861495844876, + "grad_norm": 0.685641884803772, + "learning_rate": 4.793923354915369e-06, + "loss": 0.6879, + "step": 2906 + }, + { + "epoch": 0.8052631578947368, + "grad_norm": 0.6785410642623901, + "learning_rate": 4.7937784997237894e-06, + "loss": 0.7212, + "step": 2907 + }, + { + "epoch": 0.8055401662049861, + "grad_norm": 0.6728705167770386, + "learning_rate": 4.793633595829435e-06, + "loss": 0.6853, + "step": 2908 + }, + { + "epoch": 0.8058171745152355, + "grad_norm": 0.6703183054924011, + "learning_rate": 4.793488643235383e-06, + "loss": 0.6484, + "step": 2909 + }, + { + "epoch": 0.8060941828254847, + "grad_norm": 0.6292587518692017, + "learning_rate": 4.793343641944709e-06, + "loss": 0.6437, + "step": 2910 + }, + { + "epoch": 0.8063711911357341, + "grad_norm": 0.6794230937957764, + "learning_rate": 4.793198591960494e-06, + "loss": 0.6689, + "step": 2911 + }, + { + "epoch": 0.8066481994459834, + "grad_norm": 0.7074090838432312, + "learning_rate": 4.793053493285816e-06, + "loss": 0.691, + "step": 2912 + }, + { + "epoch": 0.8069252077562327, + "grad_norm": 0.6919972896575928, + "learning_rate": 4.792908345923757e-06, + "loss": 0.6969, + "step": 2913 + }, + { + "epoch": 0.807202216066482, + "grad_norm": 0.6462123990058899, + "learning_rate": 4.7927631498773975e-06, + "loss": 0.6998, + "step": 2914 + }, + { + "epoch": 0.8074792243767313, + "grad_norm": 0.7100611925125122, + "learning_rate": 4.792617905149822e-06, + "loss": 0.7316, + "step": 2915 + }, + { + "epoch": 0.8077562326869806, + "grad_norm": 0.6892986297607422, + "learning_rate": 4.7924726117441135e-06, + "loss": 0.6959, + "step": 2916 + }, + { + "epoch": 0.80803324099723, + "grad_norm": 0.7016753554344177, + "learning_rate": 4.792327269663357e-06, + "loss": 0.764, + "step": 2917 + }, + { + "epoch": 0.8083102493074792, + "grad_norm": 0.6642240285873413, + "learning_rate": 4.7921818789106385e-06, + "loss": 0.7016, + "step": 2918 + }, + { + "epoch": 0.8085872576177285, + "grad_norm": 0.6750599145889282, + "learning_rate": 4.792036439489045e-06, + "loss": 0.6751, + "step": 2919 + }, + { + "epoch": 0.8088642659279779, + "grad_norm": 0.720486044883728, + "learning_rate": 4.791890951401664e-06, + "loss": 0.6709, + "step": 2920 + }, + { + "epoch": 0.8091412742382271, + "grad_norm": 0.6749086380004883, + "learning_rate": 4.791745414651588e-06, + "loss": 0.6387, + "step": 2921 + }, + { + "epoch": 0.8094182825484765, + "grad_norm": 0.6732552647590637, + "learning_rate": 4.791599829241902e-06, + "loss": 0.673, + "step": 2922 + }, + { + "epoch": 0.8096952908587257, + "grad_norm": 0.7399674654006958, + "learning_rate": 4.7914541951757e-06, + "loss": 0.6923, + "step": 2923 + }, + { + "epoch": 0.8099722991689751, + "grad_norm": 0.6551409959793091, + "learning_rate": 4.791308512456074e-06, + "loss": 0.651, + "step": 2924 + }, + { + "epoch": 0.8102493074792244, + "grad_norm": 0.69414883852005, + "learning_rate": 4.7911627810861165e-06, + "loss": 0.6788, + "step": 2925 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 0.7157154083251953, + "learning_rate": 4.791017001068922e-06, + "loss": 0.6912, + "step": 2926 + }, + { + "epoch": 0.810803324099723, + "grad_norm": 0.6880971193313599, + "learning_rate": 4.7908711724075865e-06, + "loss": 0.6905, + "step": 2927 + }, + { + "epoch": 0.8110803324099723, + "grad_norm": 0.6524692177772522, + "learning_rate": 4.790725295105205e-06, + "loss": 0.6504, + "step": 2928 + }, + { + "epoch": 0.8113573407202216, + "grad_norm": 0.6358970403671265, + "learning_rate": 4.790579369164876e-06, + "loss": 0.6964, + "step": 2929 + }, + { + "epoch": 0.8116343490304709, + "grad_norm": 0.6947261691093445, + "learning_rate": 4.790433394589698e-06, + "loss": 0.6553, + "step": 2930 + }, + { + "epoch": 0.8119113573407202, + "grad_norm": 0.681126594543457, + "learning_rate": 4.790287371382769e-06, + "loss": 0.7046, + "step": 2931 + }, + { + "epoch": 0.8121883656509695, + "grad_norm": 0.6445147395133972, + "learning_rate": 4.790141299547191e-06, + "loss": 0.6449, + "step": 2932 + }, + { + "epoch": 0.8124653739612189, + "grad_norm": 0.6821317076683044, + "learning_rate": 4.789995179086065e-06, + "loss": 0.7093, + "step": 2933 + }, + { + "epoch": 0.8127423822714681, + "grad_norm": 0.675289511680603, + "learning_rate": 4.789849010002493e-06, + "loss": 0.6994, + "step": 2934 + }, + { + "epoch": 0.8130193905817175, + "grad_norm": 0.7069120407104492, + "learning_rate": 4.789702792299579e-06, + "loss": 0.6969, + "step": 2935 + }, + { + "epoch": 0.8132963988919668, + "grad_norm": 0.6931147575378418, + "learning_rate": 4.7895565259804275e-06, + "loss": 0.6342, + "step": 2936 + }, + { + "epoch": 0.813573407202216, + "grad_norm": 0.6551658511161804, + "learning_rate": 4.789410211048144e-06, + "loss": 0.6957, + "step": 2937 + }, + { + "epoch": 0.8138504155124654, + "grad_norm": 0.6618546843528748, + "learning_rate": 4.789263847505835e-06, + "loss": 0.6414, + "step": 2938 + }, + { + "epoch": 0.8141274238227146, + "grad_norm": 0.6185067296028137, + "learning_rate": 4.789117435356608e-06, + "loss": 0.6516, + "step": 2939 + }, + { + "epoch": 0.814404432132964, + "grad_norm": 0.6497803330421448, + "learning_rate": 4.788970974603573e-06, + "loss": 0.6501, + "step": 2940 + }, + { + "epoch": 0.8146814404432133, + "grad_norm": 0.7077935934066772, + "learning_rate": 4.788824465249838e-06, + "loss": 0.6295, + "step": 2941 + }, + { + "epoch": 0.8149584487534626, + "grad_norm": 0.6624383926391602, + "learning_rate": 4.788677907298516e-06, + "loss": 0.6637, + "step": 2942 + }, + { + "epoch": 0.8152354570637119, + "grad_norm": 0.6660526394844055, + "learning_rate": 4.788531300752715e-06, + "loss": 0.7064, + "step": 2943 + }, + { + "epoch": 0.8155124653739613, + "grad_norm": 0.6447203755378723, + "learning_rate": 4.788384645615552e-06, + "loss": 0.5935, + "step": 2944 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.6713807582855225, + "learning_rate": 4.788237941890139e-06, + "loss": 0.6769, + "step": 2945 + }, + { + "epoch": 0.8160664819944599, + "grad_norm": 0.6947737336158752, + "learning_rate": 4.788091189579591e-06, + "loss": 0.6391, + "step": 2946 + }, + { + "epoch": 0.8163434903047091, + "grad_norm": 0.686058521270752, + "learning_rate": 4.787944388687023e-06, + "loss": 0.7027, + "step": 2947 + }, + { + "epoch": 0.8166204986149584, + "grad_norm": 0.685586154460907, + "learning_rate": 4.7877975392155544e-06, + "loss": 0.7066, + "step": 2948 + }, + { + "epoch": 0.8168975069252078, + "grad_norm": 0.6798444390296936, + "learning_rate": 4.7876506411683e-06, + "loss": 0.7094, + "step": 2949 + }, + { + "epoch": 0.817174515235457, + "grad_norm": 0.67017662525177, + "learning_rate": 4.787503694548381e-06, + "loss": 0.6798, + "step": 2950 + }, + { + "epoch": 0.8174515235457064, + "grad_norm": 0.6418706774711609, + "learning_rate": 4.787356699358917e-06, + "loss": 0.684, + "step": 2951 + }, + { + "epoch": 0.8177285318559557, + "grad_norm": 0.6584951877593994, + "learning_rate": 4.787209655603029e-06, + "loss": 0.6451, + "step": 2952 + }, + { + "epoch": 0.818005540166205, + "grad_norm": 0.6570129990577698, + "learning_rate": 4.78706256328384e-06, + "loss": 0.6856, + "step": 2953 + }, + { + "epoch": 0.8182825484764543, + "grad_norm": 0.67294842004776, + "learning_rate": 4.78691542240447e-06, + "loss": 0.6419, + "step": 2954 + }, + { + "epoch": 0.8185595567867036, + "grad_norm": 0.6495556831359863, + "learning_rate": 4.786768232968047e-06, + "loss": 0.6566, + "step": 2955 + }, + { + "epoch": 0.8188365650969529, + "grad_norm": 0.6849834322929382, + "learning_rate": 4.786620994977695e-06, + "loss": 0.6939, + "step": 2956 + }, + { + "epoch": 0.8191135734072023, + "grad_norm": 0.6808969378471375, + "learning_rate": 4.78647370843654e-06, + "loss": 0.6968, + "step": 2957 + }, + { + "epoch": 0.8193905817174515, + "grad_norm": 0.672578752040863, + "learning_rate": 4.786326373347708e-06, + "loss": 0.6602, + "step": 2958 + }, + { + "epoch": 0.8196675900277008, + "grad_norm": 0.6576619744300842, + "learning_rate": 4.786178989714328e-06, + "loss": 0.6875, + "step": 2959 + }, + { + "epoch": 0.8199445983379502, + "grad_norm": 0.7055615186691284, + "learning_rate": 4.786031557539532e-06, + "loss": 0.7054, + "step": 2960 + }, + { + "epoch": 0.8202216066481994, + "grad_norm": 0.6421630382537842, + "learning_rate": 4.785884076826446e-06, + "loss": 0.6487, + "step": 2961 + }, + { + "epoch": 0.8204986149584488, + "grad_norm": 0.6846792101860046, + "learning_rate": 4.7857365475782045e-06, + "loss": 0.6168, + "step": 2962 + }, + { + "epoch": 0.820775623268698, + "grad_norm": 0.6749417185783386, + "learning_rate": 4.7855889697979394e-06, + "loss": 0.6087, + "step": 2963 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 0.6619457006454468, + "learning_rate": 4.7854413434887834e-06, + "loss": 0.69, + "step": 2964 + }, + { + "epoch": 0.8213296398891967, + "grad_norm": 0.6408563852310181, + "learning_rate": 4.7852936686538705e-06, + "loss": 0.6095, + "step": 2965 + }, + { + "epoch": 0.821606648199446, + "grad_norm": 0.6566827297210693, + "learning_rate": 4.785145945296338e-06, + "loss": 0.6519, + "step": 2966 + }, + { + "epoch": 0.8218836565096953, + "grad_norm": 0.6920059323310852, + "learning_rate": 4.784998173419321e-06, + "loss": 0.666, + "step": 2967 + }, + { + "epoch": 0.8221606648199447, + "grad_norm": 0.6602988839149475, + "learning_rate": 4.784850353025958e-06, + "loss": 0.6775, + "step": 2968 + }, + { + "epoch": 0.8224376731301939, + "grad_norm": 0.6310914754867554, + "learning_rate": 4.784702484119387e-06, + "loss": 0.6774, + "step": 2969 + }, + { + "epoch": 0.8227146814404432, + "grad_norm": 0.6583638787269592, + "learning_rate": 4.784554566702747e-06, + "loss": 0.6435, + "step": 2970 + }, + { + "epoch": 0.8229916897506925, + "grad_norm": 0.6954817771911621, + "learning_rate": 4.78440660077918e-06, + "loss": 0.6828, + "step": 2971 + }, + { + "epoch": 0.8232686980609418, + "grad_norm": 0.675418496131897, + "learning_rate": 4.7842585863518266e-06, + "loss": 0.6384, + "step": 2972 + }, + { + "epoch": 0.8235457063711912, + "grad_norm": 0.6548784375190735, + "learning_rate": 4.7841105234238305e-06, + "loss": 0.6704, + "step": 2973 + }, + { + "epoch": 0.8238227146814404, + "grad_norm": 0.6904942393302917, + "learning_rate": 4.7839624119983355e-06, + "loss": 0.6832, + "step": 2974 + }, + { + "epoch": 0.8240997229916898, + "grad_norm": 0.6817192435264587, + "learning_rate": 4.7838142520784855e-06, + "loss": 0.6846, + "step": 2975 + }, + { + "epoch": 0.824376731301939, + "grad_norm": 0.6640516519546509, + "learning_rate": 4.783666043667427e-06, + "loss": 0.6778, + "step": 2976 + }, + { + "epoch": 0.8246537396121884, + "grad_norm": 0.6464129686355591, + "learning_rate": 4.7835177867683054e-06, + "loss": 0.6745, + "step": 2977 + }, + { + "epoch": 0.8249307479224377, + "grad_norm": 0.6703967452049255, + "learning_rate": 4.7833694813842704e-06, + "loss": 0.6806, + "step": 2978 + }, + { + "epoch": 0.8252077562326869, + "grad_norm": 0.7072887420654297, + "learning_rate": 4.78322112751847e-06, + "loss": 0.6715, + "step": 2979 + }, + { + "epoch": 0.8254847645429363, + "grad_norm": 0.6501997709274292, + "learning_rate": 4.783072725174055e-06, + "loss": 0.6848, + "step": 2980 + }, + { + "epoch": 0.8257617728531856, + "grad_norm": 0.7048507928848267, + "learning_rate": 4.782924274354175e-06, + "loss": 0.6898, + "step": 2981 + }, + { + "epoch": 0.8260387811634349, + "grad_norm": 0.6500950455665588, + "learning_rate": 4.782775775061983e-06, + "loss": 0.6722, + "step": 2982 + }, + { + "epoch": 0.8263157894736842, + "grad_norm": 0.6806106567382812, + "learning_rate": 4.782627227300633e-06, + "loss": 0.6942, + "step": 2983 + }, + { + "epoch": 0.8265927977839335, + "grad_norm": 0.6816373467445374, + "learning_rate": 4.782478631073276e-06, + "loss": 0.7197, + "step": 2984 + }, + { + "epoch": 0.8268698060941828, + "grad_norm": 0.6448298096656799, + "learning_rate": 4.782329986383069e-06, + "loss": 0.676, + "step": 2985 + }, + { + "epoch": 0.8271468144044322, + "grad_norm": 0.6851121187210083, + "learning_rate": 4.782181293233168e-06, + "loss": 0.6175, + "step": 2986 + }, + { + "epoch": 0.8274238227146814, + "grad_norm": 0.6796623468399048, + "learning_rate": 4.782032551626731e-06, + "loss": 0.6643, + "step": 2987 + }, + { + "epoch": 0.8277008310249307, + "grad_norm": 0.7001053094863892, + "learning_rate": 4.781883761566915e-06, + "loss": 0.6902, + "step": 2988 + }, + { + "epoch": 0.8279778393351801, + "grad_norm": 0.6801310181617737, + "learning_rate": 4.781734923056879e-06, + "loss": 0.6573, + "step": 2989 + }, + { + "epoch": 0.8282548476454293, + "grad_norm": 0.6727662086486816, + "learning_rate": 4.781586036099784e-06, + "loss": 0.6617, + "step": 2990 + }, + { + "epoch": 0.8285318559556787, + "grad_norm": 0.6912431120872498, + "learning_rate": 4.78143710069879e-06, + "loss": 0.6612, + "step": 2991 + }, + { + "epoch": 0.8288088642659279, + "grad_norm": 0.6520569324493408, + "learning_rate": 4.781288116857061e-06, + "loss": 0.6571, + "step": 2992 + }, + { + "epoch": 0.8290858725761773, + "grad_norm": 0.7189939618110657, + "learning_rate": 4.781139084577759e-06, + "loss": 0.6676, + "step": 2993 + }, + { + "epoch": 0.8293628808864266, + "grad_norm": 0.6563591957092285, + "learning_rate": 4.78099000386405e-06, + "loss": 0.6675, + "step": 2994 + }, + { + "epoch": 0.8296398891966759, + "grad_norm": 0.6994754076004028, + "learning_rate": 4.780840874719097e-06, + "loss": 0.6568, + "step": 2995 + }, + { + "epoch": 0.8299168975069252, + "grad_norm": 0.6558755040168762, + "learning_rate": 4.780691697146068e-06, + "loss": 0.6082, + "step": 2996 + }, + { + "epoch": 0.8301939058171746, + "grad_norm": 0.6601712703704834, + "learning_rate": 4.780542471148131e-06, + "loss": 0.668, + "step": 2997 + }, + { + "epoch": 0.8304709141274238, + "grad_norm": 0.6975407004356384, + "learning_rate": 4.780393196728452e-06, + "loss": 0.6882, + "step": 2998 + }, + { + "epoch": 0.8307479224376731, + "grad_norm": 0.6977407932281494, + "learning_rate": 4.780243873890203e-06, + "loss": 0.6895, + "step": 2999 + }, + { + "epoch": 0.8310249307479224, + "grad_norm": 0.6513993740081787, + "learning_rate": 4.780094502636553e-06, + "loss": 0.6568, + "step": 3000 + }, + { + "epoch": 0.8313019390581717, + "grad_norm": 0.7048574686050415, + "learning_rate": 4.779945082970673e-06, + "loss": 0.6626, + "step": 3001 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 0.6709084510803223, + "learning_rate": 4.779795614895738e-06, + "loss": 0.6928, + "step": 3002 + }, + { + "epoch": 0.8318559556786703, + "grad_norm": 0.7100402116775513, + "learning_rate": 4.779646098414919e-06, + "loss": 0.6443, + "step": 3003 + }, + { + "epoch": 0.8321329639889197, + "grad_norm": 0.6646271347999573, + "learning_rate": 4.779496533531393e-06, + "loss": 0.6267, + "step": 3004 + }, + { + "epoch": 0.832409972299169, + "grad_norm": 0.6824468970298767, + "learning_rate": 4.779346920248333e-06, + "loss": 0.6945, + "step": 3005 + }, + { + "epoch": 0.8326869806094183, + "grad_norm": 0.6574153900146484, + "learning_rate": 4.779197258568918e-06, + "loss": 0.6556, + "step": 3006 + }, + { + "epoch": 0.8329639889196676, + "grad_norm": 0.6881409883499146, + "learning_rate": 4.779047548496325e-06, + "loss": 0.6532, + "step": 3007 + }, + { + "epoch": 0.8332409972299168, + "grad_norm": 0.6868331432342529, + "learning_rate": 4.778897790033732e-06, + "loss": 0.6402, + "step": 3008 + }, + { + "epoch": 0.8335180055401662, + "grad_norm": 0.6438159942626953, + "learning_rate": 4.778747983184319e-06, + "loss": 0.6512, + "step": 3009 + }, + { + "epoch": 0.8337950138504155, + "grad_norm": 0.6847102642059326, + "learning_rate": 4.778598127951268e-06, + "loss": 0.6651, + "step": 3010 + }, + { + "epoch": 0.8340720221606648, + "grad_norm": 0.6552219390869141, + "learning_rate": 4.778448224337759e-06, + "loss": 0.6574, + "step": 3011 + }, + { + "epoch": 0.8343490304709141, + "grad_norm": 0.6644777655601501, + "learning_rate": 4.778298272346975e-06, + "loss": 0.6665, + "step": 3012 + }, + { + "epoch": 0.8346260387811635, + "grad_norm": 0.6513178944587708, + "learning_rate": 4.778148271982102e-06, + "loss": 0.6535, + "step": 3013 + }, + { + "epoch": 0.8349030470914127, + "grad_norm": 0.6750456690788269, + "learning_rate": 4.777998223246323e-06, + "loss": 0.6848, + "step": 3014 + }, + { + "epoch": 0.8351800554016621, + "grad_norm": 0.7080809473991394, + "learning_rate": 4.777848126142824e-06, + "loss": 0.6663, + "step": 3015 + }, + { + "epoch": 0.8354570637119113, + "grad_norm": 0.725734293460846, + "learning_rate": 4.777697980674793e-06, + "loss": 0.7176, + "step": 3016 + }, + { + "epoch": 0.8357340720221607, + "grad_norm": 0.7126418352127075, + "learning_rate": 4.777547786845416e-06, + "loss": 0.7036, + "step": 3017 + }, + { + "epoch": 0.83601108033241, + "grad_norm": 0.6786972284317017, + "learning_rate": 4.777397544657884e-06, + "loss": 0.6533, + "step": 3018 + }, + { + "epoch": 0.8362880886426592, + "grad_norm": 0.6766000390052795, + "learning_rate": 4.7772472541153865e-06, + "loss": 0.6674, + "step": 3019 + }, + { + "epoch": 0.8365650969529086, + "grad_norm": 0.667149007320404, + "learning_rate": 4.777096915221114e-06, + "loss": 0.7067, + "step": 3020 + }, + { + "epoch": 0.8368421052631579, + "grad_norm": 0.7148330211639404, + "learning_rate": 4.776946527978259e-06, + "loss": 0.6662, + "step": 3021 + }, + { + "epoch": 0.8371191135734072, + "grad_norm": 0.6931552886962891, + "learning_rate": 4.776796092390015e-06, + "loss": 0.6841, + "step": 3022 + }, + { + "epoch": 0.8373961218836565, + "grad_norm": 0.7319605350494385, + "learning_rate": 4.776645608459575e-06, + "loss": 0.6989, + "step": 3023 + }, + { + "epoch": 0.8376731301939058, + "grad_norm": 0.6943248510360718, + "learning_rate": 4.7764950761901355e-06, + "loss": 0.6323, + "step": 3024 + }, + { + "epoch": 0.8379501385041551, + "grad_norm": 0.6594962477684021, + "learning_rate": 4.776344495584891e-06, + "loss": 0.6597, + "step": 3025 + }, + { + "epoch": 0.8382271468144045, + "grad_norm": 0.7413750290870667, + "learning_rate": 4.7761938666470405e-06, + "loss": 0.6685, + "step": 3026 + }, + { + "epoch": 0.8385041551246537, + "grad_norm": 0.6912171244621277, + "learning_rate": 4.776043189379781e-06, + "loss": 0.6683, + "step": 3027 + }, + { + "epoch": 0.838781163434903, + "grad_norm": 0.6637134552001953, + "learning_rate": 4.775892463786312e-06, + "loss": 0.721, + "step": 3028 + }, + { + "epoch": 0.8390581717451524, + "grad_norm": 0.6988323330879211, + "learning_rate": 4.775741689869835e-06, + "loss": 0.6792, + "step": 3029 + }, + { + "epoch": 0.8393351800554016, + "grad_norm": 0.6290362477302551, + "learning_rate": 4.77559086763355e-06, + "loss": 0.5723, + "step": 3030 + }, + { + "epoch": 0.839612188365651, + "grad_norm": 0.644509494304657, + "learning_rate": 4.775439997080659e-06, + "loss": 0.6326, + "step": 3031 + }, + { + "epoch": 0.8398891966759002, + "grad_norm": 0.6706430912017822, + "learning_rate": 4.775289078214366e-06, + "loss": 0.6629, + "step": 3032 + }, + { + "epoch": 0.8401662049861496, + "grad_norm": 0.7159662842750549, + "learning_rate": 4.775138111037876e-06, + "loss": 0.6711, + "step": 3033 + }, + { + "epoch": 0.8404432132963989, + "grad_norm": 0.669113039970398, + "learning_rate": 4.774987095554393e-06, + "loss": 0.6027, + "step": 3034 + }, + { + "epoch": 0.8407202216066482, + "grad_norm": 0.6876832246780396, + "learning_rate": 4.774836031767125e-06, + "loss": 0.6608, + "step": 3035 + }, + { + "epoch": 0.8409972299168975, + "grad_norm": 0.7057596445083618, + "learning_rate": 4.774684919679279e-06, + "loss": 0.7192, + "step": 3036 + }, + { + "epoch": 0.8412742382271469, + "grad_norm": 0.6566253900527954, + "learning_rate": 4.7745337592940635e-06, + "loss": 0.6393, + "step": 3037 + }, + { + "epoch": 0.8415512465373961, + "grad_norm": 0.6582949757575989, + "learning_rate": 4.774382550614687e-06, + "loss": 0.6249, + "step": 3038 + }, + { + "epoch": 0.8418282548476455, + "grad_norm": 0.6579154133796692, + "learning_rate": 4.77423129364436e-06, + "loss": 0.6777, + "step": 3039 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.6580916047096252, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.6998, + "step": 3040 + }, + { + "epoch": 0.842382271468144, + "grad_norm": 0.6692470908164978, + "learning_rate": 4.773928634843706e-06, + "loss": 0.6524, + "step": 3041 + }, + { + "epoch": 0.8426592797783934, + "grad_norm": 0.6601621508598328, + "learning_rate": 4.773777233019805e-06, + "loss": 0.6935, + "step": 3042 + }, + { + "epoch": 0.8429362880886426, + "grad_norm": 0.6472424268722534, + "learning_rate": 4.773625782917806e-06, + "loss": 0.6437, + "step": 3043 + }, + { + "epoch": 0.843213296398892, + "grad_norm": 0.6353874206542969, + "learning_rate": 4.773474284540926e-06, + "loss": 0.7074, + "step": 3044 + }, + { + "epoch": 0.8434903047091413, + "grad_norm": 0.6795141100883484, + "learning_rate": 4.773322737892381e-06, + "loss": 0.6591, + "step": 3045 + }, + { + "epoch": 0.8437673130193906, + "grad_norm": 0.6520573496818542, + "learning_rate": 4.773171142975388e-06, + "loss": 0.6911, + "step": 3046 + }, + { + "epoch": 0.8440443213296399, + "grad_norm": 0.6691749095916748, + "learning_rate": 4.773019499793167e-06, + "loss": 0.6495, + "step": 3047 + }, + { + "epoch": 0.8443213296398892, + "grad_norm": 0.6517788171768188, + "learning_rate": 4.7728678083489375e-06, + "loss": 0.6762, + "step": 3048 + }, + { + "epoch": 0.8445983379501385, + "grad_norm": 0.6732432842254639, + "learning_rate": 4.7727160686459205e-06, + "loss": 0.6815, + "step": 3049 + }, + { + "epoch": 0.8448753462603878, + "grad_norm": 0.6788813471794128, + "learning_rate": 4.7725642806873375e-06, + "loss": 0.6832, + "step": 3050 + }, + { + "epoch": 0.8451523545706371, + "grad_norm": 0.6740108728408813, + "learning_rate": 4.772412444476411e-06, + "loss": 0.6829, + "step": 3051 + }, + { + "epoch": 0.8454293628808864, + "grad_norm": 0.7147025465965271, + "learning_rate": 4.772260560016365e-06, + "loss": 0.6787, + "step": 3052 + }, + { + "epoch": 0.8457063711911358, + "grad_norm": 0.6838698983192444, + "learning_rate": 4.7721086273104245e-06, + "loss": 0.6355, + "step": 3053 + }, + { + "epoch": 0.845983379501385, + "grad_norm": 0.7325090169906616, + "learning_rate": 4.771956646361816e-06, + "loss": 0.6596, + "step": 3054 + }, + { + "epoch": 0.8462603878116344, + "grad_norm": 0.6333363056182861, + "learning_rate": 4.771804617173765e-06, + "loss": 0.6532, + "step": 3055 + }, + { + "epoch": 0.8465373961218836, + "grad_norm": 0.6893544793128967, + "learning_rate": 4.771652539749502e-06, + "loss": 0.6605, + "step": 3056 + }, + { + "epoch": 0.846814404432133, + "grad_norm": 0.6349977850914001, + "learning_rate": 4.771500414092253e-06, + "loss": 0.6158, + "step": 3057 + }, + { + "epoch": 0.8470914127423823, + "grad_norm": 0.6788435578346252, + "learning_rate": 4.7713482402052505e-06, + "loss": 0.6811, + "step": 3058 + }, + { + "epoch": 0.8473684210526315, + "grad_norm": 0.7373189330101013, + "learning_rate": 4.771196018091724e-06, + "loss": 0.7169, + "step": 3059 + }, + { + "epoch": 0.8476454293628809, + "grad_norm": 0.6911075115203857, + "learning_rate": 4.7710437477549055e-06, + "loss": 0.6328, + "step": 3060 + }, + { + "epoch": 0.8479224376731302, + "grad_norm": 0.7234703898429871, + "learning_rate": 4.770891429198029e-06, + "loss": 0.6492, + "step": 3061 + }, + { + "epoch": 0.8481994459833795, + "grad_norm": 0.7154641151428223, + "learning_rate": 4.770739062424328e-06, + "loss": 0.6541, + "step": 3062 + }, + { + "epoch": 0.8484764542936288, + "grad_norm": 0.7392740845680237, + "learning_rate": 4.7705866474370386e-06, + "loss": 0.6636, + "step": 3063 + }, + { + "epoch": 0.8487534626038781, + "grad_norm": 0.6674519181251526, + "learning_rate": 4.770434184239396e-06, + "loss": 0.6505, + "step": 3064 + }, + { + "epoch": 0.8490304709141274, + "grad_norm": 0.703151285648346, + "learning_rate": 4.770281672834638e-06, + "loss": 0.6472, + "step": 3065 + }, + { + "epoch": 0.8493074792243768, + "grad_norm": 0.6965990662574768, + "learning_rate": 4.770129113226001e-06, + "loss": 0.6578, + "step": 3066 + }, + { + "epoch": 0.849584487534626, + "grad_norm": 0.6909862756729126, + "learning_rate": 4.7699765054167265e-06, + "loss": 0.6242, + "step": 3067 + }, + { + "epoch": 0.8498614958448754, + "grad_norm": 0.6947312355041504, + "learning_rate": 4.769823849410054e-06, + "loss": 0.6758, + "step": 3068 + }, + { + "epoch": 0.8501385041551247, + "grad_norm": 0.6394481062889099, + "learning_rate": 4.769671145209225e-06, + "loss": 0.66, + "step": 3069 + }, + { + "epoch": 0.850415512465374, + "grad_norm": 0.6775333881378174, + "learning_rate": 4.7695183928174804e-06, + "loss": 0.7078, + "step": 3070 + }, + { + "epoch": 0.8506925207756233, + "grad_norm": 0.7309216260910034, + "learning_rate": 4.7693655922380655e-06, + "loss": 0.6954, + "step": 3071 + }, + { + "epoch": 0.8509695290858725, + "grad_norm": 0.7183328866958618, + "learning_rate": 4.7692127434742234e-06, + "loss": 0.6424, + "step": 3072 + }, + { + "epoch": 0.8512465373961219, + "grad_norm": 0.6703770160675049, + "learning_rate": 4.769059846529201e-06, + "loss": 0.6586, + "step": 3073 + }, + { + "epoch": 0.8515235457063712, + "grad_norm": 0.7268140912055969, + "learning_rate": 4.768906901406242e-06, + "loss": 0.6998, + "step": 3074 + }, + { + "epoch": 0.8518005540166205, + "grad_norm": 0.7222980856895447, + "learning_rate": 4.768753908108596e-06, + "loss": 0.7278, + "step": 3075 + }, + { + "epoch": 0.8520775623268698, + "grad_norm": 0.6823320984840393, + "learning_rate": 4.76860086663951e-06, + "loss": 0.638, + "step": 3076 + }, + { + "epoch": 0.8523545706371192, + "grad_norm": 0.6753017902374268, + "learning_rate": 4.768447777002234e-06, + "loss": 0.6778, + "step": 3077 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 0.6737832427024841, + "learning_rate": 4.76829463920002e-06, + "loss": 0.6892, + "step": 3078 + }, + { + "epoch": 0.8529085872576178, + "grad_norm": 0.6660850644111633, + "learning_rate": 4.768141453236118e-06, + "loss": 0.6461, + "step": 3079 + }, + { + "epoch": 0.853185595567867, + "grad_norm": 0.6556453704833984, + "learning_rate": 4.767988219113781e-06, + "loss": 0.6457, + "step": 3080 + }, + { + "epoch": 0.8534626038781163, + "grad_norm": 0.6840183734893799, + "learning_rate": 4.767834936836261e-06, + "loss": 0.6799, + "step": 3081 + }, + { + "epoch": 0.8537396121883657, + "grad_norm": 0.6308120489120483, + "learning_rate": 4.767681606406814e-06, + "loss": 0.617, + "step": 3082 + }, + { + "epoch": 0.8540166204986149, + "grad_norm": 0.676609218120575, + "learning_rate": 4.767528227828697e-06, + "loss": 0.6684, + "step": 3083 + }, + { + "epoch": 0.8542936288088643, + "grad_norm": 0.6895732283592224, + "learning_rate": 4.767374801105163e-06, + "loss": 0.68, + "step": 3084 + }, + { + "epoch": 0.8545706371191135, + "grad_norm": 0.6707661747932434, + "learning_rate": 4.767221326239474e-06, + "loss": 0.667, + "step": 3085 + }, + { + "epoch": 0.8548476454293629, + "grad_norm": 0.6718268990516663, + "learning_rate": 4.767067803234885e-06, + "loss": 0.6455, + "step": 3086 + }, + { + "epoch": 0.8551246537396122, + "grad_norm": 0.7019315361976624, + "learning_rate": 4.766914232094657e-06, + "loss": 0.6911, + "step": 3087 + }, + { + "epoch": 0.8554016620498615, + "grad_norm": 0.685129702091217, + "learning_rate": 4.766760612822051e-06, + "loss": 0.7099, + "step": 3088 + }, + { + "epoch": 0.8556786703601108, + "grad_norm": 0.6947982311248779, + "learning_rate": 4.766606945420329e-06, + "loss": 0.7263, + "step": 3089 + }, + { + "epoch": 0.8559556786703602, + "grad_norm": 0.6423403024673462, + "learning_rate": 4.766453229892753e-06, + "loss": 0.6686, + "step": 3090 + }, + { + "epoch": 0.8562326869806094, + "grad_norm": 0.6994284987449646, + "learning_rate": 4.766299466242587e-06, + "loss": 0.659, + "step": 3091 + }, + { + "epoch": 0.8565096952908587, + "grad_norm": 0.6681022047996521, + "learning_rate": 4.766145654473096e-06, + "loss": 0.6579, + "step": 3092 + }, + { + "epoch": 0.856786703601108, + "grad_norm": 0.6877515912055969, + "learning_rate": 4.7659917945875455e-06, + "loss": 0.6182, + "step": 3093 + }, + { + "epoch": 0.8570637119113573, + "grad_norm": 0.668996274471283, + "learning_rate": 4.765837886589202e-06, + "loss": 0.6618, + "step": 3094 + }, + { + "epoch": 0.8573407202216067, + "grad_norm": 0.6529152393341064, + "learning_rate": 4.765683930481335e-06, + "loss": 0.688, + "step": 3095 + }, + { + "epoch": 0.8576177285318559, + "grad_norm": 0.7194386124610901, + "learning_rate": 4.765529926267211e-06, + "loss": 0.645, + "step": 3096 + }, + { + "epoch": 0.8578947368421053, + "grad_norm": 0.6833193302154541, + "learning_rate": 4.765375873950102e-06, + "loss": 0.6941, + "step": 3097 + }, + { + "epoch": 0.8581717451523546, + "grad_norm": 0.6345197558403015, + "learning_rate": 4.765221773533277e-06, + "loss": 0.6294, + "step": 3098 + }, + { + "epoch": 0.8584487534626039, + "grad_norm": 0.7031524777412415, + "learning_rate": 4.76506762502001e-06, + "loss": 0.6644, + "step": 3099 + }, + { + "epoch": 0.8587257617728532, + "grad_norm": 0.646828293800354, + "learning_rate": 4.764913428413572e-06, + "loss": 0.6695, + "step": 3100 + }, + { + "epoch": 0.8590027700831024, + "grad_norm": 0.6990915536880493, + "learning_rate": 4.764759183717239e-06, + "loss": 0.6898, + "step": 3101 + }, + { + "epoch": 0.8592797783933518, + "grad_norm": 0.7449160814285278, + "learning_rate": 4.764604890934285e-06, + "loss": 0.6352, + "step": 3102 + }, + { + "epoch": 0.8595567867036011, + "grad_norm": 0.6356722116470337, + "learning_rate": 4.764450550067986e-06, + "loss": 0.6182, + "step": 3103 + }, + { + "epoch": 0.8598337950138504, + "grad_norm": 0.6818626523017883, + "learning_rate": 4.764296161121618e-06, + "loss": 0.6556, + "step": 3104 + }, + { + "epoch": 0.8601108033240997, + "grad_norm": 0.7065897583961487, + "learning_rate": 4.764141724098461e-06, + "loss": 0.6528, + "step": 3105 + }, + { + "epoch": 0.8603878116343491, + "grad_norm": 0.7592453956604004, + "learning_rate": 4.763987239001793e-06, + "loss": 0.657, + "step": 3106 + }, + { + "epoch": 0.8606648199445983, + "grad_norm": 0.643108069896698, + "learning_rate": 4.7638327058348945e-06, + "loss": 0.682, + "step": 3107 + }, + { + "epoch": 0.8609418282548477, + "grad_norm": 0.6628758907318115, + "learning_rate": 4.763678124601046e-06, + "loss": 0.6405, + "step": 3108 + }, + { + "epoch": 0.8612188365650969, + "grad_norm": 0.7099124789237976, + "learning_rate": 4.7635234953035304e-06, + "loss": 0.6575, + "step": 3109 + }, + { + "epoch": 0.8614958448753463, + "grad_norm": 0.7039968371391296, + "learning_rate": 4.763368817945631e-06, + "loss": 0.7117, + "step": 3110 + }, + { + "epoch": 0.8617728531855956, + "grad_norm": 0.6697438955307007, + "learning_rate": 4.76321409253063e-06, + "loss": 0.6456, + "step": 3111 + }, + { + "epoch": 0.8620498614958448, + "grad_norm": 0.6728848814964294, + "learning_rate": 4.763059319061816e-06, + "loss": 0.5569, + "step": 3112 + }, + { + "epoch": 0.8623268698060942, + "grad_norm": 0.6149548888206482, + "learning_rate": 4.762904497542472e-06, + "loss": 0.6469, + "step": 3113 + }, + { + "epoch": 0.8626038781163435, + "grad_norm": 0.6806933283805847, + "learning_rate": 4.762749627975888e-06, + "loss": 0.7133, + "step": 3114 + }, + { + "epoch": 0.8628808864265928, + "grad_norm": 0.6651535034179688, + "learning_rate": 4.76259471036535e-06, + "loss": 0.6436, + "step": 3115 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 0.6518577337265015, + "learning_rate": 4.762439744714149e-06, + "loss": 0.661, + "step": 3116 + }, + { + "epoch": 0.8634349030470914, + "grad_norm": 0.641085684299469, + "learning_rate": 4.762284731025574e-06, + "loss": 0.6133, + "step": 3117 + }, + { + "epoch": 0.8637119113573407, + "grad_norm": 0.6976808309555054, + "learning_rate": 4.762129669302917e-06, + "loss": 0.6968, + "step": 3118 + }, + { + "epoch": 0.8639889196675901, + "grad_norm": 0.6310849785804749, + "learning_rate": 4.761974559549471e-06, + "loss": 0.6388, + "step": 3119 + }, + { + "epoch": 0.8642659279778393, + "grad_norm": 0.6805459260940552, + "learning_rate": 4.7618194017685285e-06, + "loss": 0.713, + "step": 3120 + }, + { + "epoch": 0.8645429362880886, + "grad_norm": 0.6955195069313049, + "learning_rate": 4.761664195963383e-06, + "loss": 0.6308, + "step": 3121 + }, + { + "epoch": 0.864819944598338, + "grad_norm": 0.6788739562034607, + "learning_rate": 4.761508942137332e-06, + "loss": 0.6916, + "step": 3122 + }, + { + "epoch": 0.8650969529085872, + "grad_norm": 0.7025331854820251, + "learning_rate": 4.76135364029367e-06, + "loss": 0.679, + "step": 3123 + }, + { + "epoch": 0.8653739612188366, + "grad_norm": 0.6896618604660034, + "learning_rate": 4.761198290435696e-06, + "loss": 0.6763, + "step": 3124 + }, + { + "epoch": 0.8656509695290858, + "grad_norm": 0.6727938652038574, + "learning_rate": 4.761042892566707e-06, + "loss": 0.6373, + "step": 3125 + }, + { + "epoch": 0.8659279778393352, + "grad_norm": 0.6701960563659668, + "learning_rate": 4.760887446690005e-06, + "loss": 0.7016, + "step": 3126 + }, + { + "epoch": 0.8662049861495845, + "grad_norm": 0.6560091376304626, + "learning_rate": 4.7607319528088875e-06, + "loss": 0.6484, + "step": 3127 + }, + { + "epoch": 0.8664819944598338, + "grad_norm": 0.6654905080795288, + "learning_rate": 4.760576410926657e-06, + "loss": 0.6749, + "step": 3128 + }, + { + "epoch": 0.8667590027700831, + "grad_norm": 0.6756736040115356, + "learning_rate": 4.7604208210466175e-06, + "loss": 0.6182, + "step": 3129 + }, + { + "epoch": 0.8670360110803325, + "grad_norm": 0.6454859375953674, + "learning_rate": 4.760265183172072e-06, + "loss": 0.7207, + "step": 3130 + }, + { + "epoch": 0.8673130193905817, + "grad_norm": 0.6561083197593689, + "learning_rate": 4.760109497306323e-06, + "loss": 0.6288, + "step": 3131 + }, + { + "epoch": 0.867590027700831, + "grad_norm": 0.6179625391960144, + "learning_rate": 4.759953763452678e-06, + "loss": 0.6647, + "step": 3132 + }, + { + "epoch": 0.8678670360110803, + "grad_norm": 0.7050300240516663, + "learning_rate": 4.7597979816144435e-06, + "loss": 0.6844, + "step": 3133 + }, + { + "epoch": 0.8681440443213296, + "grad_norm": 0.6492171883583069, + "learning_rate": 4.759642151794928e-06, + "loss": 0.6574, + "step": 3134 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.6972433924674988, + "learning_rate": 4.759486273997438e-06, + "loss": 0.6451, + "step": 3135 + }, + { + "epoch": 0.8686980609418282, + "grad_norm": 0.7063480615615845, + "learning_rate": 4.7593303482252835e-06, + "loss": 0.667, + "step": 3136 + }, + { + "epoch": 0.8689750692520776, + "grad_norm": 0.6627654433250427, + "learning_rate": 4.759174374481778e-06, + "loss": 0.6138, + "step": 3137 + }, + { + "epoch": 0.8692520775623269, + "grad_norm": 0.6340380311012268, + "learning_rate": 4.759018352770229e-06, + "loss": 0.6416, + "step": 3138 + }, + { + "epoch": 0.8695290858725762, + "grad_norm": 0.7147533297538757, + "learning_rate": 4.758862283093954e-06, + "loss": 0.6789, + "step": 3139 + }, + { + "epoch": 0.8698060941828255, + "grad_norm": 0.6682170629501343, + "learning_rate": 4.758706165456262e-06, + "loss": 0.661, + "step": 3140 + }, + { + "epoch": 0.8700831024930747, + "grad_norm": 0.6603893637657166, + "learning_rate": 4.758549999860471e-06, + "loss": 0.6074, + "step": 3141 + }, + { + "epoch": 0.8703601108033241, + "grad_norm": 0.6531762480735779, + "learning_rate": 4.7583937863098965e-06, + "loss": 0.6717, + "step": 3142 + }, + { + "epoch": 0.8706371191135734, + "grad_norm": 0.7047556638717651, + "learning_rate": 4.758237524807853e-06, + "loss": 0.7041, + "step": 3143 + }, + { + "epoch": 0.8709141274238227, + "grad_norm": 0.6751700043678284, + "learning_rate": 4.758081215357661e-06, + "loss": 0.6753, + "step": 3144 + }, + { + "epoch": 0.871191135734072, + "grad_norm": 0.6937599778175354, + "learning_rate": 4.757924857962638e-06, + "loss": 0.697, + "step": 3145 + }, + { + "epoch": 0.8714681440443214, + "grad_norm": 0.6919699907302856, + "learning_rate": 4.757768452626104e-06, + "loss": 0.6793, + "step": 3146 + }, + { + "epoch": 0.8717451523545706, + "grad_norm": 0.625059962272644, + "learning_rate": 4.757611999351381e-06, + "loss": 0.6872, + "step": 3147 + }, + { + "epoch": 0.87202216066482, + "grad_norm": 0.660816490650177, + "learning_rate": 4.757455498141789e-06, + "loss": 0.6999, + "step": 3148 + }, + { + "epoch": 0.8722991689750692, + "grad_norm": 0.6542554497718811, + "learning_rate": 4.757298949000652e-06, + "loss": 0.6592, + "step": 3149 + }, + { + "epoch": 0.8725761772853186, + "grad_norm": 0.6849338412284851, + "learning_rate": 4.757142351931293e-06, + "loss": 0.6313, + "step": 3150 + }, + { + "epoch": 0.8728531855955679, + "grad_norm": 0.685217559337616, + "learning_rate": 4.7569857069370395e-06, + "loss": 0.6299, + "step": 3151 + }, + { + "epoch": 0.8731301939058171, + "grad_norm": 0.6493719220161438, + "learning_rate": 4.756829014021215e-06, + "loss": 0.6399, + "step": 3152 + }, + { + "epoch": 0.8734072022160665, + "grad_norm": 0.6607489585876465, + "learning_rate": 4.756672273187146e-06, + "loss": 0.6659, + "step": 3153 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 0.6733990907669067, + "learning_rate": 4.756515484438163e-06, + "loss": 0.6533, + "step": 3154 + }, + { + "epoch": 0.8739612188365651, + "grad_norm": 0.6592486500740051, + "learning_rate": 4.7563586477775935e-06, + "loss": 0.6512, + "step": 3155 + }, + { + "epoch": 0.8742382271468144, + "grad_norm": 0.693362832069397, + "learning_rate": 4.756201763208768e-06, + "loss": 0.6779, + "step": 3156 + }, + { + "epoch": 0.8745152354570637, + "grad_norm": 0.6807980537414551, + "learning_rate": 4.756044830735017e-06, + "loss": 0.6771, + "step": 3157 + }, + { + "epoch": 0.874792243767313, + "grad_norm": 0.6866021156311035, + "learning_rate": 4.755887850359673e-06, + "loss": 0.6586, + "step": 3158 + }, + { + "epoch": 0.8750692520775624, + "grad_norm": 0.6628581881523132, + "learning_rate": 4.75573082208607e-06, + "loss": 0.6858, + "step": 3159 + }, + { + "epoch": 0.8753462603878116, + "grad_norm": 0.7030439972877502, + "learning_rate": 4.75557374591754e-06, + "loss": 0.6817, + "step": 3160 + }, + { + "epoch": 0.875623268698061, + "grad_norm": 0.6881664991378784, + "learning_rate": 4.7554166218574196e-06, + "loss": 0.6963, + "step": 3161 + }, + { + "epoch": 0.8759002770083103, + "grad_norm": 0.6781759262084961, + "learning_rate": 4.755259449909044e-06, + "loss": 0.6433, + "step": 3162 + }, + { + "epoch": 0.8761772853185595, + "grad_norm": 0.7240464687347412, + "learning_rate": 4.755102230075752e-06, + "loss": 0.6605, + "step": 3163 + }, + { + "epoch": 0.8764542936288089, + "grad_norm": 0.6497237086296082, + "learning_rate": 4.75494496236088e-06, + "loss": 0.6626, + "step": 3164 + }, + { + "epoch": 0.8767313019390581, + "grad_norm": 0.6879202723503113, + "learning_rate": 4.7547876467677685e-06, + "loss": 0.7042, + "step": 3165 + }, + { + "epoch": 0.8770083102493075, + "grad_norm": 0.6922905445098877, + "learning_rate": 4.754630283299757e-06, + "loss": 0.6756, + "step": 3166 + }, + { + "epoch": 0.8772853185595568, + "grad_norm": 0.6731038689613342, + "learning_rate": 4.754472871960186e-06, + "loss": 0.6272, + "step": 3167 + }, + { + "epoch": 0.8775623268698061, + "grad_norm": 0.6296544075012207, + "learning_rate": 4.7543154127524e-06, + "loss": 0.6471, + "step": 3168 + }, + { + "epoch": 0.8778393351800554, + "grad_norm": 0.7404965758323669, + "learning_rate": 4.754157905679741e-06, + "loss": 0.7242, + "step": 3169 + }, + { + "epoch": 0.8781163434903048, + "grad_norm": 0.6243695616722107, + "learning_rate": 4.754000350745551e-06, + "loss": 0.636, + "step": 3170 + }, + { + "epoch": 0.878393351800554, + "grad_norm": 0.6488210558891296, + "learning_rate": 4.753842747953179e-06, + "loss": 0.6139, + "step": 3171 + }, + { + "epoch": 0.8786703601108034, + "grad_norm": 0.6636707186698914, + "learning_rate": 4.75368509730597e-06, + "loss": 0.6445, + "step": 3172 + }, + { + "epoch": 0.8789473684210526, + "grad_norm": 0.6736623644828796, + "learning_rate": 4.7535273988072715e-06, + "loss": 0.6883, + "step": 3173 + }, + { + "epoch": 0.8792243767313019, + "grad_norm": 0.7248889803886414, + "learning_rate": 4.75336965246043e-06, + "loss": 0.6976, + "step": 3174 + }, + { + "epoch": 0.8795013850415513, + "grad_norm": 0.7072076201438904, + "learning_rate": 4.753211858268797e-06, + "loss": 0.6583, + "step": 3175 + }, + { + "epoch": 0.8797783933518005, + "grad_norm": 0.6631415486335754, + "learning_rate": 4.753054016235723e-06, + "loss": 0.6573, + "step": 3176 + }, + { + "epoch": 0.8800554016620499, + "grad_norm": 0.7018452286720276, + "learning_rate": 4.7528961263645575e-06, + "loss": 0.6408, + "step": 3177 + }, + { + "epoch": 0.8803324099722992, + "grad_norm": 0.6875770092010498, + "learning_rate": 4.752738188658654e-06, + "loss": 0.6701, + "step": 3178 + }, + { + "epoch": 0.8806094182825485, + "grad_norm": 0.6721932291984558, + "learning_rate": 4.752580203121367e-06, + "loss": 0.6711, + "step": 3179 + }, + { + "epoch": 0.8808864265927978, + "grad_norm": 0.7087852358818054, + "learning_rate": 4.752422169756048e-06, + "loss": 0.6526, + "step": 3180 + }, + { + "epoch": 0.881163434903047, + "grad_norm": 0.7229830026626587, + "learning_rate": 4.7522640885660556e-06, + "loss": 0.6688, + "step": 3181 + }, + { + "epoch": 0.8814404432132964, + "grad_norm": 0.6967926621437073, + "learning_rate": 4.752105959554745e-06, + "loss": 0.6511, + "step": 3182 + }, + { + "epoch": 0.8817174515235457, + "grad_norm": 0.7231464385986328, + "learning_rate": 4.751947782725472e-06, + "loss": 0.6702, + "step": 3183 + }, + { + "epoch": 0.881994459833795, + "grad_norm": 0.6495847105979919, + "learning_rate": 4.7517895580815985e-06, + "loss": 0.6626, + "step": 3184 + }, + { + "epoch": 0.8822714681440443, + "grad_norm": 0.6687087416648865, + "learning_rate": 4.751631285626482e-06, + "loss": 0.6823, + "step": 3185 + }, + { + "epoch": 0.8825484764542936, + "grad_norm": 0.6756120920181274, + "learning_rate": 4.751472965363482e-06, + "loss": 0.7317, + "step": 3186 + }, + { + "epoch": 0.8828254847645429, + "grad_norm": 0.6763913631439209, + "learning_rate": 4.751314597295963e-06, + "loss": 0.6534, + "step": 3187 + }, + { + "epoch": 0.8831024930747923, + "grad_norm": 0.6849490404129028, + "learning_rate": 4.751156181427285e-06, + "loss": 0.6579, + "step": 3188 + }, + { + "epoch": 0.8833795013850415, + "grad_norm": 0.6868329644203186, + "learning_rate": 4.750997717760813e-06, + "loss": 0.6889, + "step": 3189 + }, + { + "epoch": 0.8836565096952909, + "grad_norm": 0.6511489748954773, + "learning_rate": 4.75083920629991e-06, + "loss": 0.7108, + "step": 3190 + }, + { + "epoch": 0.8839335180055402, + "grad_norm": 0.6739808917045593, + "learning_rate": 4.750680647047944e-06, + "loss": 0.6885, + "step": 3191 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 0.6702025532722473, + "learning_rate": 4.750522040008279e-06, + "loss": 0.6607, + "step": 3192 + }, + { + "epoch": 0.8844875346260388, + "grad_norm": 0.6719469428062439, + "learning_rate": 4.750363385184285e-06, + "loss": 0.6732, + "step": 3193 + }, + { + "epoch": 0.884764542936288, + "grad_norm": 0.7011950612068176, + "learning_rate": 4.750204682579328e-06, + "loss": 0.6364, + "step": 3194 + }, + { + "epoch": 0.8850415512465374, + "grad_norm": 0.6801823973655701, + "learning_rate": 4.750045932196781e-06, + "loss": 0.6367, + "step": 3195 + }, + { + "epoch": 0.8853185595567867, + "grad_norm": 0.718298614025116, + "learning_rate": 4.749887134040011e-06, + "loss": 0.7011, + "step": 3196 + }, + { + "epoch": 0.885595567867036, + "grad_norm": 0.7182875275611877, + "learning_rate": 4.7497282881123925e-06, + "loss": 0.6881, + "step": 3197 + }, + { + "epoch": 0.8858725761772853, + "grad_norm": 0.6788876056671143, + "learning_rate": 4.7495693944172975e-06, + "loss": 0.7216, + "step": 3198 + }, + { + "epoch": 0.8861495844875347, + "grad_norm": 0.6881414651870728, + "learning_rate": 4.749410452958099e-06, + "loss": 0.6589, + "step": 3199 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 0.639951229095459, + "learning_rate": 4.7492514637381725e-06, + "loss": 0.6664, + "step": 3200 + }, + { + "epoch": 0.8867036011080333, + "grad_norm": 0.6903998851776123, + "learning_rate": 4.749092426760894e-06, + "loss": 0.6704, + "step": 3201 + }, + { + "epoch": 0.8869806094182825, + "grad_norm": 0.6713690161705017, + "learning_rate": 4.748933342029639e-06, + "loss": 0.6829, + "step": 3202 + }, + { + "epoch": 0.8872576177285318, + "grad_norm": 0.7007774114608765, + "learning_rate": 4.7487742095477865e-06, + "loss": 0.6929, + "step": 3203 + }, + { + "epoch": 0.8875346260387812, + "grad_norm": 0.6859935522079468, + "learning_rate": 4.748615029318714e-06, + "loss": 0.6402, + "step": 3204 + }, + { + "epoch": 0.8878116343490304, + "grad_norm": 0.7014520168304443, + "learning_rate": 4.7484558013458025e-06, + "loss": 0.6759, + "step": 3205 + }, + { + "epoch": 0.8880886426592798, + "grad_norm": 0.651506781578064, + "learning_rate": 4.748296525632432e-06, + "loss": 0.6491, + "step": 3206 + }, + { + "epoch": 0.8883656509695291, + "grad_norm": 0.6785458326339722, + "learning_rate": 4.748137202181985e-06, + "loss": 0.6821, + "step": 3207 + }, + { + "epoch": 0.8886426592797784, + "grad_norm": 0.6694476008415222, + "learning_rate": 4.747977830997845e-06, + "loss": 0.6845, + "step": 3208 + }, + { + "epoch": 0.8889196675900277, + "grad_norm": 0.7240982055664062, + "learning_rate": 4.747818412083395e-06, + "loss": 0.6531, + "step": 3209 + }, + { + "epoch": 0.889196675900277, + "grad_norm": 0.6883110404014587, + "learning_rate": 4.747658945442018e-06, + "loss": 0.6696, + "step": 3210 + }, + { + "epoch": 0.8894736842105263, + "grad_norm": 0.6668145656585693, + "learning_rate": 4.747499431077103e-06, + "loss": 0.6814, + "step": 3211 + }, + { + "epoch": 0.8897506925207757, + "grad_norm": 0.6918444037437439, + "learning_rate": 4.7473398689920344e-06, + "loss": 0.686, + "step": 3212 + }, + { + "epoch": 0.8900277008310249, + "grad_norm": 0.6626413464546204, + "learning_rate": 4.747180259190203e-06, + "loss": 0.6781, + "step": 3213 + }, + { + "epoch": 0.8903047091412742, + "grad_norm": 0.6800508499145508, + "learning_rate": 4.747020601674996e-06, + "loss": 0.6627, + "step": 3214 + }, + { + "epoch": 0.8905817174515236, + "grad_norm": 0.638883113861084, + "learning_rate": 4.746860896449802e-06, + "loss": 0.6777, + "step": 3215 + }, + { + "epoch": 0.8908587257617728, + "grad_norm": 0.6268462538719177, + "learning_rate": 4.746701143518013e-06, + "loss": 0.567, + "step": 3216 + }, + { + "epoch": 0.8911357340720222, + "grad_norm": 0.6455804109573364, + "learning_rate": 4.746541342883023e-06, + "loss": 0.6246, + "step": 3217 + }, + { + "epoch": 0.8914127423822714, + "grad_norm": 0.667750358581543, + "learning_rate": 4.746381494548223e-06, + "loss": 0.6854, + "step": 3218 + }, + { + "epoch": 0.8916897506925208, + "grad_norm": 0.7158799171447754, + "learning_rate": 4.746221598517005e-06, + "loss": 0.6546, + "step": 3219 + }, + { + "epoch": 0.8919667590027701, + "grad_norm": 0.687025785446167, + "learning_rate": 4.746061654792769e-06, + "loss": 0.6289, + "step": 3220 + }, + { + "epoch": 0.8922437673130194, + "grad_norm": 0.7073338627815247, + "learning_rate": 4.745901663378907e-06, + "loss": 0.7161, + "step": 3221 + }, + { + "epoch": 0.8925207756232687, + "grad_norm": 0.6431207060813904, + "learning_rate": 4.745741624278817e-06, + "loss": 0.6723, + "step": 3222 + }, + { + "epoch": 0.892797783933518, + "grad_norm": 0.658841073513031, + "learning_rate": 4.745581537495898e-06, + "loss": 0.6587, + "step": 3223 + }, + { + "epoch": 0.8930747922437673, + "grad_norm": 0.6782752275466919, + "learning_rate": 4.745421403033548e-06, + "loss": 0.6692, + "step": 3224 + }, + { + "epoch": 0.8933518005540166, + "grad_norm": 0.6738010048866272, + "learning_rate": 4.745261220895168e-06, + "loss": 0.6891, + "step": 3225 + }, + { + "epoch": 0.8936288088642659, + "grad_norm": 0.6247903108596802, + "learning_rate": 4.745100991084157e-06, + "loss": 0.6206, + "step": 3226 + }, + { + "epoch": 0.8939058171745152, + "grad_norm": 0.6673998236656189, + "learning_rate": 4.744940713603921e-06, + "loss": 0.6054, + "step": 3227 + }, + { + "epoch": 0.8941828254847646, + "grad_norm": 0.7002072930335999, + "learning_rate": 4.744780388457859e-06, + "loss": 0.6975, + "step": 3228 + }, + { + "epoch": 0.8944598337950138, + "grad_norm": 0.6853911876678467, + "learning_rate": 4.7446200156493775e-06, + "loss": 0.6347, + "step": 3229 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.6441922187805176, + "learning_rate": 4.744459595181879e-06, + "loss": 0.7001, + "step": 3230 + }, + { + "epoch": 0.8950138504155125, + "grad_norm": 0.6561487317085266, + "learning_rate": 4.744299127058774e-06, + "loss": 0.6439, + "step": 3231 + }, + { + "epoch": 0.8952908587257618, + "grad_norm": 0.6780251860618591, + "learning_rate": 4.744138611283467e-06, + "loss": 0.7057, + "step": 3232 + }, + { + "epoch": 0.8955678670360111, + "grad_norm": 0.6251480579376221, + "learning_rate": 4.743978047859365e-06, + "loss": 0.6118, + "step": 3233 + }, + { + "epoch": 0.8958448753462603, + "grad_norm": 0.7261717915534973, + "learning_rate": 4.743817436789879e-06, + "loss": 0.7222, + "step": 3234 + }, + { + "epoch": 0.8961218836565097, + "grad_norm": 0.6909134387969971, + "learning_rate": 4.74365677807842e-06, + "loss": 0.6874, + "step": 3235 + }, + { + "epoch": 0.896398891966759, + "grad_norm": 0.621599018573761, + "learning_rate": 4.743496071728396e-06, + "loss": 0.632, + "step": 3236 + }, + { + "epoch": 0.8966759002770083, + "grad_norm": 0.6764978766441345, + "learning_rate": 4.743335317743223e-06, + "loss": 0.6776, + "step": 3237 + }, + { + "epoch": 0.8969529085872576, + "grad_norm": 0.6836737990379333, + "learning_rate": 4.743174516126311e-06, + "loss": 0.7023, + "step": 3238 + }, + { + "epoch": 0.897229916897507, + "grad_norm": 0.6584877967834473, + "learning_rate": 4.7430136668810764e-06, + "loss": 0.6377, + "step": 3239 + }, + { + "epoch": 0.8975069252077562, + "grad_norm": 0.6757174134254456, + "learning_rate": 4.742852770010933e-06, + "loss": 0.6419, + "step": 3240 + }, + { + "epoch": 0.8977839335180056, + "grad_norm": 0.6961698532104492, + "learning_rate": 4.742691825519298e-06, + "loss": 0.676, + "step": 3241 + }, + { + "epoch": 0.8980609418282548, + "grad_norm": 0.6550590991973877, + "learning_rate": 4.742530833409587e-06, + "loss": 0.6471, + "step": 3242 + }, + { + "epoch": 0.8983379501385041, + "grad_norm": 0.7002878189086914, + "learning_rate": 4.742369793685222e-06, + "loss": 0.7044, + "step": 3243 + }, + { + "epoch": 0.8986149584487535, + "grad_norm": 0.7033624649047852, + "learning_rate": 4.742208706349619e-06, + "loss": 0.664, + "step": 3244 + }, + { + "epoch": 0.8988919667590027, + "grad_norm": 0.6921766996383667, + "learning_rate": 4.742047571406198e-06, + "loss": 0.6838, + "step": 3245 + }, + { + "epoch": 0.8991689750692521, + "grad_norm": 0.6530360579490662, + "learning_rate": 4.741886388858384e-06, + "loss": 0.6551, + "step": 3246 + }, + { + "epoch": 0.8994459833795014, + "grad_norm": 0.6938414573669434, + "learning_rate": 4.741725158709594e-06, + "loss": 0.6538, + "step": 3247 + }, + { + "epoch": 0.8997229916897507, + "grad_norm": 0.6908465623855591, + "learning_rate": 4.741563880963256e-06, + "loss": 0.684, + "step": 3248 + }, + { + "epoch": 0.9, + "grad_norm": 0.6564576625823975, + "learning_rate": 4.741402555622791e-06, + "loss": 0.7037, + "step": 3249 + }, + { + "epoch": 0.9002770083102493, + "grad_norm": 0.7001242637634277, + "learning_rate": 4.741241182691627e-06, + "loss": 0.696, + "step": 3250 + }, + { + "epoch": 0.9005540166204986, + "grad_norm": 0.6578646898269653, + "learning_rate": 4.741079762173189e-06, + "loss": 0.6147, + "step": 3251 + }, + { + "epoch": 0.900831024930748, + "grad_norm": 0.5905128717422485, + "learning_rate": 4.7409182940709046e-06, + "loss": 0.6315, + "step": 3252 + }, + { + "epoch": 0.9011080332409972, + "grad_norm": 0.6931217908859253, + "learning_rate": 4.740756778388201e-06, + "loss": 0.6798, + "step": 3253 + }, + { + "epoch": 0.9013850415512465, + "grad_norm": 0.6865070462226868, + "learning_rate": 4.74059521512851e-06, + "loss": 0.6773, + "step": 3254 + }, + { + "epoch": 0.9016620498614959, + "grad_norm": 0.703082799911499, + "learning_rate": 4.74043360429526e-06, + "loss": 0.6764, + "step": 3255 + }, + { + "epoch": 0.9019390581717451, + "grad_norm": 0.7222578525543213, + "learning_rate": 4.7402719458918845e-06, + "loss": 0.7117, + "step": 3256 + }, + { + "epoch": 0.9022160664819945, + "grad_norm": 0.6672399044036865, + "learning_rate": 4.740110239921813e-06, + "loss": 0.6806, + "step": 3257 + }, + { + "epoch": 0.9024930747922437, + "grad_norm": 0.6685750484466553, + "learning_rate": 4.7399484863884815e-06, + "loss": 0.6392, + "step": 3258 + }, + { + "epoch": 0.9027700831024931, + "grad_norm": 0.6670814156532288, + "learning_rate": 4.739786685295323e-06, + "loss": 0.6684, + "step": 3259 + }, + { + "epoch": 0.9030470914127424, + "grad_norm": 0.6570850610733032, + "learning_rate": 4.739624836645773e-06, + "loss": 0.6547, + "step": 3260 + }, + { + "epoch": 0.9033240997229917, + "grad_norm": 0.7073267102241516, + "learning_rate": 4.739462940443269e-06, + "loss": 0.6942, + "step": 3261 + }, + { + "epoch": 0.903601108033241, + "grad_norm": 0.702384889125824, + "learning_rate": 4.739300996691247e-06, + "loss": 0.6594, + "step": 3262 + }, + { + "epoch": 0.9038781163434904, + "grad_norm": 0.6831039190292358, + "learning_rate": 4.739139005393148e-06, + "loss": 0.6795, + "step": 3263 + }, + { + "epoch": 0.9041551246537396, + "grad_norm": 0.7129132151603699, + "learning_rate": 4.738976966552407e-06, + "loss": 0.6783, + "step": 3264 + }, + { + "epoch": 0.9044321329639889, + "grad_norm": 0.706730842590332, + "learning_rate": 4.7388148801724695e-06, + "loss": 0.7052, + "step": 3265 + }, + { + "epoch": 0.9047091412742382, + "grad_norm": 0.6315455436706543, + "learning_rate": 4.738652746256774e-06, + "loss": 0.6489, + "step": 3266 + }, + { + "epoch": 0.9049861495844875, + "grad_norm": 0.6674370169639587, + "learning_rate": 4.7384905648087645e-06, + "loss": 0.6139, + "step": 3267 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 0.654758095741272, + "learning_rate": 4.738328335831883e-06, + "loss": 0.6819, + "step": 3268 + }, + { + "epoch": 0.9055401662049861, + "grad_norm": 0.6712855100631714, + "learning_rate": 4.738166059329575e-06, + "loss": 0.6338, + "step": 3269 + }, + { + "epoch": 0.9058171745152355, + "grad_norm": 0.6991258263587952, + "learning_rate": 4.738003735305287e-06, + "loss": 0.7033, + "step": 3270 + }, + { + "epoch": 0.9060941828254848, + "grad_norm": 0.7066357731819153, + "learning_rate": 4.7378413637624635e-06, + "loss": 0.6532, + "step": 3271 + }, + { + "epoch": 0.9063711911357341, + "grad_norm": 0.6450583338737488, + "learning_rate": 4.737678944704552e-06, + "loss": 0.6506, + "step": 3272 + }, + { + "epoch": 0.9066481994459834, + "grad_norm": 0.6878437399864197, + "learning_rate": 4.737516478135003e-06, + "loss": 0.6555, + "step": 3273 + }, + { + "epoch": 0.9069252077562326, + "grad_norm": 0.686655580997467, + "learning_rate": 4.737353964057266e-06, + "loss": 0.6948, + "step": 3274 + }, + { + "epoch": 0.907202216066482, + "grad_norm": 0.7044336795806885, + "learning_rate": 4.73719140247479e-06, + "loss": 0.7176, + "step": 3275 + }, + { + "epoch": 0.9074792243767313, + "grad_norm": 0.6736946105957031, + "learning_rate": 4.737028793391027e-06, + "loss": 0.6026, + "step": 3276 + }, + { + "epoch": 0.9077562326869806, + "grad_norm": 0.6809951663017273, + "learning_rate": 4.736866136809431e-06, + "loss": 0.6892, + "step": 3277 + }, + { + "epoch": 0.9080332409972299, + "grad_norm": 0.6795469522476196, + "learning_rate": 4.736703432733454e-06, + "loss": 0.6348, + "step": 3278 + }, + { + "epoch": 0.9083102493074793, + "grad_norm": 0.674238383769989, + "learning_rate": 4.736540681166551e-06, + "loss": 0.6695, + "step": 3279 + }, + { + "epoch": 0.9085872576177285, + "grad_norm": 0.6498761177062988, + "learning_rate": 4.736377882112179e-06, + "loss": 0.6518, + "step": 3280 + }, + { + "epoch": 0.9088642659279779, + "grad_norm": 0.6777954697608948, + "learning_rate": 4.736215035573792e-06, + "loss": 0.6654, + "step": 3281 + }, + { + "epoch": 0.9091412742382271, + "grad_norm": 0.6966802477836609, + "learning_rate": 4.736052141554849e-06, + "loss": 0.7087, + "step": 3282 + }, + { + "epoch": 0.9094182825484765, + "grad_norm": 0.6832250356674194, + "learning_rate": 4.73588920005881e-06, + "loss": 0.6806, + "step": 3283 + }, + { + "epoch": 0.9096952908587258, + "grad_norm": 0.6498856544494629, + "learning_rate": 4.735726211089132e-06, + "loss": 0.5972, + "step": 3284 + }, + { + "epoch": 0.909972299168975, + "grad_norm": 0.6577579975128174, + "learning_rate": 4.735563174649278e-06, + "loss": 0.6761, + "step": 3285 + }, + { + "epoch": 0.9102493074792244, + "grad_norm": 0.718026876449585, + "learning_rate": 4.7354000907427095e-06, + "loss": 0.7497, + "step": 3286 + }, + { + "epoch": 0.9105263157894737, + "grad_norm": 0.6861085891723633, + "learning_rate": 4.7352369593728875e-06, + "loss": 0.7054, + "step": 3287 + }, + { + "epoch": 0.910803324099723, + "grad_norm": 0.663247287273407, + "learning_rate": 4.735073780543277e-06, + "loss": 0.695, + "step": 3288 + }, + { + "epoch": 0.9110803324099723, + "grad_norm": 0.6910631656646729, + "learning_rate": 4.734910554257342e-06, + "loss": 0.6695, + "step": 3289 + }, + { + "epoch": 0.9113573407202216, + "grad_norm": 0.6977118849754333, + "learning_rate": 4.734747280518549e-06, + "loss": 0.6954, + "step": 3290 + }, + { + "epoch": 0.9116343490304709, + "grad_norm": 0.6666392683982849, + "learning_rate": 4.734583959330364e-06, + "loss": 0.6922, + "step": 3291 + }, + { + "epoch": 0.9119113573407203, + "grad_norm": 0.6943200826644897, + "learning_rate": 4.734420590696256e-06, + "loss": 0.6756, + "step": 3292 + }, + { + "epoch": 0.9121883656509695, + "grad_norm": 0.689194917678833, + "learning_rate": 4.734257174619692e-06, + "loss": 0.6736, + "step": 3293 + }, + { + "epoch": 0.9124653739612189, + "grad_norm": 0.6545650362968445, + "learning_rate": 4.7340937111041426e-06, + "loss": 0.6403, + "step": 3294 + }, + { + "epoch": 0.9127423822714681, + "grad_norm": 0.6711616516113281, + "learning_rate": 4.733930200153078e-06, + "loss": 0.635, + "step": 3295 + }, + { + "epoch": 0.9130193905817174, + "grad_norm": 0.6374195218086243, + "learning_rate": 4.7337666417699715e-06, + "loss": 0.6438, + "step": 3296 + }, + { + "epoch": 0.9132963988919668, + "grad_norm": 0.6885934472084045, + "learning_rate": 4.733603035958293e-06, + "loss": 0.6421, + "step": 3297 + }, + { + "epoch": 0.913573407202216, + "grad_norm": 0.6831621527671814, + "learning_rate": 4.73343938272152e-06, + "loss": 0.6664, + "step": 3298 + }, + { + "epoch": 0.9138504155124654, + "grad_norm": 0.70931077003479, + "learning_rate": 4.733275682063124e-06, + "loss": 0.6999, + "step": 3299 + }, + { + "epoch": 0.9141274238227147, + "grad_norm": 0.6842132806777954, + "learning_rate": 4.733111933986583e-06, + "loss": 0.6761, + "step": 3300 + }, + { + "epoch": 0.914404432132964, + "grad_norm": 0.6851639151573181, + "learning_rate": 4.732948138495373e-06, + "loss": 0.6951, + "step": 3301 + }, + { + "epoch": 0.9146814404432133, + "grad_norm": 0.6987138986587524, + "learning_rate": 4.732784295592971e-06, + "loss": 0.6979, + "step": 3302 + }, + { + "epoch": 0.9149584487534625, + "grad_norm": 0.6682677865028381, + "learning_rate": 4.732620405282856e-06, + "loss": 0.6923, + "step": 3303 + }, + { + "epoch": 0.9152354570637119, + "grad_norm": 0.6620498895645142, + "learning_rate": 4.732456467568509e-06, + "loss": 0.6737, + "step": 3304 + }, + { + "epoch": 0.9155124653739612, + "grad_norm": 0.6930484175682068, + "learning_rate": 4.73229248245341e-06, + "loss": 0.6294, + "step": 3305 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 0.6822475790977478, + "learning_rate": 4.732128449941041e-06, + "loss": 0.6196, + "step": 3306 + }, + { + "epoch": 0.9160664819944598, + "grad_norm": 0.6686652898788452, + "learning_rate": 4.731964370034885e-06, + "loss": 0.635, + "step": 3307 + }, + { + "epoch": 0.9163434903047092, + "grad_norm": 0.6809656620025635, + "learning_rate": 4.7318002427384255e-06, + "loss": 0.6955, + "step": 3308 + }, + { + "epoch": 0.9166204986149584, + "grad_norm": 0.6582330465316772, + "learning_rate": 4.7316360680551475e-06, + "loss": 0.614, + "step": 3309 + }, + { + "epoch": 0.9168975069252078, + "grad_norm": 0.7251480221748352, + "learning_rate": 4.731471845988536e-06, + "loss": 0.6824, + "step": 3310 + }, + { + "epoch": 0.917174515235457, + "grad_norm": 0.645293116569519, + "learning_rate": 4.731307576542079e-06, + "loss": 0.6755, + "step": 3311 + }, + { + "epoch": 0.9174515235457064, + "grad_norm": 0.6753968596458435, + "learning_rate": 4.7311432597192655e-06, + "loss": 0.7234, + "step": 3312 + }, + { + "epoch": 0.9177285318559557, + "grad_norm": 0.6604711413383484, + "learning_rate": 4.730978895523581e-06, + "loss": 0.6388, + "step": 3313 + }, + { + "epoch": 0.918005540166205, + "grad_norm": 0.683600664138794, + "learning_rate": 4.7308144839585175e-06, + "loss": 0.681, + "step": 3314 + }, + { + "epoch": 0.9182825484764543, + "grad_norm": 0.6972975730895996, + "learning_rate": 4.7306500250275664e-06, + "loss": 0.6756, + "step": 3315 + }, + { + "epoch": 0.9185595567867036, + "grad_norm": 0.671061635017395, + "learning_rate": 4.730485518734218e-06, + "loss": 0.6665, + "step": 3316 + }, + { + "epoch": 0.9188365650969529, + "grad_norm": 0.6876084208488464, + "learning_rate": 4.730320965081967e-06, + "loss": 0.7245, + "step": 3317 + }, + { + "epoch": 0.9191135734072022, + "grad_norm": 0.6627107858657837, + "learning_rate": 4.730156364074306e-06, + "loss": 0.6679, + "step": 3318 + }, + { + "epoch": 0.9193905817174515, + "grad_norm": 0.6769272685050964, + "learning_rate": 4.72999171571473e-06, + "loss": 0.6321, + "step": 3319 + }, + { + "epoch": 0.9196675900277008, + "grad_norm": 0.6696699261665344, + "learning_rate": 4.729827020006735e-06, + "loss": 0.6608, + "step": 3320 + }, + { + "epoch": 0.9199445983379502, + "grad_norm": 0.6594526767730713, + "learning_rate": 4.729662276953818e-06, + "loss": 0.6813, + "step": 3321 + }, + { + "epoch": 0.9202216066481994, + "grad_norm": 0.7049792408943176, + "learning_rate": 4.729497486559477e-06, + "loss": 0.6602, + "step": 3322 + }, + { + "epoch": 0.9204986149584488, + "grad_norm": 0.6542843580245972, + "learning_rate": 4.72933264882721e-06, + "loss": 0.6122, + "step": 3323 + }, + { + "epoch": 0.9207756232686981, + "grad_norm": 0.6764165759086609, + "learning_rate": 4.729167763760519e-06, + "loss": 0.7229, + "step": 3324 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.6534932255744934, + "learning_rate": 4.729002831362903e-06, + "loss": 0.6313, + "step": 3325 + }, + { + "epoch": 0.9213296398891967, + "grad_norm": 0.6753476858139038, + "learning_rate": 4.7288378516378655e-06, + "loss": 0.6742, + "step": 3326 + }, + { + "epoch": 0.9216066481994459, + "grad_norm": 0.6870114803314209, + "learning_rate": 4.728672824588908e-06, + "loss": 0.6941, + "step": 3327 + }, + { + "epoch": 0.9218836565096953, + "grad_norm": 0.6485161781311035, + "learning_rate": 4.728507750219535e-06, + "loss": 0.6246, + "step": 3328 + }, + { + "epoch": 0.9221606648199446, + "grad_norm": 0.6565285921096802, + "learning_rate": 4.7283426285332514e-06, + "loss": 0.6628, + "step": 3329 + }, + { + "epoch": 0.9224376731301939, + "grad_norm": 0.6443964242935181, + "learning_rate": 4.728177459533564e-06, + "loss": 0.6868, + "step": 3330 + }, + { + "epoch": 0.9227146814404432, + "grad_norm": 0.6890116333961487, + "learning_rate": 4.728012243223978e-06, + "loss": 0.7014, + "step": 3331 + }, + { + "epoch": 0.9229916897506926, + "grad_norm": 0.6694427132606506, + "learning_rate": 4.727846979608003e-06, + "loss": 0.6275, + "step": 3332 + }, + { + "epoch": 0.9232686980609418, + "grad_norm": 0.733548641204834, + "learning_rate": 4.7276816686891465e-06, + "loss": 0.6947, + "step": 3333 + }, + { + "epoch": 0.9235457063711912, + "grad_norm": 0.7278211116790771, + "learning_rate": 4.72751631047092e-06, + "loss": 0.6703, + "step": 3334 + }, + { + "epoch": 0.9238227146814404, + "grad_norm": 0.6326905488967896, + "learning_rate": 4.7273509049568334e-06, + "loss": 0.616, + "step": 3335 + }, + { + "epoch": 0.9240997229916897, + "grad_norm": 0.6611707210540771, + "learning_rate": 4.7271854521504e-06, + "loss": 0.6735, + "step": 3336 + }, + { + "epoch": 0.9243767313019391, + "grad_norm": 0.7007359266281128, + "learning_rate": 4.727019952055131e-06, + "loss": 0.6975, + "step": 3337 + }, + { + "epoch": 0.9246537396121883, + "grad_norm": 0.713338315486908, + "learning_rate": 4.726854404674541e-06, + "loss": 0.7236, + "step": 3338 + }, + { + "epoch": 0.9249307479224377, + "grad_norm": 0.6926329135894775, + "learning_rate": 4.7266888100121455e-06, + "loss": 0.6923, + "step": 3339 + }, + { + "epoch": 0.925207756232687, + "grad_norm": 0.6664144992828369, + "learning_rate": 4.7265231680714605e-06, + "loss": 0.691, + "step": 3340 + }, + { + "epoch": 0.9254847645429363, + "grad_norm": 0.6612799763679504, + "learning_rate": 4.726357478856002e-06, + "loss": 0.6582, + "step": 3341 + }, + { + "epoch": 0.9257617728531856, + "grad_norm": 0.6593889594078064, + "learning_rate": 4.726191742369289e-06, + "loss": 0.6536, + "step": 3342 + }, + { + "epoch": 0.9260387811634349, + "grad_norm": 0.6893621683120728, + "learning_rate": 4.7260259586148405e-06, + "loss": 0.6766, + "step": 3343 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 0.7140992879867554, + "learning_rate": 4.725860127596176e-06, + "loss": 0.6448, + "step": 3344 + }, + { + "epoch": 0.9265927977839336, + "grad_norm": 0.6934571862220764, + "learning_rate": 4.7256942493168176e-06, + "loss": 0.6908, + "step": 3345 + }, + { + "epoch": 0.9268698060941828, + "grad_norm": 0.644855260848999, + "learning_rate": 4.725528323780285e-06, + "loss": 0.6571, + "step": 3346 + }, + { + "epoch": 0.9271468144044321, + "grad_norm": 0.6686730980873108, + "learning_rate": 4.725362350990104e-06, + "loss": 0.6717, + "step": 3347 + }, + { + "epoch": 0.9274238227146815, + "grad_norm": 0.6989774703979492, + "learning_rate": 4.725196330949798e-06, + "loss": 0.6618, + "step": 3348 + }, + { + "epoch": 0.9277008310249307, + "grad_norm": 0.6894549131393433, + "learning_rate": 4.7250302636628895e-06, + "loss": 0.6222, + "step": 3349 + }, + { + "epoch": 0.9279778393351801, + "grad_norm": 0.7115414142608643, + "learning_rate": 4.724864149132907e-06, + "loss": 0.6424, + "step": 3350 + }, + { + "epoch": 0.9282548476454293, + "grad_norm": 0.6588413715362549, + "learning_rate": 4.7246979873633766e-06, + "loss": 0.6775, + "step": 3351 + }, + { + "epoch": 0.9285318559556787, + "grad_norm": 0.6895866990089417, + "learning_rate": 4.724531778357827e-06, + "loss": 0.6668, + "step": 3352 + }, + { + "epoch": 0.928808864265928, + "grad_norm": 0.7034494876861572, + "learning_rate": 4.724365522119787e-06, + "loss": 0.6918, + "step": 3353 + }, + { + "epoch": 0.9290858725761773, + "grad_norm": 0.685469925403595, + "learning_rate": 4.724199218652787e-06, + "loss": 0.6665, + "step": 3354 + }, + { + "epoch": 0.9293628808864266, + "grad_norm": 0.6953645348548889, + "learning_rate": 4.724032867960357e-06, + "loss": 0.62, + "step": 3355 + }, + { + "epoch": 0.929639889196676, + "grad_norm": 0.6760872006416321, + "learning_rate": 4.72386647004603e-06, + "loss": 0.6415, + "step": 3356 + }, + { + "epoch": 0.9299168975069252, + "grad_norm": 0.7333945035934448, + "learning_rate": 4.723700024913339e-06, + "loss": 0.6358, + "step": 3357 + }, + { + "epoch": 0.9301939058171745, + "grad_norm": 0.6890769600868225, + "learning_rate": 4.723533532565817e-06, + "loss": 0.6157, + "step": 3358 + }, + { + "epoch": 0.9304709141274238, + "grad_norm": 0.7347223162651062, + "learning_rate": 4.723366993007e-06, + "loss": 0.6789, + "step": 3359 + }, + { + "epoch": 0.9307479224376731, + "grad_norm": 0.6665700078010559, + "learning_rate": 4.7232004062404235e-06, + "loss": 0.6589, + "step": 3360 + }, + { + "epoch": 0.9310249307479225, + "grad_norm": 0.618806779384613, + "learning_rate": 4.723033772269626e-06, + "loss": 0.6269, + "step": 3361 + }, + { + "epoch": 0.9313019390581717, + "grad_norm": 0.6587586998939514, + "learning_rate": 4.722867091098144e-06, + "loss": 0.6907, + "step": 3362 + }, + { + "epoch": 0.9315789473684211, + "grad_norm": 0.6528121829032898, + "learning_rate": 4.722700362729516e-06, + "loss": 0.6062, + "step": 3363 + }, + { + "epoch": 0.9318559556786704, + "grad_norm": 0.6848546862602234, + "learning_rate": 4.722533587167285e-06, + "loss": 0.6603, + "step": 3364 + }, + { + "epoch": 0.9321329639889196, + "grad_norm": 0.6691559553146362, + "learning_rate": 4.722366764414989e-06, + "loss": 0.7152, + "step": 3365 + }, + { + "epoch": 0.932409972299169, + "grad_norm": 0.7005979418754578, + "learning_rate": 4.722199894476172e-06, + "loss": 0.6947, + "step": 3366 + }, + { + "epoch": 0.9326869806094182, + "grad_norm": 0.6714586615562439, + "learning_rate": 4.722032977354376e-06, + "loss": 0.6685, + "step": 3367 + }, + { + "epoch": 0.9329639889196676, + "grad_norm": 0.6560215353965759, + "learning_rate": 4.721866013053146e-06, + "loss": 0.6315, + "step": 3368 + }, + { + "epoch": 0.9332409972299169, + "grad_norm": 0.7032321095466614, + "learning_rate": 4.721699001576026e-06, + "loss": 0.6492, + "step": 3369 + }, + { + "epoch": 0.9335180055401662, + "grad_norm": 0.6863848567008972, + "learning_rate": 4.721531942926561e-06, + "loss": 0.6619, + "step": 3370 + }, + { + "epoch": 0.9337950138504155, + "grad_norm": 0.65123051404953, + "learning_rate": 4.721364837108302e-06, + "loss": 0.6821, + "step": 3371 + }, + { + "epoch": 0.9340720221606649, + "grad_norm": 0.6889357566833496, + "learning_rate": 4.7211976841247945e-06, + "loss": 0.6522, + "step": 3372 + }, + { + "epoch": 0.9343490304709141, + "grad_norm": 0.6711523532867432, + "learning_rate": 4.721030483979588e-06, + "loss": 0.6599, + "step": 3373 + }, + { + "epoch": 0.9346260387811635, + "grad_norm": 0.6738723516464233, + "learning_rate": 4.7208632366762305e-06, + "loss": 0.6697, + "step": 3374 + }, + { + "epoch": 0.9349030470914127, + "grad_norm": 0.6907211542129517, + "learning_rate": 4.7206959422182764e-06, + "loss": 0.6722, + "step": 3375 + }, + { + "epoch": 0.935180055401662, + "grad_norm": 0.6392459869384766, + "learning_rate": 4.720528600609276e-06, + "loss": 0.6543, + "step": 3376 + }, + { + "epoch": 0.9354570637119114, + "grad_norm": 0.7044897079467773, + "learning_rate": 4.720361211852784e-06, + "loss": 0.6878, + "step": 3377 + }, + { + "epoch": 0.9357340720221606, + "grad_norm": 0.7036895751953125, + "learning_rate": 4.720193775952352e-06, + "loss": 0.6862, + "step": 3378 + }, + { + "epoch": 0.93601108033241, + "grad_norm": 0.6759652495384216, + "learning_rate": 4.720026292911537e-06, + "loss": 0.6192, + "step": 3379 + }, + { + "epoch": 0.9362880886426593, + "grad_norm": 0.6304213404655457, + "learning_rate": 4.719858762733894e-06, + "loss": 0.6286, + "step": 3380 + }, + { + "epoch": 0.9365650969529086, + "grad_norm": 0.6903957724571228, + "learning_rate": 4.719691185422981e-06, + "loss": 0.7026, + "step": 3381 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 0.6814022660255432, + "learning_rate": 4.719523560982355e-06, + "loss": 0.6974, + "step": 3382 + }, + { + "epoch": 0.9371191135734072, + "grad_norm": 0.6745713949203491, + "learning_rate": 4.719355889415577e-06, + "loss": 0.6544, + "step": 3383 + }, + { + "epoch": 0.9373961218836565, + "grad_norm": 0.7096647620201111, + "learning_rate": 4.719188170726204e-06, + "loss": 0.7011, + "step": 3384 + }, + { + "epoch": 0.9376731301939059, + "grad_norm": 0.7082591652870178, + "learning_rate": 4.719020404917801e-06, + "loss": 0.596, + "step": 3385 + }, + { + "epoch": 0.9379501385041551, + "grad_norm": 0.6712037324905396, + "learning_rate": 4.718852591993927e-06, + "loss": 0.6941, + "step": 3386 + }, + { + "epoch": 0.9382271468144044, + "grad_norm": 0.6850805878639221, + "learning_rate": 4.718684731958146e-06, + "loss": 0.654, + "step": 3387 + }, + { + "epoch": 0.9385041551246538, + "grad_norm": 0.6575900316238403, + "learning_rate": 4.718516824814022e-06, + "loss": 0.7019, + "step": 3388 + }, + { + "epoch": 0.938781163434903, + "grad_norm": 0.6687057614326477, + "learning_rate": 4.7183488705651216e-06, + "loss": 0.6384, + "step": 3389 + }, + { + "epoch": 0.9390581717451524, + "grad_norm": 0.6599818468093872, + "learning_rate": 4.718180869215008e-06, + "loss": 0.6785, + "step": 3390 + }, + { + "epoch": 0.9393351800554016, + "grad_norm": 0.6838821172714233, + "learning_rate": 4.718012820767251e-06, + "loss": 0.7252, + "step": 3391 + }, + { + "epoch": 0.939612188365651, + "grad_norm": 0.697323203086853, + "learning_rate": 4.717844725225418e-06, + "loss": 0.6596, + "step": 3392 + }, + { + "epoch": 0.9398891966759003, + "grad_norm": 0.6465617418289185, + "learning_rate": 4.717676582593076e-06, + "loss": 0.6527, + "step": 3393 + }, + { + "epoch": 0.9401662049861496, + "grad_norm": 0.6578206419944763, + "learning_rate": 4.7175083928737984e-06, + "loss": 0.6432, + "step": 3394 + }, + { + "epoch": 0.9404432132963989, + "grad_norm": 0.670072615146637, + "learning_rate": 4.717340156071154e-06, + "loss": 0.6704, + "step": 3395 + }, + { + "epoch": 0.9407202216066483, + "grad_norm": 0.6486279964447021, + "learning_rate": 4.717171872188716e-06, + "loss": 0.6744, + "step": 3396 + }, + { + "epoch": 0.9409972299168975, + "grad_norm": 0.681574285030365, + "learning_rate": 4.717003541230057e-06, + "loss": 0.6656, + "step": 3397 + }, + { + "epoch": 0.9412742382271468, + "grad_norm": 0.6697400808334351, + "learning_rate": 4.71683516319875e-06, + "loss": 0.6755, + "step": 3398 + }, + { + "epoch": 0.9415512465373961, + "grad_norm": 0.7256743907928467, + "learning_rate": 4.716666738098373e-06, + "loss": 0.69, + "step": 3399 + }, + { + "epoch": 0.9418282548476454, + "grad_norm": 0.670193612575531, + "learning_rate": 4.716498265932501e-06, + "loss": 0.6904, + "step": 3400 + }, + { + "epoch": 0.9421052631578948, + "grad_norm": 0.6912937760353088, + "learning_rate": 4.716329746704709e-06, + "loss": 0.7133, + "step": 3401 + }, + { + "epoch": 0.942382271468144, + "grad_norm": 0.6947979927062988, + "learning_rate": 4.716161180418577e-06, + "loss": 0.6339, + "step": 3402 + }, + { + "epoch": 0.9426592797783934, + "grad_norm": 0.7074637413024902, + "learning_rate": 4.715992567077683e-06, + "loss": 0.6328, + "step": 3403 + }, + { + "epoch": 0.9429362880886426, + "grad_norm": 0.6708641648292542, + "learning_rate": 4.7158239066856096e-06, + "loss": 0.645, + "step": 3404 + }, + { + "epoch": 0.943213296398892, + "grad_norm": 0.6959966421127319, + "learning_rate": 4.7156551992459346e-06, + "loss": 0.6736, + "step": 3405 + }, + { + "epoch": 0.9434903047091413, + "grad_norm": 0.682021975517273, + "learning_rate": 4.715486444762242e-06, + "loss": 0.6571, + "step": 3406 + }, + { + "epoch": 0.9437673130193905, + "grad_norm": 0.7637782096862793, + "learning_rate": 4.715317643238114e-06, + "loss": 0.6335, + "step": 3407 + }, + { + "epoch": 0.9440443213296399, + "grad_norm": 0.6714920401573181, + "learning_rate": 4.715148794677135e-06, + "loss": 0.7047, + "step": 3408 + }, + { + "epoch": 0.9443213296398892, + "grad_norm": 0.6588884592056274, + "learning_rate": 4.714979899082891e-06, + "loss": 0.6834, + "step": 3409 + }, + { + "epoch": 0.9445983379501385, + "grad_norm": 0.6203243732452393, + "learning_rate": 4.714810956458967e-06, + "loss": 0.5646, + "step": 3410 + }, + { + "epoch": 0.9448753462603878, + "grad_norm": 0.6979451179504395, + "learning_rate": 4.71464196680895e-06, + "loss": 0.6774, + "step": 3411 + }, + { + "epoch": 0.9451523545706371, + "grad_norm": 0.6701903343200684, + "learning_rate": 4.714472930136429e-06, + "loss": 0.6498, + "step": 3412 + }, + { + "epoch": 0.9454293628808864, + "grad_norm": 0.7109444737434387, + "learning_rate": 4.714303846444992e-06, + "loss": 0.6503, + "step": 3413 + }, + { + "epoch": 0.9457063711911358, + "grad_norm": 0.7005951404571533, + "learning_rate": 4.71413471573823e-06, + "loss": 0.6476, + "step": 3414 + }, + { + "epoch": 0.945983379501385, + "grad_norm": 0.6723399758338928, + "learning_rate": 4.7139655380197336e-06, + "loss": 0.6349, + "step": 3415 + }, + { + "epoch": 0.9462603878116344, + "grad_norm": 0.7047520875930786, + "learning_rate": 4.713796313293096e-06, + "loss": 0.7124, + "step": 3416 + }, + { + "epoch": 0.9465373961218837, + "grad_norm": 0.6956657767295837, + "learning_rate": 4.7136270415619075e-06, + "loss": 0.6001, + "step": 3417 + }, + { + "epoch": 0.9468144044321329, + "grad_norm": 0.71816486120224, + "learning_rate": 4.713457722829765e-06, + "loss": 0.6532, + "step": 3418 + }, + { + "epoch": 0.9470914127423823, + "grad_norm": 0.6962186098098755, + "learning_rate": 4.713288357100262e-06, + "loss": 0.6849, + "step": 3419 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.746861457824707, + "learning_rate": 4.713118944376996e-06, + "loss": 0.6546, + "step": 3420 + }, + { + "epoch": 0.9476454293628809, + "grad_norm": 0.6704689264297485, + "learning_rate": 4.712949484663561e-06, + "loss": 0.6357, + "step": 3421 + }, + { + "epoch": 0.9479224376731302, + "grad_norm": 0.6442332863807678, + "learning_rate": 4.712779977963559e-06, + "loss": 0.6806, + "step": 3422 + }, + { + "epoch": 0.9481994459833795, + "grad_norm": 0.6512526869773865, + "learning_rate": 4.7126104242805866e-06, + "loss": 0.6601, + "step": 3423 + }, + { + "epoch": 0.9484764542936288, + "grad_norm": 0.6544331908226013, + "learning_rate": 4.712440823618245e-06, + "loss": 0.6574, + "step": 3424 + }, + { + "epoch": 0.9487534626038782, + "grad_norm": 0.7471049427986145, + "learning_rate": 4.712271175980134e-06, + "loss": 0.6827, + "step": 3425 + }, + { + "epoch": 0.9490304709141274, + "grad_norm": 0.6754305362701416, + "learning_rate": 4.712101481369857e-06, + "loss": 0.7138, + "step": 3426 + }, + { + "epoch": 0.9493074792243767, + "grad_norm": 0.6700847148895264, + "learning_rate": 4.711931739791017e-06, + "loss": 0.6737, + "step": 3427 + }, + { + "epoch": 0.949584487534626, + "grad_norm": 0.6627695560455322, + "learning_rate": 4.711761951247216e-06, + "loss": 0.6215, + "step": 3428 + }, + { + "epoch": 0.9498614958448753, + "grad_norm": 0.6418578028678894, + "learning_rate": 4.711592115742061e-06, + "loss": 0.6435, + "step": 3429 + }, + { + "epoch": 0.9501385041551247, + "grad_norm": 0.666086733341217, + "learning_rate": 4.711422233279158e-06, + "loss": 0.666, + "step": 3430 + }, + { + "epoch": 0.9504155124653739, + "grad_norm": 0.6819273829460144, + "learning_rate": 4.711252303862114e-06, + "loss": 0.6654, + "step": 3431 + }, + { + "epoch": 0.9506925207756233, + "grad_norm": 0.69044429063797, + "learning_rate": 4.711082327494536e-06, + "loss": 0.6637, + "step": 3432 + }, + { + "epoch": 0.9509695290858726, + "grad_norm": 0.6500899791717529, + "learning_rate": 4.710912304180034e-06, + "loss": 0.6722, + "step": 3433 + }, + { + "epoch": 0.9512465373961219, + "grad_norm": 0.6798520684242249, + "learning_rate": 4.710742233922217e-06, + "loss": 0.6408, + "step": 3434 + }, + { + "epoch": 0.9515235457063712, + "grad_norm": 0.6969449520111084, + "learning_rate": 4.710572116724698e-06, + "loss": 0.6702, + "step": 3435 + }, + { + "epoch": 0.9518005540166204, + "grad_norm": 0.7223117351531982, + "learning_rate": 4.710401952591088e-06, + "loss": 0.6647, + "step": 3436 + }, + { + "epoch": 0.9520775623268698, + "grad_norm": 0.6953335404396057, + "learning_rate": 4.710231741524999e-06, + "loss": 0.6811, + "step": 3437 + }, + { + "epoch": 0.9523545706371191, + "grad_norm": 0.6859951019287109, + "learning_rate": 4.7100614835300454e-06, + "loss": 0.6611, + "step": 3438 + }, + { + "epoch": 0.9526315789473684, + "grad_norm": 0.6977407336235046, + "learning_rate": 4.709891178609843e-06, + "loss": 0.7132, + "step": 3439 + }, + { + "epoch": 0.9529085872576177, + "grad_norm": 0.7158917784690857, + "learning_rate": 4.709720826768007e-06, + "loss": 0.655, + "step": 3440 + }, + { + "epoch": 0.9531855955678671, + "grad_norm": 0.6699243187904358, + "learning_rate": 4.709550428008155e-06, + "loss": 0.6466, + "step": 3441 + }, + { + "epoch": 0.9534626038781163, + "grad_norm": 0.6597692966461182, + "learning_rate": 4.709379982333906e-06, + "loss": 0.6597, + "step": 3442 + }, + { + "epoch": 0.9537396121883657, + "grad_norm": 0.6464130282402039, + "learning_rate": 4.709209489748877e-06, + "loss": 0.6717, + "step": 3443 + }, + { + "epoch": 0.9540166204986149, + "grad_norm": 0.6933079957962036, + "learning_rate": 4.7090389502566884e-06, + "loss": 0.6533, + "step": 3444 + }, + { + "epoch": 0.9542936288088643, + "grad_norm": 0.6588965058326721, + "learning_rate": 4.708868363860962e-06, + "loss": 0.6764, + "step": 3445 + }, + { + "epoch": 0.9545706371191136, + "grad_norm": 0.6584217548370361, + "learning_rate": 4.7086977305653195e-06, + "loss": 0.6812, + "step": 3446 + }, + { + "epoch": 0.9548476454293628, + "grad_norm": 0.6467680335044861, + "learning_rate": 4.7085270503733835e-06, + "loss": 0.7133, + "step": 3447 + }, + { + "epoch": 0.9551246537396122, + "grad_norm": 0.6523286700248718, + "learning_rate": 4.708356323288779e-06, + "loss": 0.6373, + "step": 3448 + }, + { + "epoch": 0.9554016620498615, + "grad_norm": 0.6429436206817627, + "learning_rate": 4.708185549315129e-06, + "loss": 0.6679, + "step": 3449 + }, + { + "epoch": 0.9556786703601108, + "grad_norm": 0.6593552231788635, + "learning_rate": 4.708014728456062e-06, + "loss": 0.6264, + "step": 3450 + }, + { + "epoch": 0.9559556786703601, + "grad_norm": 0.692894697189331, + "learning_rate": 4.707843860715204e-06, + "loss": 0.6142, + "step": 3451 + }, + { + "epoch": 0.9562326869806094, + "grad_norm": 0.6877021193504333, + "learning_rate": 4.70767294609618e-06, + "loss": 0.5669, + "step": 3452 + }, + { + "epoch": 0.9565096952908587, + "grad_norm": 0.7020219564437866, + "learning_rate": 4.707501984602624e-06, + "loss": 0.6703, + "step": 3453 + }, + { + "epoch": 0.9567867036011081, + "grad_norm": 0.7109591960906982, + "learning_rate": 4.707330976238162e-06, + "loss": 0.6907, + "step": 3454 + }, + { + "epoch": 0.9570637119113573, + "grad_norm": 0.7097887396812439, + "learning_rate": 4.7071599210064275e-06, + "loss": 0.6712, + "step": 3455 + }, + { + "epoch": 0.9573407202216067, + "grad_norm": 0.7130573391914368, + "learning_rate": 4.7069888189110516e-06, + "loss": 0.6989, + "step": 3456 + }, + { + "epoch": 0.957617728531856, + "grad_norm": 0.6701627969741821, + "learning_rate": 4.706817669955667e-06, + "loss": 0.704, + "step": 3457 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 0.6597571969032288, + "learning_rate": 4.7066464741439064e-06, + "loss": 0.6759, + "step": 3458 + }, + { + "epoch": 0.9581717451523546, + "grad_norm": 0.6537261009216309, + "learning_rate": 4.706475231479407e-06, + "loss": 0.6853, + "step": 3459 + }, + { + "epoch": 0.9584487534626038, + "grad_norm": 0.6804844737052917, + "learning_rate": 4.706303941965804e-06, + "loss": 0.6645, + "step": 3460 + }, + { + "epoch": 0.9587257617728532, + "grad_norm": 0.6807675361633301, + "learning_rate": 4.706132605606733e-06, + "loss": 0.6846, + "step": 3461 + }, + { + "epoch": 0.9590027700831025, + "grad_norm": 0.7017925381660461, + "learning_rate": 4.7059612224058335e-06, + "loss": 0.6329, + "step": 3462 + }, + { + "epoch": 0.9592797783933518, + "grad_norm": 0.6846858859062195, + "learning_rate": 4.705789792366744e-06, + "loss": 0.6858, + "step": 3463 + }, + { + "epoch": 0.9595567867036011, + "grad_norm": 0.7076050639152527, + "learning_rate": 4.705618315493104e-06, + "loss": 0.653, + "step": 3464 + }, + { + "epoch": 0.9598337950138505, + "grad_norm": 0.6549046039581299, + "learning_rate": 4.7054467917885545e-06, + "loss": 0.6799, + "step": 3465 + }, + { + "epoch": 0.9601108033240997, + "grad_norm": 0.668217658996582, + "learning_rate": 4.705275221256738e-06, + "loss": 0.6367, + "step": 3466 + }, + { + "epoch": 0.960387811634349, + "grad_norm": 0.6847381591796875, + "learning_rate": 4.705103603901296e-06, + "loss": 0.6493, + "step": 3467 + }, + { + "epoch": 0.9606648199445983, + "grad_norm": 0.7230570316314697, + "learning_rate": 4.704931939725874e-06, + "loss": 0.6785, + "step": 3468 + }, + { + "epoch": 0.9609418282548476, + "grad_norm": 0.6642930507659912, + "learning_rate": 4.704760228734116e-06, + "loss": 0.6195, + "step": 3469 + }, + { + "epoch": 0.961218836565097, + "grad_norm": 0.6809485554695129, + "learning_rate": 4.704588470929668e-06, + "loss": 0.6773, + "step": 3470 + }, + { + "epoch": 0.9614958448753462, + "grad_norm": 0.6308563351631165, + "learning_rate": 4.704416666316177e-06, + "loss": 0.6313, + "step": 3471 + }, + { + "epoch": 0.9617728531855956, + "grad_norm": 0.6679580211639404, + "learning_rate": 4.704244814897291e-06, + "loss": 0.6274, + "step": 3472 + }, + { + "epoch": 0.9620498614958449, + "grad_norm": 0.7183241844177246, + "learning_rate": 4.704072916676658e-06, + "loss": 0.6432, + "step": 3473 + }, + { + "epoch": 0.9623268698060942, + "grad_norm": 0.6756312847137451, + "learning_rate": 4.703900971657929e-06, + "loss": 0.6477, + "step": 3474 + }, + { + "epoch": 0.9626038781163435, + "grad_norm": 0.7039386034011841, + "learning_rate": 4.703728979844753e-06, + "loss": 0.6681, + "step": 3475 + }, + { + "epoch": 0.9628808864265928, + "grad_norm": 0.6932883262634277, + "learning_rate": 4.703556941240784e-06, + "loss": 0.6858, + "step": 3476 + }, + { + "epoch": 0.9631578947368421, + "grad_norm": 0.9859476089477539, + "learning_rate": 4.7033848558496735e-06, + "loss": 0.6897, + "step": 3477 + }, + { + "epoch": 0.9634349030470915, + "grad_norm": 0.7097819447517395, + "learning_rate": 4.703212723675076e-06, + "loss": 0.6639, + "step": 3478 + }, + { + "epoch": 0.9637119113573407, + "grad_norm": 0.6593777537345886, + "learning_rate": 4.7030405447206465e-06, + "loss": 0.6823, + "step": 3479 + }, + { + "epoch": 0.96398891966759, + "grad_norm": 0.696010410785675, + "learning_rate": 4.702868318990038e-06, + "loss": 0.6707, + "step": 3480 + }, + { + "epoch": 0.9642659279778394, + "grad_norm": 0.681574821472168, + "learning_rate": 4.702696046486912e-06, + "loss": 0.6909, + "step": 3481 + }, + { + "epoch": 0.9645429362880886, + "grad_norm": 0.6492648720741272, + "learning_rate": 4.702523727214922e-06, + "loss": 0.6436, + "step": 3482 + }, + { + "epoch": 0.964819944598338, + "grad_norm": 0.6883756518363953, + "learning_rate": 4.7023513611777305e-06, + "loss": 0.6718, + "step": 3483 + }, + { + "epoch": 0.9650969529085872, + "grad_norm": 0.6744667887687683, + "learning_rate": 4.702178948378995e-06, + "loss": 0.6664, + "step": 3484 + }, + { + "epoch": 0.9653739612188366, + "grad_norm": 0.676249086856842, + "learning_rate": 4.702006488822376e-06, + "loss": 0.6607, + "step": 3485 + }, + { + "epoch": 0.9656509695290859, + "grad_norm": 0.6973958015441895, + "learning_rate": 4.701833982511537e-06, + "loss": 0.6672, + "step": 3486 + }, + { + "epoch": 0.9659279778393352, + "grad_norm": 0.6347668170928955, + "learning_rate": 4.701661429450139e-06, + "loss": 0.5975, + "step": 3487 + }, + { + "epoch": 0.9662049861495845, + "grad_norm": 0.6446684002876282, + "learning_rate": 4.701488829641845e-06, + "loss": 0.6622, + "step": 3488 + }, + { + "epoch": 0.9664819944598338, + "grad_norm": 0.7026192545890808, + "learning_rate": 4.7013161830903235e-06, + "loss": 0.6786, + "step": 3489 + }, + { + "epoch": 0.9667590027700831, + "grad_norm": 0.6588451862335205, + "learning_rate": 4.701143489799236e-06, + "loss": 0.6729, + "step": 3490 + }, + { + "epoch": 0.9670360110803324, + "grad_norm": 0.6893823146820068, + "learning_rate": 4.700970749772253e-06, + "loss": 0.6637, + "step": 3491 + }, + { + "epoch": 0.9673130193905817, + "grad_norm": 0.7043506503105164, + "learning_rate": 4.700797963013039e-06, + "loss": 0.6648, + "step": 3492 + }, + { + "epoch": 0.967590027700831, + "grad_norm": 0.6461666226387024, + "learning_rate": 4.700625129525263e-06, + "loss": 0.6728, + "step": 3493 + }, + { + "epoch": 0.9678670360110804, + "grad_norm": 0.6638358235359192, + "learning_rate": 4.7004522493125975e-06, + "loss": 0.6815, + "step": 3494 + }, + { + "epoch": 0.9681440443213296, + "grad_norm": 0.6599012017250061, + "learning_rate": 4.700279322378712e-06, + "loss": 0.6831, + "step": 3495 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 0.7229272127151489, + "learning_rate": 4.700106348727276e-06, + "loss": 0.6823, + "step": 3496 + }, + { + "epoch": 0.9686980609418283, + "grad_norm": 0.6756724119186401, + "learning_rate": 4.699933328361964e-06, + "loss": 0.6549, + "step": 3497 + }, + { + "epoch": 0.9689750692520775, + "grad_norm": 0.6939228773117065, + "learning_rate": 4.699760261286449e-06, + "loss": 0.7094, + "step": 3498 + }, + { + "epoch": 0.9692520775623269, + "grad_norm": 0.6404763460159302, + "learning_rate": 4.699587147504407e-06, + "loss": 0.6724, + "step": 3499 + }, + { + "epoch": 0.9695290858725761, + "grad_norm": 0.7101907134056091, + "learning_rate": 4.6994139870195125e-06, + "loss": 0.712, + "step": 3500 + }, + { + "epoch": 0.9698060941828255, + "grad_norm": 0.6645216345787048, + "learning_rate": 4.699240779835443e-06, + "loss": 0.6843, + "step": 3501 + }, + { + "epoch": 0.9700831024930748, + "grad_norm": 0.6481513381004333, + "learning_rate": 4.699067525955874e-06, + "loss": 0.6724, + "step": 3502 + }, + { + "epoch": 0.9703601108033241, + "grad_norm": 0.7024312615394592, + "learning_rate": 4.698894225384487e-06, + "loss": 0.6837, + "step": 3503 + }, + { + "epoch": 0.9706371191135734, + "grad_norm": 0.6771408319473267, + "learning_rate": 4.6987208781249595e-06, + "loss": 0.6889, + "step": 3504 + }, + { + "epoch": 0.9709141274238227, + "grad_norm": 0.6590652465820312, + "learning_rate": 4.698547484180973e-06, + "loss": 0.6646, + "step": 3505 + }, + { + "epoch": 0.971191135734072, + "grad_norm": 0.7020836472511292, + "learning_rate": 4.698374043556209e-06, + "loss": 0.7133, + "step": 3506 + }, + { + "epoch": 0.9714681440443214, + "grad_norm": 0.6353387236595154, + "learning_rate": 4.69820055625435e-06, + "loss": 0.6207, + "step": 3507 + }, + { + "epoch": 0.9717451523545706, + "grad_norm": 0.6599223613739014, + "learning_rate": 4.698027022279079e-06, + "loss": 0.6412, + "step": 3508 + }, + { + "epoch": 0.97202216066482, + "grad_norm": 0.6925457715988159, + "learning_rate": 4.697853441634082e-06, + "loss": 0.6453, + "step": 3509 + }, + { + "epoch": 0.9722991689750693, + "grad_norm": 0.6730129718780518, + "learning_rate": 4.697679814323044e-06, + "loss": 0.6777, + "step": 3510 + }, + { + "epoch": 0.9725761772853185, + "grad_norm": 0.6823524832725525, + "learning_rate": 4.69750614034965e-06, + "loss": 0.7242, + "step": 3511 + }, + { + "epoch": 0.9728531855955679, + "grad_norm": 0.6779269576072693, + "learning_rate": 4.697332419717589e-06, + "loss": 0.6541, + "step": 3512 + }, + { + "epoch": 0.9731301939058171, + "grad_norm": 0.6031319499015808, + "learning_rate": 4.69715865243055e-06, + "loss": 0.6121, + "step": 3513 + }, + { + "epoch": 0.9734072022160665, + "grad_norm": 0.6461656093597412, + "learning_rate": 4.696984838492222e-06, + "loss": 0.7018, + "step": 3514 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.6950158476829529, + "learning_rate": 4.696810977906296e-06, + "loss": 0.6666, + "step": 3515 + }, + { + "epoch": 0.9739612188365651, + "grad_norm": 0.658356785774231, + "learning_rate": 4.696637070676462e-06, + "loss": 0.6518, + "step": 3516 + }, + { + "epoch": 0.9742382271468144, + "grad_norm": 0.7311317920684814, + "learning_rate": 4.696463116806414e-06, + "loss": 0.7112, + "step": 3517 + }, + { + "epoch": 0.9745152354570638, + "grad_norm": 0.6782693266868591, + "learning_rate": 4.696289116299844e-06, + "loss": 0.6726, + "step": 3518 + }, + { + "epoch": 0.974792243767313, + "grad_norm": 0.6792790293693542, + "learning_rate": 4.696115069160448e-06, + "loss": 0.648, + "step": 3519 + }, + { + "epoch": 0.9750692520775623, + "grad_norm": 0.7299467325210571, + "learning_rate": 4.695940975391919e-06, + "loss": 0.6764, + "step": 3520 + }, + { + "epoch": 0.9753462603878116, + "grad_norm": 0.6922098398208618, + "learning_rate": 4.695766834997957e-06, + "loss": 0.6705, + "step": 3521 + }, + { + "epoch": 0.9756232686980609, + "grad_norm": 0.6680386662483215, + "learning_rate": 4.695592647982258e-06, + "loss": 0.6282, + "step": 3522 + }, + { + "epoch": 0.9759002770083103, + "grad_norm": 0.6693275570869446, + "learning_rate": 4.695418414348519e-06, + "loss": 0.6997, + "step": 3523 + }, + { + "epoch": 0.9761772853185595, + "grad_norm": 0.6710549592971802, + "learning_rate": 4.695244134100442e-06, + "loss": 0.6322, + "step": 3524 + }, + { + "epoch": 0.9764542936288089, + "grad_norm": 0.6785879135131836, + "learning_rate": 4.695069807241724e-06, + "loss": 0.6538, + "step": 3525 + }, + { + "epoch": 0.9767313019390582, + "grad_norm": 0.6887544989585876, + "learning_rate": 4.694895433776071e-06, + "loss": 0.6663, + "step": 3526 + }, + { + "epoch": 0.9770083102493075, + "grad_norm": 0.7070327997207642, + "learning_rate": 4.694721013707181e-06, + "loss": 0.6693, + "step": 3527 + }, + { + "epoch": 0.9772853185595568, + "grad_norm": 0.6848759651184082, + "learning_rate": 4.6945465470387595e-06, + "loss": 0.6784, + "step": 3528 + }, + { + "epoch": 0.977562326869806, + "grad_norm": 0.6864799857139587, + "learning_rate": 4.69437203377451e-06, + "loss": 0.6472, + "step": 3529 + }, + { + "epoch": 0.9778393351800554, + "grad_norm": 0.65505051612854, + "learning_rate": 4.694197473918139e-06, + "loss": 0.6529, + "step": 3530 + }, + { + "epoch": 0.9781163434903047, + "grad_norm": 0.6809942126274109, + "learning_rate": 4.694022867473352e-06, + "loss": 0.669, + "step": 3531 + }, + { + "epoch": 0.978393351800554, + "grad_norm": 0.7042862176895142, + "learning_rate": 4.693848214443858e-06, + "loss": 0.6792, + "step": 3532 + }, + { + "epoch": 0.9786703601108033, + "grad_norm": 0.6662670373916626, + "learning_rate": 4.693673514833363e-06, + "loss": 0.6066, + "step": 3533 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 0.6451749205589294, + "learning_rate": 4.693498768645577e-06, + "loss": 0.6622, + "step": 3534 + }, + { + "epoch": 0.9792243767313019, + "grad_norm": 0.6562368273735046, + "learning_rate": 4.69332397588421e-06, + "loss": 0.6225, + "step": 3535 + }, + { + "epoch": 0.9795013850415513, + "grad_norm": 0.7036115527153015, + "learning_rate": 4.693149136552975e-06, + "loss": 0.6379, + "step": 3536 + }, + { + "epoch": 0.9797783933518005, + "grad_norm": 0.7075529098510742, + "learning_rate": 4.692974250655582e-06, + "loss": 0.6859, + "step": 3537 + }, + { + "epoch": 0.9800554016620499, + "grad_norm": 0.7317038774490356, + "learning_rate": 4.6927993181957456e-06, + "loss": 0.6833, + "step": 3538 + }, + { + "epoch": 0.9803324099722992, + "grad_norm": 0.6922315955162048, + "learning_rate": 4.692624339177181e-06, + "loss": 0.6971, + "step": 3539 + }, + { + "epoch": 0.9806094182825484, + "grad_norm": 0.6648818254470825, + "learning_rate": 4.692449313603601e-06, + "loss": 0.6506, + "step": 3540 + }, + { + "epoch": 0.9808864265927978, + "grad_norm": 0.6564773917198181, + "learning_rate": 4.6922742414787235e-06, + "loss": 0.6696, + "step": 3541 + }, + { + "epoch": 0.9811634349030471, + "grad_norm": 0.6901625990867615, + "learning_rate": 4.6920991228062664e-06, + "loss": 0.6548, + "step": 3542 + }, + { + "epoch": 0.9814404432132964, + "grad_norm": 0.6555216908454895, + "learning_rate": 4.691923957589946e-06, + "loss": 0.6035, + "step": 3543 + }, + { + "epoch": 0.9817174515235457, + "grad_norm": 0.6624067425727844, + "learning_rate": 4.6917487458334826e-06, + "loss": 0.6725, + "step": 3544 + }, + { + "epoch": 0.981994459833795, + "grad_norm": 0.691081702709198, + "learning_rate": 4.691573487540596e-06, + "loss": 0.6944, + "step": 3545 + }, + { + "epoch": 0.9822714681440443, + "grad_norm": 0.6511529684066772, + "learning_rate": 4.691398182715008e-06, + "loss": 0.661, + "step": 3546 + }, + { + "epoch": 0.9825484764542937, + "grad_norm": 0.6528334617614746, + "learning_rate": 4.69122283136044e-06, + "loss": 0.6942, + "step": 3547 + }, + { + "epoch": 0.9828254847645429, + "grad_norm": 0.6623634696006775, + "learning_rate": 4.691047433480616e-06, + "loss": 0.6701, + "step": 3548 + }, + { + "epoch": 0.9831024930747922, + "grad_norm": 0.6403034329414368, + "learning_rate": 4.69087198907926e-06, + "loss": 0.6517, + "step": 3549 + }, + { + "epoch": 0.9833795013850416, + "grad_norm": 0.6423508524894714, + "learning_rate": 4.690696498160096e-06, + "loss": 0.6115, + "step": 3550 + }, + { + "epoch": 0.9836565096952908, + "grad_norm": 0.6804136037826538, + "learning_rate": 4.69052096072685e-06, + "loss": 0.699, + "step": 3551 + }, + { + "epoch": 0.9839335180055402, + "grad_norm": 0.6722675561904907, + "learning_rate": 4.6903453767832514e-06, + "loss": 0.667, + "step": 3552 + }, + { + "epoch": 0.9842105263157894, + "grad_norm": 0.6349672675132751, + "learning_rate": 4.690169746333027e-06, + "loss": 0.6409, + "step": 3553 + }, + { + "epoch": 0.9844875346260388, + "grad_norm": 0.6569719910621643, + "learning_rate": 4.689994069379905e-06, + "loss": 0.5844, + "step": 3554 + }, + { + "epoch": 0.9847645429362881, + "grad_norm": 0.6513596773147583, + "learning_rate": 4.689818345927617e-06, + "loss": 0.6613, + "step": 3555 + }, + { + "epoch": 0.9850415512465374, + "grad_norm": 0.6963537931442261, + "learning_rate": 4.6896425759798925e-06, + "loss": 0.6691, + "step": 3556 + }, + { + "epoch": 0.9853185595567867, + "grad_norm": 0.6589019894599915, + "learning_rate": 4.6894667595404654e-06, + "loss": 0.6759, + "step": 3557 + }, + { + "epoch": 0.9855955678670361, + "grad_norm": 0.711513876914978, + "learning_rate": 4.6892908966130665e-06, + "loss": 0.6919, + "step": 3558 + }, + { + "epoch": 0.9858725761772853, + "grad_norm": 0.6605263352394104, + "learning_rate": 4.689114987201432e-06, + "loss": 0.6374, + "step": 3559 + }, + { + "epoch": 0.9861495844875346, + "grad_norm": 0.6809438467025757, + "learning_rate": 4.688939031309296e-06, + "loss": 0.7072, + "step": 3560 + }, + { + "epoch": 0.9864265927977839, + "grad_norm": 0.6980892419815063, + "learning_rate": 4.688763028940394e-06, + "loss": 0.6791, + "step": 3561 + }, + { + "epoch": 0.9867036011080332, + "grad_norm": 0.6453639268875122, + "learning_rate": 4.6885869800984634e-06, + "loss": 0.6198, + "step": 3562 + }, + { + "epoch": 0.9869806094182826, + "grad_norm": 0.7025441527366638, + "learning_rate": 4.6884108847872425e-06, + "loss": 0.6646, + "step": 3563 + }, + { + "epoch": 0.9872576177285318, + "grad_norm": 0.6834899187088013, + "learning_rate": 4.688234743010469e-06, + "loss": 0.6535, + "step": 3564 + }, + { + "epoch": 0.9875346260387812, + "grad_norm": 0.6900626420974731, + "learning_rate": 4.6880585547718845e-06, + "loss": 0.6207, + "step": 3565 + }, + { + "epoch": 0.9878116343490305, + "grad_norm": 0.683000385761261, + "learning_rate": 4.687882320075229e-06, + "loss": 0.6541, + "step": 3566 + }, + { + "epoch": 0.9880886426592798, + "grad_norm": 0.6564151048660278, + "learning_rate": 4.687706038924244e-06, + "loss": 0.636, + "step": 3567 + }, + { + "epoch": 0.9883656509695291, + "grad_norm": 0.6594918370246887, + "learning_rate": 4.687529711322675e-06, + "loss": 0.6138, + "step": 3568 + }, + { + "epoch": 0.9886426592797783, + "grad_norm": 0.6680994629859924, + "learning_rate": 4.687353337274262e-06, + "loss": 0.6581, + "step": 3569 + }, + { + "epoch": 0.9889196675900277, + "grad_norm": 0.6732953786849976, + "learning_rate": 4.687176916782753e-06, + "loss": 0.6795, + "step": 3570 + }, + { + "epoch": 0.989196675900277, + "grad_norm": 0.679394006729126, + "learning_rate": 4.687000449851892e-06, + "loss": 0.6376, + "step": 3571 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 0.7351693511009216, + "learning_rate": 4.686823936485426e-06, + "loss": 0.6874, + "step": 3572 + }, + { + "epoch": 0.9897506925207756, + "grad_norm": 0.7224422693252563, + "learning_rate": 4.686647376687104e-06, + "loss": 0.6522, + "step": 3573 + }, + { + "epoch": 0.990027700831025, + "grad_norm": 0.6476606726646423, + "learning_rate": 4.686470770460674e-06, + "loss": 0.6529, + "step": 3574 + }, + { + "epoch": 0.9903047091412742, + "grad_norm": 0.6556124091148376, + "learning_rate": 4.6862941178098855e-06, + "loss": 0.5976, + "step": 3575 + }, + { + "epoch": 0.9905817174515236, + "grad_norm": 0.6533079147338867, + "learning_rate": 4.686117418738489e-06, + "loss": 0.6716, + "step": 3576 + }, + { + "epoch": 0.9908587257617728, + "grad_norm": 0.6209880709648132, + "learning_rate": 4.685940673250239e-06, + "loss": 0.5748, + "step": 3577 + }, + { + "epoch": 0.9911357340720222, + "grad_norm": 0.6872321367263794, + "learning_rate": 4.685763881348885e-06, + "loss": 0.6536, + "step": 3578 + }, + { + "epoch": 0.9914127423822715, + "grad_norm": 0.7070121169090271, + "learning_rate": 4.685587043038182e-06, + "loss": 0.6236, + "step": 3579 + }, + { + "epoch": 0.9916897506925207, + "grad_norm": 0.6352293491363525, + "learning_rate": 4.685410158321885e-06, + "loss": 0.6185, + "step": 3580 + }, + { + "epoch": 0.9919667590027701, + "grad_norm": 0.6836139559745789, + "learning_rate": 4.685233227203749e-06, + "loss": 0.6413, + "step": 3581 + }, + { + "epoch": 0.9922437673130194, + "grad_norm": 0.6847363710403442, + "learning_rate": 4.685056249687531e-06, + "loss": 0.671, + "step": 3582 + }, + { + "epoch": 0.9925207756232687, + "grad_norm": 0.6621556282043457, + "learning_rate": 4.684879225776989e-06, + "loss": 0.6815, + "step": 3583 + }, + { + "epoch": 0.992797783933518, + "grad_norm": 0.6949865221977234, + "learning_rate": 4.684702155475882e-06, + "loss": 0.6364, + "step": 3584 + }, + { + "epoch": 0.9930747922437673, + "grad_norm": 0.6503170728683472, + "learning_rate": 4.684525038787968e-06, + "loss": 0.6872, + "step": 3585 + }, + { + "epoch": 0.9933518005540166, + "grad_norm": 0.6437188386917114, + "learning_rate": 4.6843478757170095e-06, + "loss": 0.6351, + "step": 3586 + }, + { + "epoch": 0.993628808864266, + "grad_norm": 0.6677717566490173, + "learning_rate": 4.684170666266767e-06, + "loss": 0.7332, + "step": 3587 + }, + { + "epoch": 0.9939058171745152, + "grad_norm": 0.6620640754699707, + "learning_rate": 4.683993410441004e-06, + "loss": 0.6612, + "step": 3588 + }, + { + "epoch": 0.9941828254847646, + "grad_norm": 0.6833710074424744, + "learning_rate": 4.683816108243484e-06, + "loss": 0.6662, + "step": 3589 + }, + { + "epoch": 0.9944598337950139, + "grad_norm": 0.7151123881340027, + "learning_rate": 4.68363875967797e-06, + "loss": 0.6797, + "step": 3590 + }, + { + "epoch": 0.9947368421052631, + "grad_norm": 0.7432909607887268, + "learning_rate": 4.683461364748229e-06, + "loss": 0.6629, + "step": 3591 + }, + { + "epoch": 0.9950138504155125, + "grad_norm": 0.6998159289360046, + "learning_rate": 4.683283923458028e-06, + "loss": 0.6761, + "step": 3592 + }, + { + "epoch": 0.9952908587257617, + "grad_norm": 0.670008659362793, + "learning_rate": 4.683106435811133e-06, + "loss": 0.6579, + "step": 3593 + }, + { + "epoch": 0.9955678670360111, + "grad_norm": 0.6878396272659302, + "learning_rate": 4.682928901811315e-06, + "loss": 0.7188, + "step": 3594 + }, + { + "epoch": 0.9958448753462604, + "grad_norm": 0.7093462347984314, + "learning_rate": 4.68275132146234e-06, + "loss": 0.6486, + "step": 3595 + }, + { + "epoch": 0.9961218836565097, + "grad_norm": 0.7437435388565063, + "learning_rate": 4.68257369476798e-06, + "loss": 0.6911, + "step": 3596 + }, + { + "epoch": 0.996398891966759, + "grad_norm": 0.6841958165168762, + "learning_rate": 4.682396021732008e-06, + "loss": 0.6565, + "step": 3597 + }, + { + "epoch": 0.9966759002770084, + "grad_norm": 0.6469076871871948, + "learning_rate": 4.6822183023581945e-06, + "loss": 0.713, + "step": 3598 + }, + { + "epoch": 0.9969529085872576, + "grad_norm": 0.6475204229354858, + "learning_rate": 4.6820405366503145e-06, + "loss": 0.6907, + "step": 3599 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 0.7169883251190186, + "learning_rate": 4.6818627246121404e-06, + "loss": 0.6636, + "step": 3600 + }, + { + "epoch": 0.9975069252077562, + "grad_norm": 0.6753641366958618, + "learning_rate": 4.68168486624745e-06, + "loss": 0.6268, + "step": 3601 + }, + { + "epoch": 0.9977839335180055, + "grad_norm": 0.6558805108070374, + "learning_rate": 4.681506961560018e-06, + "loss": 0.682, + "step": 3602 + }, + { + "epoch": 0.9980609418282549, + "grad_norm": 0.7053347229957581, + "learning_rate": 4.681329010553622e-06, + "loss": 0.7016, + "step": 3603 + }, + { + "epoch": 0.9983379501385041, + "grad_norm": 0.6714473366737366, + "learning_rate": 4.681151013232041e-06, + "loss": 0.6639, + "step": 3604 + }, + { + "epoch": 0.9986149584487535, + "grad_norm": 0.6419603824615479, + "learning_rate": 4.680972969599053e-06, + "loss": 0.6326, + "step": 3605 + }, + { + "epoch": 0.9988919667590028, + "grad_norm": 0.6689679622650146, + "learning_rate": 4.68079487965844e-06, + "loss": 0.651, + "step": 3606 + }, + { + "epoch": 0.9991689750692521, + "grad_norm": 0.6623345613479614, + "learning_rate": 4.680616743413982e-06, + "loss": 0.6543, + "step": 3607 + }, + { + "epoch": 0.9994459833795014, + "grad_norm": 0.6692014932632446, + "learning_rate": 4.680438560869463e-06, + "loss": 0.6411, + "step": 3608 + }, + { + "epoch": 0.9997229916897507, + "grad_norm": 0.7073262333869934, + "learning_rate": 4.680260332028664e-06, + "loss": 0.6296, + "step": 3609 + }, + { + "epoch": 1.0, + "grad_norm": 0.698577880859375, + "learning_rate": 4.680082056895371e-06, + "loss": 0.6524, + "step": 3610 + }, + { + "epoch": 1.0002770083102492, + "grad_norm": 0.6731170415878296, + "learning_rate": 4.679903735473369e-06, + "loss": 0.6464, + "step": 3611 + }, + { + "epoch": 1.0005540166204987, + "grad_norm": 0.6520345211029053, + "learning_rate": 4.679725367766443e-06, + "loss": 0.6776, + "step": 3612 + }, + { + "epoch": 1.000831024930748, + "grad_norm": 0.6469970941543579, + "learning_rate": 4.679546953778381e-06, + "loss": 0.6336, + "step": 3613 + }, + { + "epoch": 1.0011080332409972, + "grad_norm": 0.627105176448822, + "learning_rate": 4.679368493512972e-06, + "loss": 0.6445, + "step": 3614 + }, + { + "epoch": 1.0013850415512466, + "grad_norm": 0.6837867498397827, + "learning_rate": 4.679189986974004e-06, + "loss": 0.6986, + "step": 3615 + }, + { + "epoch": 1.0016620498614959, + "grad_norm": 0.6424873471260071, + "learning_rate": 4.679011434165267e-06, + "loss": 0.5822, + "step": 3616 + }, + { + "epoch": 1.0019390581717451, + "grad_norm": 0.6437291502952576, + "learning_rate": 4.6788328350905535e-06, + "loss": 0.6721, + "step": 3617 + }, + { + "epoch": 1.0022160664819943, + "grad_norm": 0.6488319635391235, + "learning_rate": 4.678654189753654e-06, + "loss": 0.6695, + "step": 3618 + }, + { + "epoch": 1.0024930747922438, + "grad_norm": 0.7707616090774536, + "learning_rate": 4.678475498158363e-06, + "loss": 0.6881, + "step": 3619 + }, + { + "epoch": 1.002770083102493, + "grad_norm": 0.6751238703727722, + "learning_rate": 4.678296760308474e-06, + "loss": 0.5906, + "step": 3620 + }, + { + "epoch": 1.0030470914127423, + "grad_norm": 0.6540700793266296, + "learning_rate": 4.678117976207781e-06, + "loss": 0.5704, + "step": 3621 + }, + { + "epoch": 1.0033240997229917, + "grad_norm": 0.6810097694396973, + "learning_rate": 4.677939145860082e-06, + "loss": 0.6235, + "step": 3622 + }, + { + "epoch": 1.003601108033241, + "grad_norm": 0.6593797206878662, + "learning_rate": 4.677760269269173e-06, + "loss": 0.6184, + "step": 3623 + }, + { + "epoch": 1.0038781163434902, + "grad_norm": 0.6637284755706787, + "learning_rate": 4.677581346438852e-06, + "loss": 0.6539, + "step": 3624 + }, + { + "epoch": 1.0041551246537397, + "grad_norm": 0.6583884954452515, + "learning_rate": 4.677402377372918e-06, + "loss": 0.5725, + "step": 3625 + }, + { + "epoch": 1.004432132963989, + "grad_norm": 0.7044674754142761, + "learning_rate": 4.677223362075171e-06, + "loss": 0.7045, + "step": 3626 + }, + { + "epoch": 1.0047091412742382, + "grad_norm": 0.7017073631286621, + "learning_rate": 4.677044300549412e-06, + "loss": 0.6892, + "step": 3627 + }, + { + "epoch": 1.0049861495844876, + "grad_norm": 0.6834173798561096, + "learning_rate": 4.676865192799443e-06, + "loss": 0.5998, + "step": 3628 + }, + { + "epoch": 1.0052631578947369, + "grad_norm": 0.694014310836792, + "learning_rate": 4.6766860388290665e-06, + "loss": 0.6685, + "step": 3629 + }, + { + "epoch": 1.005540166204986, + "grad_norm": 0.7017285823822021, + "learning_rate": 4.676506838642088e-06, + "loss": 0.6742, + "step": 3630 + }, + { + "epoch": 1.0058171745152356, + "grad_norm": 0.6789602041244507, + "learning_rate": 4.6763275922423105e-06, + "loss": 0.732, + "step": 3631 + }, + { + "epoch": 1.0060941828254848, + "grad_norm": 0.724633514881134, + "learning_rate": 4.67614829963354e-06, + "loss": 0.6794, + "step": 3632 + }, + { + "epoch": 1.006371191135734, + "grad_norm": 0.6596531867980957, + "learning_rate": 4.675968960819584e-06, + "loss": 0.6641, + "step": 3633 + }, + { + "epoch": 1.0066481994459833, + "grad_norm": 0.7008945345878601, + "learning_rate": 4.67578957580425e-06, + "loss": 0.6609, + "step": 3634 + }, + { + "epoch": 1.0069252077562327, + "grad_norm": 0.662340521812439, + "learning_rate": 4.675610144591347e-06, + "loss": 0.6137, + "step": 3635 + }, + { + "epoch": 1.007202216066482, + "grad_norm": 0.703514575958252, + "learning_rate": 4.675430667184685e-06, + "loss": 0.61, + "step": 3636 + }, + { + "epoch": 1.0074792243767312, + "grad_norm": 0.675926148891449, + "learning_rate": 4.675251143588074e-06, + "loss": 0.6713, + "step": 3637 + }, + { + "epoch": 1.0077562326869807, + "grad_norm": 0.6686869263648987, + "learning_rate": 4.675071573805327e-06, + "loss": 0.6487, + "step": 3638 + }, + { + "epoch": 1.00803324099723, + "grad_norm": 0.7041005492210388, + "learning_rate": 4.674891957840255e-06, + "loss": 0.6618, + "step": 3639 + }, + { + "epoch": 1.0083102493074791, + "grad_norm": 0.7006294131278992, + "learning_rate": 4.674712295696674e-06, + "loss": 0.6877, + "step": 3640 + }, + { + "epoch": 1.0085872576177286, + "grad_norm": 0.6780726909637451, + "learning_rate": 4.674532587378396e-06, + "loss": 0.6457, + "step": 3641 + }, + { + "epoch": 1.0088642659279778, + "grad_norm": 0.6753490567207336, + "learning_rate": 4.674352832889239e-06, + "loss": 0.6326, + "step": 3642 + }, + { + "epoch": 1.009141274238227, + "grad_norm": 0.7531911730766296, + "learning_rate": 4.674173032233018e-06, + "loss": 0.6644, + "step": 3643 + }, + { + "epoch": 1.0094182825484765, + "grad_norm": 0.6730330586433411, + "learning_rate": 4.673993185413552e-06, + "loss": 0.6329, + "step": 3644 + }, + { + "epoch": 1.0096952908587258, + "grad_norm": 0.7184603214263916, + "learning_rate": 4.673813292434658e-06, + "loss": 0.6643, + "step": 3645 + }, + { + "epoch": 1.009972299168975, + "grad_norm": 0.7501634359359741, + "learning_rate": 4.673633353300157e-06, + "loss": 0.6286, + "step": 3646 + }, + { + "epoch": 1.0102493074792245, + "grad_norm": 0.7249453067779541, + "learning_rate": 4.67345336801387e-06, + "loss": 0.6791, + "step": 3647 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 0.6539937853813171, + "learning_rate": 4.673273336579616e-06, + "loss": 0.583, + "step": 3648 + }, + { + "epoch": 1.010803324099723, + "grad_norm": 0.6474573016166687, + "learning_rate": 4.673093259001219e-06, + "loss": 0.6132, + "step": 3649 + }, + { + "epoch": 1.0110803324099722, + "grad_norm": 0.684526264667511, + "learning_rate": 4.672913135282504e-06, + "loss": 0.5963, + "step": 3650 + }, + { + "epoch": 1.0113573407202217, + "grad_norm": 0.7160072922706604, + "learning_rate": 4.672732965427294e-06, + "loss": 0.6651, + "step": 3651 + }, + { + "epoch": 1.011634349030471, + "grad_norm": 0.6646113991737366, + "learning_rate": 4.672552749439413e-06, + "loss": 0.6264, + "step": 3652 + }, + { + "epoch": 1.0119113573407201, + "grad_norm": 0.6403873562812805, + "learning_rate": 4.672372487322691e-06, + "loss": 0.6132, + "step": 3653 + }, + { + "epoch": 1.0121883656509696, + "grad_norm": 0.6751793622970581, + "learning_rate": 4.672192179080952e-06, + "loss": 0.6898, + "step": 3654 + }, + { + "epoch": 1.0124653739612188, + "grad_norm": 0.7224442958831787, + "learning_rate": 4.6720118247180265e-06, + "loss": 0.6685, + "step": 3655 + }, + { + "epoch": 1.012742382271468, + "grad_norm": 0.7179122567176819, + "learning_rate": 4.671831424237743e-06, + "loss": 0.6731, + "step": 3656 + }, + { + "epoch": 1.0130193905817175, + "grad_norm": 0.6969308853149414, + "learning_rate": 4.671650977643932e-06, + "loss": 0.6385, + "step": 3657 + }, + { + "epoch": 1.0132963988919668, + "grad_norm": 0.7241250872612, + "learning_rate": 4.671470484940426e-06, + "loss": 0.6828, + "step": 3658 + }, + { + "epoch": 1.013573407202216, + "grad_norm": 0.6995605230331421, + "learning_rate": 4.671289946131055e-06, + "loss": 0.6591, + "step": 3659 + }, + { + "epoch": 1.0138504155124655, + "grad_norm": 0.6992942690849304, + "learning_rate": 4.671109361219655e-06, + "loss": 0.6478, + "step": 3660 + }, + { + "epoch": 1.0141274238227147, + "grad_norm": 0.6783244609832764, + "learning_rate": 4.670928730210057e-06, + "loss": 0.6515, + "step": 3661 + }, + { + "epoch": 1.014404432132964, + "grad_norm": 0.6964882016181946, + "learning_rate": 4.670748053106099e-06, + "loss": 0.6639, + "step": 3662 + }, + { + "epoch": 1.0146814404432134, + "grad_norm": 0.7556734681129456, + "learning_rate": 4.670567329911616e-06, + "loss": 0.6472, + "step": 3663 + }, + { + "epoch": 1.0149584487534626, + "grad_norm": 0.6755543351173401, + "learning_rate": 4.670386560630446e-06, + "loss": 0.6372, + "step": 3664 + }, + { + "epoch": 1.0152354570637119, + "grad_norm": 0.6663053631782532, + "learning_rate": 4.670205745266427e-06, + "loss": 0.6537, + "step": 3665 + }, + { + "epoch": 1.0155124653739611, + "grad_norm": 0.6746417880058289, + "learning_rate": 4.670024883823397e-06, + "loss": 0.6397, + "step": 3666 + }, + { + "epoch": 1.0157894736842106, + "grad_norm": 0.6543819904327393, + "learning_rate": 4.669843976305199e-06, + "loss": 0.6905, + "step": 3667 + }, + { + "epoch": 1.0160664819944598, + "grad_norm": 0.6693983674049377, + "learning_rate": 4.66966302271567e-06, + "loss": 0.6429, + "step": 3668 + }, + { + "epoch": 1.016343490304709, + "grad_norm": 0.6612565517425537, + "learning_rate": 4.669482023058656e-06, + "loss": 0.6052, + "step": 3669 + }, + { + "epoch": 1.0166204986149585, + "grad_norm": 0.6457585096359253, + "learning_rate": 4.6693009773379975e-06, + "loss": 0.6464, + "step": 3670 + }, + { + "epoch": 1.0168975069252078, + "grad_norm": 0.691620945930481, + "learning_rate": 4.669119885557539e-06, + "loss": 0.6357, + "step": 3671 + }, + { + "epoch": 1.017174515235457, + "grad_norm": 0.6583114862442017, + "learning_rate": 4.668938747721127e-06, + "loss": 0.6049, + "step": 3672 + }, + { + "epoch": 1.0174515235457064, + "grad_norm": 0.7024505734443665, + "learning_rate": 4.668757563832606e-06, + "loss": 0.6073, + "step": 3673 + }, + { + "epoch": 1.0177285318559557, + "grad_norm": 0.7239954471588135, + "learning_rate": 4.668576333895824e-06, + "loss": 0.672, + "step": 3674 + }, + { + "epoch": 1.018005540166205, + "grad_norm": 0.7404740452766418, + "learning_rate": 4.668395057914627e-06, + "loss": 0.671, + "step": 3675 + }, + { + "epoch": 1.0182825484764544, + "grad_norm": 0.692775547504425, + "learning_rate": 4.668213735892867e-06, + "loss": 0.6503, + "step": 3676 + }, + { + "epoch": 1.0185595567867036, + "grad_norm": 0.7027653455734253, + "learning_rate": 4.668032367834392e-06, + "loss": 0.6527, + "step": 3677 + }, + { + "epoch": 1.0188365650969529, + "grad_norm": 0.6956178545951843, + "learning_rate": 4.667850953743053e-06, + "loss": 0.6622, + "step": 3678 + }, + { + "epoch": 1.0191135734072023, + "grad_norm": 0.6842972040176392, + "learning_rate": 4.667669493622703e-06, + "loss": 0.6211, + "step": 3679 + }, + { + "epoch": 1.0193905817174516, + "grad_norm": 0.6833047866821289, + "learning_rate": 4.6674879874771925e-06, + "loss": 0.6739, + "step": 3680 + }, + { + "epoch": 1.0196675900277008, + "grad_norm": 0.7252957224845886, + "learning_rate": 4.667306435310379e-06, + "loss": 0.6258, + "step": 3681 + }, + { + "epoch": 1.01994459833795, + "grad_norm": 0.6874592900276184, + "learning_rate": 4.667124837126114e-06, + "loss": 0.6801, + "step": 3682 + }, + { + "epoch": 1.0202216066481995, + "grad_norm": 0.8430461883544922, + "learning_rate": 4.666943192928254e-06, + "loss": 0.6102, + "step": 3683 + }, + { + "epoch": 1.0204986149584487, + "grad_norm": 0.7195748090744019, + "learning_rate": 4.666761502720657e-06, + "loss": 0.6824, + "step": 3684 + }, + { + "epoch": 1.020775623268698, + "grad_norm": 0.6784130930900574, + "learning_rate": 4.666579766507179e-06, + "loss": 0.6756, + "step": 3685 + }, + { + "epoch": 1.0210526315789474, + "grad_norm": 0.7014387845993042, + "learning_rate": 4.66639798429168e-06, + "loss": 0.6086, + "step": 3686 + }, + { + "epoch": 1.0213296398891967, + "grad_norm": 0.6831591129302979, + "learning_rate": 4.66621615607802e-06, + "loss": 0.6833, + "step": 3687 + }, + { + "epoch": 1.021606648199446, + "grad_norm": 0.6831374168395996, + "learning_rate": 4.6660342818700596e-06, + "loss": 0.6541, + "step": 3688 + }, + { + "epoch": 1.0218836565096954, + "grad_norm": 0.6437398791313171, + "learning_rate": 4.665852361671659e-06, + "loss": 0.6125, + "step": 3689 + }, + { + "epoch": 1.0221606648199446, + "grad_norm": 0.670684278011322, + "learning_rate": 4.665670395486683e-06, + "loss": 0.6064, + "step": 3690 + }, + { + "epoch": 1.0224376731301938, + "grad_norm": 0.6811419725418091, + "learning_rate": 4.665488383318992e-06, + "loss": 0.6238, + "step": 3691 + }, + { + "epoch": 1.0227146814404433, + "grad_norm": 0.690031111240387, + "learning_rate": 4.665306325172453e-06, + "loss": 0.6418, + "step": 3692 + }, + { + "epoch": 1.0229916897506925, + "grad_norm": 0.6779752969741821, + "learning_rate": 4.665124221050931e-06, + "loss": 0.6248, + "step": 3693 + }, + { + "epoch": 1.0232686980609418, + "grad_norm": 0.726576030254364, + "learning_rate": 4.6649420709582945e-06, + "loss": 0.6345, + "step": 3694 + }, + { + "epoch": 1.0235457063711912, + "grad_norm": 0.7005026340484619, + "learning_rate": 4.664759874898408e-06, + "loss": 0.6493, + "step": 3695 + }, + { + "epoch": 1.0238227146814405, + "grad_norm": 0.7161099314689636, + "learning_rate": 4.664577632875141e-06, + "loss": 0.6785, + "step": 3696 + }, + { + "epoch": 1.0240997229916897, + "grad_norm": 0.6901417970657349, + "learning_rate": 4.664395344892363e-06, + "loss": 0.6378, + "step": 3697 + }, + { + "epoch": 1.024376731301939, + "grad_norm": 0.687687337398529, + "learning_rate": 4.664213010953945e-06, + "loss": 0.6133, + "step": 3698 + }, + { + "epoch": 1.0246537396121884, + "grad_norm": 0.7225800156593323, + "learning_rate": 4.664030631063758e-06, + "loss": 0.6473, + "step": 3699 + }, + { + "epoch": 1.0249307479224377, + "grad_norm": 0.6865379214286804, + "learning_rate": 4.663848205225674e-06, + "loss": 0.6705, + "step": 3700 + }, + { + "epoch": 1.025207756232687, + "grad_norm": 0.6914872527122498, + "learning_rate": 4.6636657334435675e-06, + "loss": 0.6339, + "step": 3701 + }, + { + "epoch": 1.0254847645429364, + "grad_norm": 0.687730073928833, + "learning_rate": 4.663483215721311e-06, + "loss": 0.6886, + "step": 3702 + }, + { + "epoch": 1.0257617728531856, + "grad_norm": 0.6916108727455139, + "learning_rate": 4.663300652062782e-06, + "loss": 0.6786, + "step": 3703 + }, + { + "epoch": 1.0260387811634348, + "grad_norm": 0.7067069411277771, + "learning_rate": 4.663118042471854e-06, + "loss": 0.657, + "step": 3704 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.7158690094947815, + "learning_rate": 4.662935386952408e-06, + "loss": 0.5958, + "step": 3705 + }, + { + "epoch": 1.0265927977839335, + "grad_norm": 0.72146075963974, + "learning_rate": 4.6627526855083196e-06, + "loss": 0.6631, + "step": 3706 + }, + { + "epoch": 1.0268698060941828, + "grad_norm": 0.685907244682312, + "learning_rate": 4.662569938143468e-06, + "loss": 0.6688, + "step": 3707 + }, + { + "epoch": 1.0271468144044322, + "grad_norm": 0.6628360748291016, + "learning_rate": 4.6623871448617345e-06, + "loss": 0.6427, + "step": 3708 + }, + { + "epoch": 1.0274238227146815, + "grad_norm": 0.7283874154090881, + "learning_rate": 4.662204305666999e-06, + "loss": 0.595, + "step": 3709 + }, + { + "epoch": 1.0277008310249307, + "grad_norm": 0.7178004384040833, + "learning_rate": 4.662021420563145e-06, + "loss": 0.6544, + "step": 3710 + }, + { + "epoch": 1.02797783933518, + "grad_norm": 0.6764352321624756, + "learning_rate": 4.6618384895540554e-06, + "loss": 0.5956, + "step": 3711 + }, + { + "epoch": 1.0282548476454294, + "grad_norm": 0.7078940868377686, + "learning_rate": 4.661655512643614e-06, + "loss": 0.6723, + "step": 3712 + }, + { + "epoch": 1.0285318559556786, + "grad_norm": 0.7133868336677551, + "learning_rate": 4.661472489835705e-06, + "loss": 0.6424, + "step": 3713 + }, + { + "epoch": 1.0288088642659279, + "grad_norm": 0.6933633685112, + "learning_rate": 4.661289421134215e-06, + "loss": 0.634, + "step": 3714 + }, + { + "epoch": 1.0290858725761773, + "grad_norm": 0.6836314797401428, + "learning_rate": 4.661106306543033e-06, + "loss": 0.6222, + "step": 3715 + }, + { + "epoch": 1.0293628808864266, + "grad_norm": 0.6919975876808167, + "learning_rate": 4.660923146066044e-06, + "loss": 0.6397, + "step": 3716 + }, + { + "epoch": 1.0296398891966758, + "grad_norm": 0.6854752898216248, + "learning_rate": 4.660739939707138e-06, + "loss": 0.6793, + "step": 3717 + }, + { + "epoch": 1.0299168975069253, + "grad_norm": 0.6768549680709839, + "learning_rate": 4.660556687470206e-06, + "loss": 0.6808, + "step": 3718 + }, + { + "epoch": 1.0301939058171745, + "grad_norm": 0.7218035459518433, + "learning_rate": 4.660373389359137e-06, + "loss": 0.6675, + "step": 3719 + }, + { + "epoch": 1.0304709141274238, + "grad_norm": 0.6770936250686646, + "learning_rate": 4.660190045377825e-06, + "loss": 0.6293, + "step": 3720 + }, + { + "epoch": 1.0307479224376732, + "grad_norm": 0.6360563635826111, + "learning_rate": 4.660006655530161e-06, + "loss": 0.5993, + "step": 3721 + }, + { + "epoch": 1.0310249307479225, + "grad_norm": 0.6777840852737427, + "learning_rate": 4.65982321982004e-06, + "loss": 0.6694, + "step": 3722 + }, + { + "epoch": 1.0313019390581717, + "grad_norm": 0.7174295783042908, + "learning_rate": 4.659639738251356e-06, + "loss": 0.6579, + "step": 3723 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 0.7186366319656372, + "learning_rate": 4.659456210828006e-06, + "loss": 0.689, + "step": 3724 + }, + { + "epoch": 1.0318559556786704, + "grad_norm": 0.6772783398628235, + "learning_rate": 4.659272637553885e-06, + "loss": 0.611, + "step": 3725 + }, + { + "epoch": 1.0321329639889196, + "grad_norm": 0.7406778335571289, + "learning_rate": 4.659089018432893e-06, + "loss": 0.6801, + "step": 3726 + }, + { + "epoch": 1.032409972299169, + "grad_norm": 0.7386046051979065, + "learning_rate": 4.6589053534689266e-06, + "loss": 0.6512, + "step": 3727 + }, + { + "epoch": 1.0326869806094183, + "grad_norm": 0.6807011365890503, + "learning_rate": 4.658721642665887e-06, + "loss": 0.5922, + "step": 3728 + }, + { + "epoch": 1.0329639889196676, + "grad_norm": 0.7346801161766052, + "learning_rate": 4.658537886027673e-06, + "loss": 0.6891, + "step": 3729 + }, + { + "epoch": 1.0332409972299168, + "grad_norm": 0.7221609354019165, + "learning_rate": 4.6583540835581885e-06, + "loss": 0.6662, + "step": 3730 + }, + { + "epoch": 1.0335180055401663, + "grad_norm": 0.6834601759910583, + "learning_rate": 4.6581702352613345e-06, + "loss": 0.6249, + "step": 3731 + }, + { + "epoch": 1.0337950138504155, + "grad_norm": 0.6952083110809326, + "learning_rate": 4.657986341141015e-06, + "loss": 0.6403, + "step": 3732 + }, + { + "epoch": 1.0340720221606647, + "grad_norm": 0.7092493772506714, + "learning_rate": 4.657802401201135e-06, + "loss": 0.6295, + "step": 3733 + }, + { + "epoch": 1.0343490304709142, + "grad_norm": 0.7145261764526367, + "learning_rate": 4.657618415445599e-06, + "loss": 0.6624, + "step": 3734 + }, + { + "epoch": 1.0346260387811634, + "grad_norm": 0.7718804478645325, + "learning_rate": 4.657434383878314e-06, + "loss": 0.6584, + "step": 3735 + }, + { + "epoch": 1.0349030470914127, + "grad_norm": 0.7761196494102478, + "learning_rate": 4.657250306503189e-06, + "loss": 0.6235, + "step": 3736 + }, + { + "epoch": 1.0351800554016621, + "grad_norm": 0.6836169362068176, + "learning_rate": 4.6570661833241295e-06, + "loss": 0.6169, + "step": 3737 + }, + { + "epoch": 1.0354570637119114, + "grad_norm": 0.7584412097930908, + "learning_rate": 4.656882014345046e-06, + "loss": 0.6676, + "step": 3738 + }, + { + "epoch": 1.0357340720221606, + "grad_norm": 0.733180820941925, + "learning_rate": 4.656697799569851e-06, + "loss": 0.6969, + "step": 3739 + }, + { + "epoch": 1.03601108033241, + "grad_norm": 0.7027671933174133, + "learning_rate": 4.656513539002452e-06, + "loss": 0.6319, + "step": 3740 + }, + { + "epoch": 1.0362880886426593, + "grad_norm": 0.7461979985237122, + "learning_rate": 4.656329232646765e-06, + "loss": 0.6715, + "step": 3741 + }, + { + "epoch": 1.0365650969529085, + "grad_norm": 0.7621151208877563, + "learning_rate": 4.656144880506701e-06, + "loss": 0.6923, + "step": 3742 + }, + { + "epoch": 1.0368421052631578, + "grad_norm": 0.7271478176116943, + "learning_rate": 4.655960482586175e-06, + "loss": 0.6726, + "step": 3743 + }, + { + "epoch": 1.0371191135734072, + "grad_norm": 0.6788163185119629, + "learning_rate": 4.655776038889103e-06, + "loss": 0.6465, + "step": 3744 + }, + { + "epoch": 1.0373961218836565, + "grad_norm": 0.6891070604324341, + "learning_rate": 4.6555915494193996e-06, + "loss": 0.6635, + "step": 3745 + }, + { + "epoch": 1.0376731301939057, + "grad_norm": 0.6785054802894592, + "learning_rate": 4.655407014180984e-06, + "loss": 0.6266, + "step": 3746 + }, + { + "epoch": 1.0379501385041552, + "grad_norm": 0.6859633922576904, + "learning_rate": 4.655222433177772e-06, + "loss": 0.683, + "step": 3747 + }, + { + "epoch": 1.0382271468144044, + "grad_norm": 0.6922705173492432, + "learning_rate": 4.655037806413684e-06, + "loss": 0.6311, + "step": 3748 + }, + { + "epoch": 1.0385041551246537, + "grad_norm": 0.6790265440940857, + "learning_rate": 4.654853133892639e-06, + "loss": 0.6471, + "step": 3749 + }, + { + "epoch": 1.0387811634349031, + "grad_norm": 0.705460786819458, + "learning_rate": 4.654668415618561e-06, + "loss": 0.6347, + "step": 3750 + }, + { + "epoch": 1.0390581717451524, + "grad_norm": 0.7426798343658447, + "learning_rate": 4.6544836515953695e-06, + "loss": 0.7269, + "step": 3751 + }, + { + "epoch": 1.0393351800554016, + "grad_norm": 0.7155759334564209, + "learning_rate": 4.654298841826988e-06, + "loss": 0.6246, + "step": 3752 + }, + { + "epoch": 1.039612188365651, + "grad_norm": 0.6881625652313232, + "learning_rate": 4.654113986317341e-06, + "loss": 0.6356, + "step": 3753 + }, + { + "epoch": 1.0398891966759003, + "grad_norm": 0.7087181210517883, + "learning_rate": 4.653929085070352e-06, + "loss": 0.6633, + "step": 3754 + }, + { + "epoch": 1.0401662049861495, + "grad_norm": 0.7184396386146545, + "learning_rate": 4.653744138089949e-06, + "loss": 0.6779, + "step": 3755 + }, + { + "epoch": 1.040443213296399, + "grad_norm": 0.7010110020637512, + "learning_rate": 4.653559145380058e-06, + "loss": 0.671, + "step": 3756 + }, + { + "epoch": 1.0407202216066482, + "grad_norm": 0.7426394820213318, + "learning_rate": 4.653374106944606e-06, + "loss": 0.6824, + "step": 3757 + }, + { + "epoch": 1.0409972299168975, + "grad_norm": 0.671944797039032, + "learning_rate": 4.653189022787523e-06, + "loss": 0.6487, + "step": 3758 + }, + { + "epoch": 1.0412742382271467, + "grad_norm": 0.6617016792297363, + "learning_rate": 4.6530038929127385e-06, + "loss": 0.6362, + "step": 3759 + }, + { + "epoch": 1.0415512465373962, + "grad_norm": 0.6958049535751343, + "learning_rate": 4.652818717324182e-06, + "loss": 0.7073, + "step": 3760 + }, + { + "epoch": 1.0418282548476454, + "grad_norm": 0.6825550198554993, + "learning_rate": 4.652633496025788e-06, + "loss": 0.6001, + "step": 3761 + }, + { + "epoch": 1.0421052631578946, + "grad_norm": 0.6797915101051331, + "learning_rate": 4.652448229021488e-06, + "loss": 0.6436, + "step": 3762 + }, + { + "epoch": 1.042382271468144, + "grad_norm": 0.6714237332344055, + "learning_rate": 4.652262916315213e-06, + "loss": 0.6443, + "step": 3763 + }, + { + "epoch": 1.0426592797783933, + "grad_norm": 0.6620033979415894, + "learning_rate": 4.652077557910902e-06, + "loss": 0.6283, + "step": 3764 + }, + { + "epoch": 1.0429362880886426, + "grad_norm": 0.6751744747161865, + "learning_rate": 4.6518921538124885e-06, + "loss": 0.6158, + "step": 3765 + }, + { + "epoch": 1.043213296398892, + "grad_norm": 0.723050594329834, + "learning_rate": 4.651706704023908e-06, + "loss": 0.6551, + "step": 3766 + }, + { + "epoch": 1.0434903047091413, + "grad_norm": 0.6662552952766418, + "learning_rate": 4.651521208549101e-06, + "loss": 0.6189, + "step": 3767 + }, + { + "epoch": 1.0437673130193905, + "grad_norm": 0.6784934401512146, + "learning_rate": 4.651335667392003e-06, + "loss": 0.6286, + "step": 3768 + }, + { + "epoch": 1.04404432132964, + "grad_norm": 0.6911255121231079, + "learning_rate": 4.6511500805565555e-06, + "loss": 0.6145, + "step": 3769 + }, + { + "epoch": 1.0443213296398892, + "grad_norm": 0.7118515372276306, + "learning_rate": 4.6509644480466984e-06, + "loss": 0.6615, + "step": 3770 + }, + { + "epoch": 1.0445983379501385, + "grad_norm": 0.6700476408004761, + "learning_rate": 4.650778769866372e-06, + "loss": 0.6113, + "step": 3771 + }, + { + "epoch": 1.044875346260388, + "grad_norm": 0.7546408176422119, + "learning_rate": 4.650593046019521e-06, + "loss": 0.6646, + "step": 3772 + }, + { + "epoch": 1.0451523545706372, + "grad_norm": 0.7420517206192017, + "learning_rate": 4.650407276510088e-06, + "loss": 0.7298, + "step": 3773 + }, + { + "epoch": 1.0454293628808864, + "grad_norm": 0.6817473769187927, + "learning_rate": 4.6502214613420164e-06, + "loss": 0.6105, + "step": 3774 + }, + { + "epoch": 1.0457063711911356, + "grad_norm": 0.7240308523178101, + "learning_rate": 4.6500356005192514e-06, + "loss": 0.6463, + "step": 3775 + }, + { + "epoch": 1.045983379501385, + "grad_norm": 0.6789447069168091, + "learning_rate": 4.649849694045741e-06, + "loss": 0.654, + "step": 3776 + }, + { + "epoch": 1.0462603878116343, + "grad_norm": 0.6913023591041565, + "learning_rate": 4.649663741925431e-06, + "loss": 0.6083, + "step": 3777 + }, + { + "epoch": 1.0465373961218836, + "grad_norm": 0.6867649555206299, + "learning_rate": 4.649477744162271e-06, + "loss": 0.6776, + "step": 3778 + }, + { + "epoch": 1.046814404432133, + "grad_norm": 0.6704279780387878, + "learning_rate": 4.6492917007602075e-06, + "loss": 0.6611, + "step": 3779 + }, + { + "epoch": 1.0470914127423823, + "grad_norm": 0.6176725029945374, + "learning_rate": 4.6491056117231936e-06, + "loss": 0.5878, + "step": 3780 + }, + { + "epoch": 1.0473684210526315, + "grad_norm": 0.6866704821586609, + "learning_rate": 4.648919477055179e-06, + "loss": 0.6346, + "step": 3781 + }, + { + "epoch": 1.047645429362881, + "grad_norm": 0.6737282872200012, + "learning_rate": 4.648733296760117e-06, + "loss": 0.6675, + "step": 3782 + }, + { + "epoch": 1.0479224376731302, + "grad_norm": 0.6623876690864563, + "learning_rate": 4.6485470708419595e-06, + "loss": 0.6386, + "step": 3783 + }, + { + "epoch": 1.0481994459833794, + "grad_norm": 0.6881468892097473, + "learning_rate": 4.648360799304661e-06, + "loss": 0.6733, + "step": 3784 + }, + { + "epoch": 1.048476454293629, + "grad_norm": 0.7181234955787659, + "learning_rate": 4.648174482152176e-06, + "loss": 0.6804, + "step": 3785 + }, + { + "epoch": 1.0487534626038781, + "grad_norm": 0.7076879739761353, + "learning_rate": 4.647988119388461e-06, + "loss": 0.6428, + "step": 3786 + }, + { + "epoch": 1.0490304709141274, + "grad_norm": 0.7163893580436707, + "learning_rate": 4.647801711017473e-06, + "loss": 0.659, + "step": 3787 + }, + { + "epoch": 1.0493074792243768, + "grad_norm": 0.6863222122192383, + "learning_rate": 4.64761525704317e-06, + "loss": 0.6169, + "step": 3788 + }, + { + "epoch": 1.049584487534626, + "grad_norm": 0.6541346907615662, + "learning_rate": 4.64742875746951e-06, + "loss": 0.6575, + "step": 3789 + }, + { + "epoch": 1.0498614958448753, + "grad_norm": 0.7049393057823181, + "learning_rate": 4.647242212300453e-06, + "loss": 0.6744, + "step": 3790 + }, + { + "epoch": 1.0501385041551246, + "grad_norm": 0.697417140007019, + "learning_rate": 4.647055621539962e-06, + "loss": 0.6321, + "step": 3791 + }, + { + "epoch": 1.050415512465374, + "grad_norm": 0.7294650077819824, + "learning_rate": 4.646868985191996e-06, + "loss": 0.7125, + "step": 3792 + }, + { + "epoch": 1.0506925207756233, + "grad_norm": 0.6390814185142517, + "learning_rate": 4.64668230326052e-06, + "loss": 0.5747, + "step": 3793 + }, + { + "epoch": 1.0509695290858725, + "grad_norm": 0.6700131893157959, + "learning_rate": 4.6464955757494955e-06, + "loss": 0.6229, + "step": 3794 + }, + { + "epoch": 1.051246537396122, + "grad_norm": 0.7338569760322571, + "learning_rate": 4.646308802662889e-06, + "loss": 0.6474, + "step": 3795 + }, + { + "epoch": 1.0515235457063712, + "grad_norm": 0.6908360123634338, + "learning_rate": 4.646121984004666e-06, + "loss": 0.6504, + "step": 3796 + }, + { + "epoch": 1.0518005540166204, + "grad_norm": 0.6733314394950867, + "learning_rate": 4.645935119778792e-06, + "loss": 0.665, + "step": 3797 + }, + { + "epoch": 1.0520775623268699, + "grad_norm": 0.7230425477027893, + "learning_rate": 4.6457482099892355e-06, + "loss": 0.6875, + "step": 3798 + }, + { + "epoch": 1.0523545706371191, + "grad_norm": 0.6543940305709839, + "learning_rate": 4.645561254639965e-06, + "loss": 0.648, + "step": 3799 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.718337893486023, + "learning_rate": 4.645374253734949e-06, + "loss": 0.6047, + "step": 3800 + }, + { + "epoch": 1.0529085872576178, + "grad_norm": 0.6767460703849792, + "learning_rate": 4.64518720727816e-06, + "loss": 0.6132, + "step": 3801 + }, + { + "epoch": 1.053185595567867, + "grad_norm": 0.6737857460975647, + "learning_rate": 4.645000115273568e-06, + "loss": 0.6429, + "step": 3802 + }, + { + "epoch": 1.0534626038781163, + "grad_norm": 0.7279229760169983, + "learning_rate": 4.644812977725146e-06, + "loss": 0.6202, + "step": 3803 + }, + { + "epoch": 1.0537396121883655, + "grad_norm": 0.6861441731452942, + "learning_rate": 4.644625794636867e-06, + "loss": 0.6381, + "step": 3804 + }, + { + "epoch": 1.054016620498615, + "grad_norm": 0.6784233450889587, + "learning_rate": 4.644438566012706e-06, + "loss": 0.6411, + "step": 3805 + }, + { + "epoch": 1.0542936288088642, + "grad_norm": 0.672813892364502, + "learning_rate": 4.644251291856637e-06, + "loss": 0.6455, + "step": 3806 + }, + { + "epoch": 1.0545706371191135, + "grad_norm": 0.6707661747932434, + "learning_rate": 4.6440639721726375e-06, + "loss": 0.648, + "step": 3807 + }, + { + "epoch": 1.054847645429363, + "grad_norm": 0.7086052894592285, + "learning_rate": 4.643876606964685e-06, + "loss": 0.6721, + "step": 3808 + }, + { + "epoch": 1.0551246537396122, + "grad_norm": 0.688983142375946, + "learning_rate": 4.643689196236757e-06, + "loss": 0.6487, + "step": 3809 + }, + { + "epoch": 1.0554016620498614, + "grad_norm": 0.6924951672554016, + "learning_rate": 4.643501739992834e-06, + "loss": 0.6605, + "step": 3810 + }, + { + "epoch": 1.0556786703601109, + "grad_norm": 0.6518945693969727, + "learning_rate": 4.643314238236893e-06, + "loss": 0.5965, + "step": 3811 + }, + { + "epoch": 1.05595567867036, + "grad_norm": 0.6460810899734497, + "learning_rate": 4.643126690972919e-06, + "loss": 0.6296, + "step": 3812 + }, + { + "epoch": 1.0562326869806093, + "grad_norm": 0.671865701675415, + "learning_rate": 4.642939098204892e-06, + "loss": 0.6657, + "step": 3813 + }, + { + "epoch": 1.0565096952908588, + "grad_norm": 0.6928796172142029, + "learning_rate": 4.642751459936796e-06, + "loss": 0.6628, + "step": 3814 + }, + { + "epoch": 1.056786703601108, + "grad_norm": 0.7006489634513855, + "learning_rate": 4.642563776172614e-06, + "loss": 0.6696, + "step": 3815 + }, + { + "epoch": 1.0570637119113573, + "grad_norm": 0.7267358303070068, + "learning_rate": 4.642376046916331e-06, + "loss": 0.6708, + "step": 3816 + }, + { + "epoch": 1.0573407202216067, + "grad_norm": 0.6783285737037659, + "learning_rate": 4.642188272171933e-06, + "loss": 0.6437, + "step": 3817 + }, + { + "epoch": 1.057617728531856, + "grad_norm": 0.6970617771148682, + "learning_rate": 4.642000451943409e-06, + "loss": 0.6242, + "step": 3818 + }, + { + "epoch": 1.0578947368421052, + "grad_norm": 0.7154437899589539, + "learning_rate": 4.641812586234745e-06, + "loss": 0.6422, + "step": 3819 + }, + { + "epoch": 1.0581717451523547, + "grad_norm": 0.6935372948646545, + "learning_rate": 4.64162467504993e-06, + "loss": 0.6757, + "step": 3820 + }, + { + "epoch": 1.058448753462604, + "grad_norm": 0.7333564758300781, + "learning_rate": 4.641436718392954e-06, + "loss": 0.632, + "step": 3821 + }, + { + "epoch": 1.0587257617728532, + "grad_norm": 0.7167134881019592, + "learning_rate": 4.641248716267808e-06, + "loss": 0.6473, + "step": 3822 + }, + { + "epoch": 1.0590027700831024, + "grad_norm": 0.7148312330245972, + "learning_rate": 4.641060668678483e-06, + "loss": 0.6946, + "step": 3823 + }, + { + "epoch": 1.0592797783933519, + "grad_norm": 0.7001869678497314, + "learning_rate": 4.640872575628973e-06, + "loss": 0.6365, + "step": 3824 + }, + { + "epoch": 1.059556786703601, + "grad_norm": 0.680051326751709, + "learning_rate": 4.640684437123271e-06, + "loss": 0.6184, + "step": 3825 + }, + { + "epoch": 1.0598337950138503, + "grad_norm": 0.715980589389801, + "learning_rate": 4.640496253165371e-06, + "loss": 0.6697, + "step": 3826 + }, + { + "epoch": 1.0601108033240998, + "grad_norm": 0.70005863904953, + "learning_rate": 4.6403080237592704e-06, + "loss": 0.6385, + "step": 3827 + }, + { + "epoch": 1.060387811634349, + "grad_norm": 0.67072993516922, + "learning_rate": 4.640119748908964e-06, + "loss": 0.58, + "step": 3828 + }, + { + "epoch": 1.0606648199445983, + "grad_norm": 0.6752764582633972, + "learning_rate": 4.6399314286184496e-06, + "loss": 0.634, + "step": 3829 + }, + { + "epoch": 1.0609418282548477, + "grad_norm": 0.676100492477417, + "learning_rate": 4.639743062891727e-06, + "loss": 0.6329, + "step": 3830 + }, + { + "epoch": 1.061218836565097, + "grad_norm": 0.6580525636672974, + "learning_rate": 4.639554651732794e-06, + "loss": 0.5743, + "step": 3831 + }, + { + "epoch": 1.0614958448753462, + "grad_norm": 0.7047877311706543, + "learning_rate": 4.639366195145653e-06, + "loss": 0.6801, + "step": 3832 + }, + { + "epoch": 1.0617728531855957, + "grad_norm": 0.6486926078796387, + "learning_rate": 4.639177693134304e-06, + "loss": 0.6117, + "step": 3833 + }, + { + "epoch": 1.062049861495845, + "grad_norm": 0.704925000667572, + "learning_rate": 4.638989145702749e-06, + "loss": 0.6711, + "step": 3834 + }, + { + "epoch": 1.0623268698060941, + "grad_norm": 0.6557621359825134, + "learning_rate": 4.638800552854993e-06, + "loss": 0.5999, + "step": 3835 + }, + { + "epoch": 1.0626038781163434, + "grad_norm": 0.6929731369018555, + "learning_rate": 4.638611914595038e-06, + "loss": 0.6597, + "step": 3836 + }, + { + "epoch": 1.0628808864265928, + "grad_norm": 0.7148190140724182, + "learning_rate": 4.638423230926891e-06, + "loss": 0.6758, + "step": 3837 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 0.6981825828552246, + "learning_rate": 4.638234501854559e-06, + "loss": 0.6246, + "step": 3838 + }, + { + "epoch": 1.0634349030470913, + "grad_norm": 0.6970887780189514, + "learning_rate": 4.638045727382048e-06, + "loss": 0.6225, + "step": 3839 + }, + { + "epoch": 1.0637119113573408, + "grad_norm": 0.6663398146629333, + "learning_rate": 4.637856907513366e-06, + "loss": 0.6148, + "step": 3840 + }, + { + "epoch": 1.06398891966759, + "grad_norm": 0.7192431092262268, + "learning_rate": 4.637668042252522e-06, + "loss": 0.6335, + "step": 3841 + }, + { + "epoch": 1.0642659279778393, + "grad_norm": 0.6780756115913391, + "learning_rate": 4.637479131603526e-06, + "loss": 0.6896, + "step": 3842 + }, + { + "epoch": 1.0645429362880887, + "grad_norm": 0.7636496424674988, + "learning_rate": 4.63729017557039e-06, + "loss": 0.6415, + "step": 3843 + }, + { + "epoch": 1.064819944598338, + "grad_norm": 0.6857926845550537, + "learning_rate": 4.637101174157126e-06, + "loss": 0.5899, + "step": 3844 + }, + { + "epoch": 1.0650969529085872, + "grad_norm": 0.6750476360321045, + "learning_rate": 4.636912127367746e-06, + "loss": 0.6186, + "step": 3845 + }, + { + "epoch": 1.0653739612188367, + "grad_norm": 0.6938686966896057, + "learning_rate": 4.636723035206265e-06, + "loss": 0.6909, + "step": 3846 + }, + { + "epoch": 1.065650969529086, + "grad_norm": 0.7091078162193298, + "learning_rate": 4.636533897676696e-06, + "loss": 0.6578, + "step": 3847 + }, + { + "epoch": 1.0659279778393351, + "grad_norm": 0.6912356615066528, + "learning_rate": 4.636344714783058e-06, + "loss": 0.6169, + "step": 3848 + }, + { + "epoch": 1.0662049861495846, + "grad_norm": 0.6690759658813477, + "learning_rate": 4.636155486529365e-06, + "loss": 0.6258, + "step": 3849 + }, + { + "epoch": 1.0664819944598338, + "grad_norm": 0.704440712928772, + "learning_rate": 4.6359662129196355e-06, + "loss": 0.6696, + "step": 3850 + }, + { + "epoch": 1.066759002770083, + "grad_norm": 0.6986096501350403, + "learning_rate": 4.63577689395789e-06, + "loss": 0.6342, + "step": 3851 + }, + { + "epoch": 1.0670360110803325, + "grad_norm": 0.7509374618530273, + "learning_rate": 4.635587529648146e-06, + "loss": 0.6382, + "step": 3852 + }, + { + "epoch": 1.0673130193905818, + "grad_norm": 0.6818401217460632, + "learning_rate": 4.635398119994425e-06, + "loss": 0.6343, + "step": 3853 + }, + { + "epoch": 1.067590027700831, + "grad_norm": 0.6807084083557129, + "learning_rate": 4.6352086650007484e-06, + "loss": 0.6749, + "step": 3854 + }, + { + "epoch": 1.0678670360110802, + "grad_norm": 0.6697701811790466, + "learning_rate": 4.635019164671141e-06, + "loss": 0.6586, + "step": 3855 + }, + { + "epoch": 1.0681440443213297, + "grad_norm": 0.6940995454788208, + "learning_rate": 4.634829619009622e-06, + "loss": 0.6672, + "step": 3856 + }, + { + "epoch": 1.068421052631579, + "grad_norm": 0.7035492658615112, + "learning_rate": 4.63464002802022e-06, + "loss": 0.6256, + "step": 3857 + }, + { + "epoch": 1.0686980609418282, + "grad_norm": 0.7253122329711914, + "learning_rate": 4.634450391706959e-06, + "loss": 0.6276, + "step": 3858 + }, + { + "epoch": 1.0689750692520776, + "grad_norm": 0.7146328091621399, + "learning_rate": 4.6342607100738645e-06, + "loss": 0.6419, + "step": 3859 + }, + { + "epoch": 1.0692520775623269, + "grad_norm": 0.7397799491882324, + "learning_rate": 4.634070983124965e-06, + "loss": 0.6172, + "step": 3860 + }, + { + "epoch": 1.0695290858725761, + "grad_norm": 0.6944563388824463, + "learning_rate": 4.633881210864289e-06, + "loss": 0.6411, + "step": 3861 + }, + { + "epoch": 1.0698060941828256, + "grad_norm": 0.7174142599105835, + "learning_rate": 4.633691393295865e-06, + "loss": 0.6253, + "step": 3862 + }, + { + "epoch": 1.0700831024930748, + "grad_norm": 0.6977531313896179, + "learning_rate": 4.633501530423724e-06, + "loss": 0.6395, + "step": 3863 + }, + { + "epoch": 1.070360110803324, + "grad_norm": 0.6565867066383362, + "learning_rate": 4.6333116222518985e-06, + "loss": 0.6364, + "step": 3864 + }, + { + "epoch": 1.0706371191135735, + "grad_norm": 0.724623441696167, + "learning_rate": 4.633121668784417e-06, + "loss": 0.6946, + "step": 3865 + }, + { + "epoch": 1.0709141274238227, + "grad_norm": 0.6574693918228149, + "learning_rate": 4.632931670025317e-06, + "loss": 0.5963, + "step": 3866 + }, + { + "epoch": 1.071191135734072, + "grad_norm": 0.6685285568237305, + "learning_rate": 4.63274162597863e-06, + "loss": 0.6138, + "step": 3867 + }, + { + "epoch": 1.0714681440443212, + "grad_norm": 0.679956316947937, + "learning_rate": 4.632551536648392e-06, + "loss": 0.6449, + "step": 3868 + }, + { + "epoch": 1.0717451523545707, + "grad_norm": 0.7049286961555481, + "learning_rate": 4.6323614020386385e-06, + "loss": 0.6302, + "step": 3869 + }, + { + "epoch": 1.07202216066482, + "grad_norm": 0.6563496589660645, + "learning_rate": 4.632171222153407e-06, + "loss": 0.657, + "step": 3870 + }, + { + "epoch": 1.0722991689750692, + "grad_norm": 0.6608796119689941, + "learning_rate": 4.631980996996737e-06, + "loss": 0.6301, + "step": 3871 + }, + { + "epoch": 1.0725761772853186, + "grad_norm": 0.7565630674362183, + "learning_rate": 4.6317907265726645e-06, + "loss": 0.6184, + "step": 3872 + }, + { + "epoch": 1.0728531855955679, + "grad_norm": 0.6970774531364441, + "learning_rate": 4.631600410885231e-06, + "loss": 0.6511, + "step": 3873 + }, + { + "epoch": 1.073130193905817, + "grad_norm": 0.6813990473747253, + "learning_rate": 4.6314100499384775e-06, + "loss": 0.6551, + "step": 3874 + }, + { + "epoch": 1.0734072022160666, + "grad_norm": 0.655053436756134, + "learning_rate": 4.631219643736445e-06, + "loss": 0.6385, + "step": 3875 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 0.6602583527565002, + "learning_rate": 4.631029192283178e-06, + "loss": 0.6619, + "step": 3876 + }, + { + "epoch": 1.073961218836565, + "grad_norm": 0.6874777674674988, + "learning_rate": 4.630838695582718e-06, + "loss": 0.6414, + "step": 3877 + }, + { + "epoch": 1.0742382271468145, + "grad_norm": 0.6865057349205017, + "learning_rate": 4.6306481536391125e-06, + "loss": 0.6874, + "step": 3878 + }, + { + "epoch": 1.0745152354570637, + "grad_norm": 0.6830350160598755, + "learning_rate": 4.630457566456405e-06, + "loss": 0.6246, + "step": 3879 + }, + { + "epoch": 1.074792243767313, + "grad_norm": 0.6801131367683411, + "learning_rate": 4.630266934038642e-06, + "loss": 0.6192, + "step": 3880 + }, + { + "epoch": 1.0750692520775624, + "grad_norm": 0.7242031693458557, + "learning_rate": 4.630076256389872e-06, + "loss": 0.6639, + "step": 3881 + }, + { + "epoch": 1.0753462603878117, + "grad_norm": 0.6798548698425293, + "learning_rate": 4.629885533514143e-06, + "loss": 0.6434, + "step": 3882 + }, + { + "epoch": 1.075623268698061, + "grad_norm": 0.65250164270401, + "learning_rate": 4.629694765415506e-06, + "loss": 0.663, + "step": 3883 + }, + { + "epoch": 1.0759002770083101, + "grad_norm": 0.7037781476974487, + "learning_rate": 4.629503952098011e-06, + "loss": 0.6613, + "step": 3884 + }, + { + "epoch": 1.0761772853185596, + "grad_norm": 0.7319658994674683, + "learning_rate": 4.6293130935657086e-06, + "loss": 0.6283, + "step": 3885 + }, + { + "epoch": 1.0764542936288088, + "grad_norm": 0.7035661935806274, + "learning_rate": 4.6291221898226505e-06, + "loss": 0.6061, + "step": 3886 + }, + { + "epoch": 1.076731301939058, + "grad_norm": 0.6649536490440369, + "learning_rate": 4.628931240872892e-06, + "loss": 0.6054, + "step": 3887 + }, + { + "epoch": 1.0770083102493075, + "grad_norm": 0.6910920143127441, + "learning_rate": 4.628740246720486e-06, + "loss": 0.638, + "step": 3888 + }, + { + "epoch": 1.0772853185595568, + "grad_norm": 0.6866145133972168, + "learning_rate": 4.628549207369489e-06, + "loss": 0.6503, + "step": 3889 + }, + { + "epoch": 1.077562326869806, + "grad_norm": 0.6962095499038696, + "learning_rate": 4.628358122823956e-06, + "loss": 0.6534, + "step": 3890 + }, + { + "epoch": 1.0778393351800555, + "grad_norm": 0.6696187853813171, + "learning_rate": 4.628166993087945e-06, + "loss": 0.664, + "step": 3891 + }, + { + "epoch": 1.0781163434903047, + "grad_norm": 0.6706064343452454, + "learning_rate": 4.627975818165514e-06, + "loss": 0.6615, + "step": 3892 + }, + { + "epoch": 1.078393351800554, + "grad_norm": 0.7948892712593079, + "learning_rate": 4.627784598060723e-06, + "loss": 0.6638, + "step": 3893 + }, + { + "epoch": 1.0786703601108034, + "grad_norm": 0.7281169891357422, + "learning_rate": 4.62759333277763e-06, + "loss": 0.6554, + "step": 3894 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.673255980014801, + "learning_rate": 4.627402022320298e-06, + "loss": 0.6496, + "step": 3895 + }, + { + "epoch": 1.079224376731302, + "grad_norm": 0.7022432088851929, + "learning_rate": 4.627210666692787e-06, + "loss": 0.6301, + "step": 3896 + }, + { + "epoch": 1.0795013850415511, + "grad_norm": 0.6839655637741089, + "learning_rate": 4.627019265899163e-06, + "loss": 0.6362, + "step": 3897 + }, + { + "epoch": 1.0797783933518006, + "grad_norm": 0.7075784802436829, + "learning_rate": 4.626827819943488e-06, + "loss": 0.6091, + "step": 3898 + }, + { + "epoch": 1.0800554016620498, + "grad_norm": 0.7186004519462585, + "learning_rate": 4.626636328829826e-06, + "loss": 0.6602, + "step": 3899 + }, + { + "epoch": 1.080332409972299, + "grad_norm": 0.7156257033348083, + "learning_rate": 4.6264447925622445e-06, + "loss": 0.6283, + "step": 3900 + }, + { + "epoch": 1.0806094182825485, + "grad_norm": 0.6951586008071899, + "learning_rate": 4.626253211144809e-06, + "loss": 0.6733, + "step": 3901 + }, + { + "epoch": 1.0808864265927978, + "grad_norm": 0.7055180072784424, + "learning_rate": 4.626061584581589e-06, + "loss": 0.6538, + "step": 3902 + }, + { + "epoch": 1.081163434903047, + "grad_norm": 0.7053461074829102, + "learning_rate": 4.625869912876651e-06, + "loss": 0.6503, + "step": 3903 + }, + { + "epoch": 1.0814404432132965, + "grad_norm": 0.6794017553329468, + "learning_rate": 4.6256781960340665e-06, + "loss": 0.6354, + "step": 3904 + }, + { + "epoch": 1.0817174515235457, + "grad_norm": 0.7033610939979553, + "learning_rate": 4.625486434057906e-06, + "loss": 0.7112, + "step": 3905 + }, + { + "epoch": 1.081994459833795, + "grad_norm": 0.7239475846290588, + "learning_rate": 4.6252946269522406e-06, + "loss": 0.6583, + "step": 3906 + }, + { + "epoch": 1.0822714681440444, + "grad_norm": 0.6499510407447815, + "learning_rate": 4.625102774721142e-06, + "loss": 0.6023, + "step": 3907 + }, + { + "epoch": 1.0825484764542936, + "grad_norm": 0.6923518776893616, + "learning_rate": 4.6249108773686846e-06, + "loss": 0.6733, + "step": 3908 + }, + { + "epoch": 1.0828254847645429, + "grad_norm": 0.6752975583076477, + "learning_rate": 4.624718934898944e-06, + "loss": 0.6746, + "step": 3909 + }, + { + "epoch": 1.0831024930747923, + "grad_norm": 0.6957419514656067, + "learning_rate": 4.624526947315994e-06, + "loss": 0.6517, + "step": 3910 + }, + { + "epoch": 1.0833795013850416, + "grad_norm": 0.7053093910217285, + "learning_rate": 4.62433491462391e-06, + "loss": 0.5989, + "step": 3911 + }, + { + "epoch": 1.0836565096952908, + "grad_norm": 0.7093052268028259, + "learning_rate": 4.624142836826773e-06, + "loss": 0.6169, + "step": 3912 + }, + { + "epoch": 1.0839335180055403, + "grad_norm": 0.6362714767456055, + "learning_rate": 4.623950713928658e-06, + "loss": 0.5909, + "step": 3913 + }, + { + "epoch": 1.0842105263157895, + "grad_norm": 0.6835925579071045, + "learning_rate": 4.623758545933645e-06, + "loss": 0.6006, + "step": 3914 + }, + { + "epoch": 1.0844875346260388, + "grad_norm": 0.7387256622314453, + "learning_rate": 4.623566332845815e-06, + "loss": 0.6884, + "step": 3915 + }, + { + "epoch": 1.084764542936288, + "grad_norm": 0.6731743216514587, + "learning_rate": 4.6233740746692495e-06, + "loss": 0.6236, + "step": 3916 + }, + { + "epoch": 1.0850415512465375, + "grad_norm": 0.667972981929779, + "learning_rate": 4.623181771408029e-06, + "loss": 0.6199, + "step": 3917 + }, + { + "epoch": 1.0853185595567867, + "grad_norm": 0.6656007766723633, + "learning_rate": 4.622989423066238e-06, + "loss": 0.6551, + "step": 3918 + }, + { + "epoch": 1.085595567867036, + "grad_norm": 0.6904618740081787, + "learning_rate": 4.622797029647958e-06, + "loss": 0.6831, + "step": 3919 + }, + { + "epoch": 1.0858725761772854, + "grad_norm": 0.6999261975288391, + "learning_rate": 4.622604591157279e-06, + "loss": 0.6216, + "step": 3920 + }, + { + "epoch": 1.0861495844875346, + "grad_norm": 0.7652238011360168, + "learning_rate": 4.622412107598282e-06, + "loss": 0.6708, + "step": 3921 + }, + { + "epoch": 1.0864265927977839, + "grad_norm": 0.6936379671096802, + "learning_rate": 4.622219578975057e-06, + "loss": 0.6148, + "step": 3922 + }, + { + "epoch": 1.0867036011080333, + "grad_norm": 0.7080146074295044, + "learning_rate": 4.622027005291691e-06, + "loss": 0.6505, + "step": 3923 + }, + { + "epoch": 1.0869806094182826, + "grad_norm": 0.7341291308403015, + "learning_rate": 4.621834386552271e-06, + "loss": 0.6059, + "step": 3924 + }, + { + "epoch": 1.0872576177285318, + "grad_norm": 0.7292314767837524, + "learning_rate": 4.62164172276089e-06, + "loss": 0.6471, + "step": 3925 + }, + { + "epoch": 1.0875346260387813, + "grad_norm": 0.6754866242408752, + "learning_rate": 4.621449013921636e-06, + "loss": 0.6214, + "step": 3926 + }, + { + "epoch": 1.0878116343490305, + "grad_norm": 0.6615110039710999, + "learning_rate": 4.621256260038602e-06, + "loss": 0.6625, + "step": 3927 + }, + { + "epoch": 1.0880886426592797, + "grad_norm": 0.6877681016921997, + "learning_rate": 4.621063461115882e-06, + "loss": 0.6483, + "step": 3928 + }, + { + "epoch": 1.088365650969529, + "grad_norm": 0.687233030796051, + "learning_rate": 4.6208706171575665e-06, + "loss": 0.6702, + "step": 3929 + }, + { + "epoch": 1.0886426592797784, + "grad_norm": 0.6554688215255737, + "learning_rate": 4.6206777281677525e-06, + "loss": 0.6056, + "step": 3930 + }, + { + "epoch": 1.0889196675900277, + "grad_norm": 0.6812751889228821, + "learning_rate": 4.620484794150534e-06, + "loss": 0.6578, + "step": 3931 + }, + { + "epoch": 1.089196675900277, + "grad_norm": 0.7219202518463135, + "learning_rate": 4.6202918151100085e-06, + "loss": 0.6641, + "step": 3932 + }, + { + "epoch": 1.0894736842105264, + "grad_norm": 0.68025141954422, + "learning_rate": 4.6200987910502735e-06, + "loss": 0.6567, + "step": 3933 + }, + { + "epoch": 1.0897506925207756, + "grad_norm": 0.7100472450256348, + "learning_rate": 4.6199057219754265e-06, + "loss": 0.6972, + "step": 3934 + }, + { + "epoch": 1.0900277008310248, + "grad_norm": 0.7465969920158386, + "learning_rate": 4.619712607889568e-06, + "loss": 0.623, + "step": 3935 + }, + { + "epoch": 1.0903047091412743, + "grad_norm": 0.7210541367530823, + "learning_rate": 4.619519448796797e-06, + "loss": 0.6612, + "step": 3936 + }, + { + "epoch": 1.0905817174515235, + "grad_norm": 0.6653692722320557, + "learning_rate": 4.619326244701216e-06, + "loss": 0.5711, + "step": 3937 + }, + { + "epoch": 1.0908587257617728, + "grad_norm": 0.7188334465026855, + "learning_rate": 4.619132995606926e-06, + "loss": 0.6296, + "step": 3938 + }, + { + "epoch": 1.0911357340720222, + "grad_norm": 0.7091854214668274, + "learning_rate": 4.618939701518032e-06, + "loss": 0.6647, + "step": 3939 + }, + { + "epoch": 1.0914127423822715, + "grad_norm": 0.7106387615203857, + "learning_rate": 4.618746362438636e-06, + "loss": 0.6711, + "step": 3940 + }, + { + "epoch": 1.0916897506925207, + "grad_norm": 0.6800446510314941, + "learning_rate": 4.618552978372844e-06, + "loss": 0.6541, + "step": 3941 + }, + { + "epoch": 1.0919667590027702, + "grad_norm": 0.6591231822967529, + "learning_rate": 4.618359549324762e-06, + "loss": 0.6561, + "step": 3942 + }, + { + "epoch": 1.0922437673130194, + "grad_norm": 0.6881027221679688, + "learning_rate": 4.618166075298498e-06, + "loss": 0.6558, + "step": 3943 + }, + { + "epoch": 1.0925207756232687, + "grad_norm": 0.7059905529022217, + "learning_rate": 4.617972556298158e-06, + "loss": 0.6297, + "step": 3944 + }, + { + "epoch": 1.0927977839335181, + "grad_norm": 0.695412278175354, + "learning_rate": 4.617778992327852e-06, + "loss": 0.6747, + "step": 3945 + }, + { + "epoch": 1.0930747922437674, + "grad_norm": 0.6626036763191223, + "learning_rate": 4.61758538339169e-06, + "loss": 0.6412, + "step": 3946 + }, + { + "epoch": 1.0933518005540166, + "grad_norm": 0.7440068125724792, + "learning_rate": 4.617391729493783e-06, + "loss": 0.6632, + "step": 3947 + }, + { + "epoch": 1.0936288088642658, + "grad_norm": 0.6759104132652283, + "learning_rate": 4.617198030638242e-06, + "loss": 0.5918, + "step": 3948 + }, + { + "epoch": 1.0939058171745153, + "grad_norm": 0.6452819108963013, + "learning_rate": 4.61700428682918e-06, + "loss": 0.615, + "step": 3949 + }, + { + "epoch": 1.0941828254847645, + "grad_norm": 0.6776657700538635, + "learning_rate": 4.6168104980707105e-06, + "loss": 0.6719, + "step": 3950 + }, + { + "epoch": 1.0944598337950138, + "grad_norm": 0.7503976821899414, + "learning_rate": 4.616616664366949e-06, + "loss": 0.6442, + "step": 3951 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 0.7288950085639954, + "learning_rate": 4.61642278572201e-06, + "loss": 0.6398, + "step": 3952 + }, + { + "epoch": 1.0950138504155125, + "grad_norm": 0.6706618070602417, + "learning_rate": 4.616228862140011e-06, + "loss": 0.6616, + "step": 3953 + }, + { + "epoch": 1.0952908587257617, + "grad_norm": 0.6500645875930786, + "learning_rate": 4.616034893625069e-06, + "loss": 0.6003, + "step": 3954 + }, + { + "epoch": 1.0955678670360112, + "grad_norm": 0.7065473794937134, + "learning_rate": 4.6158408801813015e-06, + "loss": 0.663, + "step": 3955 + }, + { + "epoch": 1.0958448753462604, + "grad_norm": 0.6544113755226135, + "learning_rate": 4.6156468218128296e-06, + "loss": 0.6369, + "step": 3956 + }, + { + "epoch": 1.0961218836565096, + "grad_norm": 0.6566190719604492, + "learning_rate": 4.615452718523773e-06, + "loss": 0.6221, + "step": 3957 + }, + { + "epoch": 1.096398891966759, + "grad_norm": 0.6906711459159851, + "learning_rate": 4.615258570318253e-06, + "loss": 0.6615, + "step": 3958 + }, + { + "epoch": 1.0966759002770083, + "grad_norm": 0.7138991951942444, + "learning_rate": 4.615064377200391e-06, + "loss": 0.6421, + "step": 3959 + }, + { + "epoch": 1.0969529085872576, + "grad_norm": 0.6587837934494019, + "learning_rate": 4.614870139174311e-06, + "loss": 0.6443, + "step": 3960 + }, + { + "epoch": 1.0972299168975068, + "grad_norm": 0.6848393082618713, + "learning_rate": 4.614675856244138e-06, + "loss": 0.5864, + "step": 3961 + }, + { + "epoch": 1.0975069252077563, + "grad_norm": 0.752826988697052, + "learning_rate": 4.614481528413995e-06, + "loss": 0.6359, + "step": 3962 + }, + { + "epoch": 1.0977839335180055, + "grad_norm": 0.7032961249351501, + "learning_rate": 4.614287155688011e-06, + "loss": 0.6627, + "step": 3963 + }, + { + "epoch": 1.0980609418282548, + "grad_norm": 0.6835402846336365, + "learning_rate": 4.614092738070311e-06, + "loss": 0.6927, + "step": 3964 + }, + { + "epoch": 1.0983379501385042, + "grad_norm": 0.6948668956756592, + "learning_rate": 4.613898275565022e-06, + "loss": 0.6491, + "step": 3965 + }, + { + "epoch": 1.0986149584487535, + "grad_norm": 0.6744138598442078, + "learning_rate": 4.613703768176276e-06, + "loss": 0.6413, + "step": 3966 + }, + { + "epoch": 1.0988919667590027, + "grad_norm": 0.7132924199104309, + "learning_rate": 4.613509215908201e-06, + "loss": 0.6307, + "step": 3967 + }, + { + "epoch": 1.0991689750692522, + "grad_norm": 0.6486527919769287, + "learning_rate": 4.613314618764927e-06, + "loss": 0.6537, + "step": 3968 + }, + { + "epoch": 1.0994459833795014, + "grad_norm": 0.693931519985199, + "learning_rate": 4.613119976750588e-06, + "loss": 0.6721, + "step": 3969 + }, + { + "epoch": 1.0997229916897506, + "grad_norm": 0.6713244318962097, + "learning_rate": 4.612925289869314e-06, + "loss": 0.6658, + "step": 3970 + }, + { + "epoch": 1.1, + "grad_norm": 0.6835699081420898, + "learning_rate": 4.6127305581252414e-06, + "loss": 0.6678, + "step": 3971 + }, + { + "epoch": 1.1002770083102493, + "grad_norm": 0.7053037285804749, + "learning_rate": 4.612535781522504e-06, + "loss": 0.6403, + "step": 3972 + }, + { + "epoch": 1.1005540166204986, + "grad_norm": 0.6976191997528076, + "learning_rate": 4.612340960065236e-06, + "loss": 0.6336, + "step": 3973 + }, + { + "epoch": 1.100831024930748, + "grad_norm": 0.716823935508728, + "learning_rate": 4.612146093757576e-06, + "loss": 0.6577, + "step": 3974 + }, + { + "epoch": 1.1011080332409973, + "grad_norm": 0.6851913928985596, + "learning_rate": 4.61195118260366e-06, + "loss": 0.6297, + "step": 3975 + }, + { + "epoch": 1.1013850415512465, + "grad_norm": 0.7345995306968689, + "learning_rate": 4.611756226607627e-06, + "loss": 0.6526, + "step": 3976 + }, + { + "epoch": 1.101662049861496, + "grad_norm": 0.696278989315033, + "learning_rate": 4.611561225773617e-06, + "loss": 0.6526, + "step": 3977 + }, + { + "epoch": 1.1019390581717452, + "grad_norm": 0.6616877317428589, + "learning_rate": 4.61136618010577e-06, + "loss": 0.6268, + "step": 3978 + }, + { + "epoch": 1.1022160664819944, + "grad_norm": 0.688875138759613, + "learning_rate": 4.611171089608226e-06, + "loss": 0.6434, + "step": 3979 + }, + { + "epoch": 1.1024930747922437, + "grad_norm": 0.7036260366439819, + "learning_rate": 4.610975954285129e-06, + "loss": 0.6641, + "step": 3980 + }, + { + "epoch": 1.1027700831024931, + "grad_norm": 0.6837557554244995, + "learning_rate": 4.610780774140623e-06, + "loss": 0.6485, + "step": 3981 + }, + { + "epoch": 1.1030470914127424, + "grad_norm": 0.7002822756767273, + "learning_rate": 4.61058554917885e-06, + "loss": 0.6616, + "step": 3982 + }, + { + "epoch": 1.1033240997229916, + "grad_norm": 0.6870222687721252, + "learning_rate": 4.610390279403955e-06, + "loss": 0.6418, + "step": 3983 + }, + { + "epoch": 1.103601108033241, + "grad_norm": 0.727690577507019, + "learning_rate": 4.610194964820086e-06, + "loss": 0.6673, + "step": 3984 + }, + { + "epoch": 1.1038781163434903, + "grad_norm": 0.7704971432685852, + "learning_rate": 4.609999605431389e-06, + "loss": 0.6677, + "step": 3985 + }, + { + "epoch": 1.1041551246537396, + "grad_norm": 0.7142701745033264, + "learning_rate": 4.6098042012420115e-06, + "loss": 0.6743, + "step": 3986 + }, + { + "epoch": 1.104432132963989, + "grad_norm": 0.6958649754524231, + "learning_rate": 4.609608752256103e-06, + "loss": 0.6094, + "step": 3987 + }, + { + "epoch": 1.1047091412742382, + "grad_norm": 0.6734898686408997, + "learning_rate": 4.609413258477814e-06, + "loss": 0.667, + "step": 3988 + }, + { + "epoch": 1.1049861495844875, + "grad_norm": 0.6905788779258728, + "learning_rate": 4.609217719911293e-06, + "loss": 0.6301, + "step": 3989 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.7093279957771301, + "learning_rate": 4.609022136560695e-06, + "loss": 0.6646, + "step": 3990 + }, + { + "epoch": 1.1055401662049862, + "grad_norm": 0.6776278018951416, + "learning_rate": 4.608826508430171e-06, + "loss": 0.5878, + "step": 3991 + }, + { + "epoch": 1.1058171745152354, + "grad_norm": 0.6801807284355164, + "learning_rate": 4.608630835523875e-06, + "loss": 0.6163, + "step": 3992 + }, + { + "epoch": 1.1060941828254847, + "grad_norm": 0.7167036533355713, + "learning_rate": 4.608435117845961e-06, + "loss": 0.6394, + "step": 3993 + }, + { + "epoch": 1.1063711911357341, + "grad_norm": 0.6812177896499634, + "learning_rate": 4.6082393554005855e-06, + "loss": 0.6594, + "step": 3994 + }, + { + "epoch": 1.1066481994459834, + "grad_norm": 0.7752406001091003, + "learning_rate": 4.608043548191904e-06, + "loss": 0.6783, + "step": 3995 + }, + { + "epoch": 1.1069252077562326, + "grad_norm": 0.6911202669143677, + "learning_rate": 4.607847696224076e-06, + "loss": 0.6464, + "step": 3996 + }, + { + "epoch": 1.107202216066482, + "grad_norm": 0.6800339818000793, + "learning_rate": 4.607651799501257e-06, + "loss": 0.6368, + "step": 3997 + }, + { + "epoch": 1.1074792243767313, + "grad_norm": 0.7221301794052124, + "learning_rate": 4.607455858027608e-06, + "loss": 0.6962, + "step": 3998 + }, + { + "epoch": 1.1077562326869805, + "grad_norm": 0.7182201743125916, + "learning_rate": 4.60725987180729e-06, + "loss": 0.6438, + "step": 3999 + }, + { + "epoch": 1.10803324099723, + "grad_norm": 0.6805998086929321, + "learning_rate": 4.6070638408444635e-06, + "loss": 0.5945, + "step": 4000 + }, + { + "epoch": 1.1083102493074792, + "grad_norm": 0.7249844074249268, + "learning_rate": 4.6068677651432915e-06, + "loss": 0.6334, + "step": 4001 + }, + { + "epoch": 1.1085872576177285, + "grad_norm": 0.6792013049125671, + "learning_rate": 4.606671644707935e-06, + "loss": 0.6646, + "step": 4002 + }, + { + "epoch": 1.108864265927978, + "grad_norm": 0.7077414989471436, + "learning_rate": 4.6064754795425606e-06, + "loss": 0.6588, + "step": 4003 + }, + { + "epoch": 1.1091412742382272, + "grad_norm": 0.6874167919158936, + "learning_rate": 4.606279269651332e-06, + "loss": 0.649, + "step": 4004 + }, + { + "epoch": 1.1094182825484764, + "grad_norm": 0.6752064228057861, + "learning_rate": 4.606083015038416e-06, + "loss": 0.6216, + "step": 4005 + }, + { + "epoch": 1.1096952908587259, + "grad_norm": 0.7003196477890015, + "learning_rate": 4.605886715707979e-06, + "loss": 0.6617, + "step": 4006 + }, + { + "epoch": 1.109972299168975, + "grad_norm": 0.727846622467041, + "learning_rate": 4.605690371664189e-06, + "loss": 0.6484, + "step": 4007 + }, + { + "epoch": 1.1102493074792243, + "grad_norm": 0.7528225779533386, + "learning_rate": 4.605493982911216e-06, + "loss": 0.7293, + "step": 4008 + }, + { + "epoch": 1.1105263157894736, + "grad_norm": 0.6964515447616577, + "learning_rate": 4.605297549453228e-06, + "loss": 0.6379, + "step": 4009 + }, + { + "epoch": 1.110803324099723, + "grad_norm": 0.7198163866996765, + "learning_rate": 4.605101071294397e-06, + "loss": 0.6376, + "step": 4010 + }, + { + "epoch": 1.1110803324099723, + "grad_norm": 0.6878595352172852, + "learning_rate": 4.604904548438894e-06, + "loss": 0.6473, + "step": 4011 + }, + { + "epoch": 1.1113573407202215, + "grad_norm": 0.6337695717811584, + "learning_rate": 4.604707980890892e-06, + "loss": 0.5738, + "step": 4012 + }, + { + "epoch": 1.111634349030471, + "grad_norm": 0.6804854869842529, + "learning_rate": 4.604511368654565e-06, + "loss": 0.605, + "step": 4013 + }, + { + "epoch": 1.1119113573407202, + "grad_norm": 0.6958677768707275, + "learning_rate": 4.604314711734088e-06, + "loss": 0.6795, + "step": 4014 + }, + { + "epoch": 1.1121883656509695, + "grad_norm": 0.7208279371261597, + "learning_rate": 4.604118010133634e-06, + "loss": 0.6608, + "step": 4015 + }, + { + "epoch": 1.112465373961219, + "grad_norm": 0.752122163772583, + "learning_rate": 4.6039212638573835e-06, + "loss": 0.6512, + "step": 4016 + }, + { + "epoch": 1.1127423822714682, + "grad_norm": 0.7201291918754578, + "learning_rate": 4.6037244729095104e-06, + "loss": 0.6476, + "step": 4017 + }, + { + "epoch": 1.1130193905817174, + "grad_norm": 0.7278525829315186, + "learning_rate": 4.603527637294195e-06, + "loss": 0.5995, + "step": 4018 + }, + { + "epoch": 1.1132963988919669, + "grad_norm": 0.7007999420166016, + "learning_rate": 4.603330757015616e-06, + "loss": 0.6716, + "step": 4019 + }, + { + "epoch": 1.113573407202216, + "grad_norm": 0.6690526008605957, + "learning_rate": 4.603133832077953e-06, + "loss": 0.6476, + "step": 4020 + }, + { + "epoch": 1.1138504155124653, + "grad_norm": 0.7289823889732361, + "learning_rate": 4.6029368624853885e-06, + "loss": 0.6441, + "step": 4021 + }, + { + "epoch": 1.1141274238227146, + "grad_norm": 0.674523651599884, + "learning_rate": 4.602739848242105e-06, + "loss": 0.626, + "step": 4022 + }, + { + "epoch": 1.114404432132964, + "grad_norm": 0.7001068592071533, + "learning_rate": 4.602542789352283e-06, + "loss": 0.6816, + "step": 4023 + }, + { + "epoch": 1.1146814404432133, + "grad_norm": 0.7051699161529541, + "learning_rate": 4.602345685820109e-06, + "loss": 0.6761, + "step": 4024 + }, + { + "epoch": 1.1149584487534625, + "grad_norm": 0.699550449848175, + "learning_rate": 4.602148537649768e-06, + "loss": 0.6442, + "step": 4025 + }, + { + "epoch": 1.115235457063712, + "grad_norm": 0.7217336893081665, + "learning_rate": 4.601951344845445e-06, + "loss": 0.6612, + "step": 4026 + }, + { + "epoch": 1.1155124653739612, + "grad_norm": 0.7049715518951416, + "learning_rate": 4.601754107411326e-06, + "loss": 0.5907, + "step": 4027 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 0.7184882164001465, + "learning_rate": 4.6015568253516005e-06, + "loss": 0.6234, + "step": 4028 + }, + { + "epoch": 1.11606648199446, + "grad_norm": 0.6838163733482361, + "learning_rate": 4.601359498670457e-06, + "loss": 0.6032, + "step": 4029 + }, + { + "epoch": 1.1163434903047091, + "grad_norm": 0.6776624321937561, + "learning_rate": 4.601162127372085e-06, + "loss": 0.664, + "step": 4030 + }, + { + "epoch": 1.1166204986149584, + "grad_norm": 0.6719553470611572, + "learning_rate": 4.600964711460675e-06, + "loss": 0.6842, + "step": 4031 + }, + { + "epoch": 1.1168975069252078, + "grad_norm": 0.7092738747596741, + "learning_rate": 4.60076725094042e-06, + "loss": 0.6525, + "step": 4032 + }, + { + "epoch": 1.117174515235457, + "grad_norm": 0.6958156824111938, + "learning_rate": 4.600569745815511e-06, + "loss": 0.6388, + "step": 4033 + }, + { + "epoch": 1.1174515235457063, + "grad_norm": 0.7308716773986816, + "learning_rate": 4.600372196090142e-06, + "loss": 0.659, + "step": 4034 + }, + { + "epoch": 1.1177285318559558, + "grad_norm": 0.6616610884666443, + "learning_rate": 4.600174601768506e-06, + "loss": 0.6024, + "step": 4035 + }, + { + "epoch": 1.118005540166205, + "grad_norm": 0.6373092532157898, + "learning_rate": 4.599976962854802e-06, + "loss": 0.6173, + "step": 4036 + }, + { + "epoch": 1.1182825484764543, + "grad_norm": 0.6987490653991699, + "learning_rate": 4.599779279353224e-06, + "loss": 0.6275, + "step": 4037 + }, + { + "epoch": 1.1185595567867037, + "grad_norm": 0.6965712904930115, + "learning_rate": 4.599581551267969e-06, + "loss": 0.6338, + "step": 4038 + }, + { + "epoch": 1.118836565096953, + "grad_norm": 0.6864861249923706, + "learning_rate": 4.5993837786032355e-06, + "loss": 0.6241, + "step": 4039 + }, + { + "epoch": 1.1191135734072022, + "grad_norm": 0.717839241027832, + "learning_rate": 4.599185961363224e-06, + "loss": 0.6982, + "step": 4040 + }, + { + "epoch": 1.1193905817174514, + "grad_norm": 0.7153016924858093, + "learning_rate": 4.598988099552133e-06, + "loss": 0.687, + "step": 4041 + }, + { + "epoch": 1.1196675900277009, + "grad_norm": 0.7333785891532898, + "learning_rate": 4.5987901931741655e-06, + "loss": 0.6703, + "step": 4042 + }, + { + "epoch": 1.1199445983379501, + "grad_norm": 0.6576941609382629, + "learning_rate": 4.598592242233522e-06, + "loss": 0.6394, + "step": 4043 + }, + { + "epoch": 1.1202216066481994, + "grad_norm": 0.6860183477401733, + "learning_rate": 4.598394246734406e-06, + "loss": 0.6788, + "step": 4044 + }, + { + "epoch": 1.1204986149584488, + "grad_norm": 0.6848655343055725, + "learning_rate": 4.598196206681021e-06, + "loss": 0.6213, + "step": 4045 + }, + { + "epoch": 1.120775623268698, + "grad_norm": 0.6786518692970276, + "learning_rate": 4.597998122077573e-06, + "loss": 0.6325, + "step": 4046 + }, + { + "epoch": 1.1210526315789473, + "grad_norm": 0.7070546746253967, + "learning_rate": 4.597799992928266e-06, + "loss": 0.67, + "step": 4047 + }, + { + "epoch": 1.1213296398891968, + "grad_norm": 0.6792629361152649, + "learning_rate": 4.597601819237309e-06, + "loss": 0.6638, + "step": 4048 + }, + { + "epoch": 1.121606648199446, + "grad_norm": 0.6677889823913574, + "learning_rate": 4.597403601008908e-06, + "loss": 0.6391, + "step": 4049 + }, + { + "epoch": 1.1218836565096952, + "grad_norm": 0.7167168259620667, + "learning_rate": 4.597205338247274e-06, + "loss": 0.6547, + "step": 4050 + }, + { + "epoch": 1.1221606648199447, + "grad_norm": 0.6899300217628479, + "learning_rate": 4.597007030956612e-06, + "loss": 0.6093, + "step": 4051 + }, + { + "epoch": 1.122437673130194, + "grad_norm": 0.6876311898231506, + "learning_rate": 4.596808679141138e-06, + "loss": 0.6596, + "step": 4052 + }, + { + "epoch": 1.1227146814404432, + "grad_norm": 0.6713271737098694, + "learning_rate": 4.596610282805061e-06, + "loss": 0.6324, + "step": 4053 + }, + { + "epoch": 1.1229916897506924, + "grad_norm": 0.7074865698814392, + "learning_rate": 4.5964118419525925e-06, + "loss": 0.605, + "step": 4054 + }, + { + "epoch": 1.1232686980609419, + "grad_norm": 0.708007276058197, + "learning_rate": 4.596213356587948e-06, + "loss": 0.7101, + "step": 4055 + }, + { + "epoch": 1.123545706371191, + "grad_norm": 0.6596668362617493, + "learning_rate": 4.59601482671534e-06, + "loss": 0.6427, + "step": 4056 + }, + { + "epoch": 1.1238227146814403, + "grad_norm": 0.6609944701194763, + "learning_rate": 4.5958162523389845e-06, + "loss": 0.6166, + "step": 4057 + }, + { + "epoch": 1.1240997229916898, + "grad_norm": 0.6856138110160828, + "learning_rate": 4.595617633463099e-06, + "loss": 0.6993, + "step": 4058 + }, + { + "epoch": 1.124376731301939, + "grad_norm": 0.6793034672737122, + "learning_rate": 4.595418970091898e-06, + "loss": 0.6075, + "step": 4059 + }, + { + "epoch": 1.1246537396121883, + "grad_norm": 0.6801674962043762, + "learning_rate": 4.5952202622296015e-06, + "loss": 0.6518, + "step": 4060 + }, + { + "epoch": 1.1249307479224377, + "grad_norm": 0.7081454992294312, + "learning_rate": 4.595021509880429e-06, + "loss": 0.6733, + "step": 4061 + }, + { + "epoch": 1.125207756232687, + "grad_norm": 0.7073140740394592, + "learning_rate": 4.594822713048599e-06, + "loss": 0.6269, + "step": 4062 + }, + { + "epoch": 1.1254847645429362, + "grad_norm": 0.7026985287666321, + "learning_rate": 4.594623871738334e-06, + "loss": 0.6626, + "step": 4063 + }, + { + "epoch": 1.1257617728531857, + "grad_norm": 0.6450572609901428, + "learning_rate": 4.594424985953855e-06, + "loss": 0.6018, + "step": 4064 + }, + { + "epoch": 1.126038781163435, + "grad_norm": 0.6867948770523071, + "learning_rate": 4.594226055699384e-06, + "loss": 0.6411, + "step": 4065 + }, + { + "epoch": 1.1263157894736842, + "grad_norm": 0.6572390198707581, + "learning_rate": 4.594027080979147e-06, + "loss": 0.6537, + "step": 4066 + }, + { + "epoch": 1.1265927977839336, + "grad_norm": 0.6882357597351074, + "learning_rate": 4.593828061797366e-06, + "loss": 0.6781, + "step": 4067 + }, + { + "epoch": 1.1268698060941829, + "grad_norm": 0.7123607397079468, + "learning_rate": 4.59362899815827e-06, + "loss": 0.6529, + "step": 4068 + }, + { + "epoch": 1.127146814404432, + "grad_norm": 0.7132445573806763, + "learning_rate": 4.593429890066083e-06, + "loss": 0.6516, + "step": 4069 + }, + { + "epoch": 1.1274238227146816, + "grad_norm": 0.6925243735313416, + "learning_rate": 4.593230737525032e-06, + "loss": 0.6489, + "step": 4070 + }, + { + "epoch": 1.1277008310249308, + "grad_norm": 0.7030454874038696, + "learning_rate": 4.593031540539349e-06, + "loss": 0.6918, + "step": 4071 + }, + { + "epoch": 1.12797783933518, + "grad_norm": 0.7076012492179871, + "learning_rate": 4.59283229911326e-06, + "loss": 0.6412, + "step": 4072 + }, + { + "epoch": 1.1282548476454293, + "grad_norm": 0.6813455820083618, + "learning_rate": 4.592633013250997e-06, + "loss": 0.5779, + "step": 4073 + }, + { + "epoch": 1.1285318559556787, + "grad_norm": 0.7080217003822327, + "learning_rate": 4.592433682956792e-06, + "loss": 0.6446, + "step": 4074 + }, + { + "epoch": 1.128808864265928, + "grad_norm": 0.699512243270874, + "learning_rate": 4.592234308234876e-06, + "loss": 0.6654, + "step": 4075 + }, + { + "epoch": 1.1290858725761772, + "grad_norm": 0.7339920997619629, + "learning_rate": 4.592034889089482e-06, + "loss": 0.6161, + "step": 4076 + }, + { + "epoch": 1.1293628808864267, + "grad_norm": 0.6406992077827454, + "learning_rate": 4.591835425524845e-06, + "loss": 0.5995, + "step": 4077 + }, + { + "epoch": 1.129639889196676, + "grad_norm": 0.6801450252532959, + "learning_rate": 4.591635917545201e-06, + "loss": 0.6729, + "step": 4078 + }, + { + "epoch": 1.1299168975069251, + "grad_norm": 0.7015729546546936, + "learning_rate": 4.5914363651547846e-06, + "loss": 0.6838, + "step": 4079 + }, + { + "epoch": 1.1301939058171746, + "grad_norm": 0.6717569828033447, + "learning_rate": 4.591236768357833e-06, + "loss": 0.5861, + "step": 4080 + }, + { + "epoch": 1.1304709141274238, + "grad_norm": 0.7090352177619934, + "learning_rate": 4.591037127158584e-06, + "loss": 0.644, + "step": 4081 + }, + { + "epoch": 1.130747922437673, + "grad_norm": 0.662742555141449, + "learning_rate": 4.590837441561277e-06, + "loss": 0.6224, + "step": 4082 + }, + { + "epoch": 1.1310249307479223, + "grad_norm": 0.7132098078727722, + "learning_rate": 4.590637711570152e-06, + "loss": 0.6407, + "step": 4083 + }, + { + "epoch": 1.1313019390581718, + "grad_norm": 0.7207354307174683, + "learning_rate": 4.590437937189449e-06, + "loss": 0.6605, + "step": 4084 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.6709021329879761, + "learning_rate": 4.5902381184234116e-06, + "loss": 0.6759, + "step": 4085 + }, + { + "epoch": 1.1318559556786703, + "grad_norm": 0.6779177188873291, + "learning_rate": 4.590038255276279e-06, + "loss": 0.6342, + "step": 4086 + }, + { + "epoch": 1.1321329639889197, + "grad_norm": 0.6814398169517517, + "learning_rate": 4.589838347752298e-06, + "loss": 0.6431, + "step": 4087 + }, + { + "epoch": 1.132409972299169, + "grad_norm": 0.6873770952224731, + "learning_rate": 4.589638395855712e-06, + "loss": 0.6253, + "step": 4088 + }, + { + "epoch": 1.1326869806094182, + "grad_norm": 0.6760203242301941, + "learning_rate": 4.589438399590766e-06, + "loss": 0.6744, + "step": 4089 + }, + { + "epoch": 1.1329639889196677, + "grad_norm": 0.7229450345039368, + "learning_rate": 4.589238358961707e-06, + "loss": 0.6845, + "step": 4090 + }, + { + "epoch": 1.133240997229917, + "grad_norm": 0.7091704607009888, + "learning_rate": 4.589038273972783e-06, + "loss": 0.5978, + "step": 4091 + }, + { + "epoch": 1.1335180055401661, + "grad_norm": 0.6956828236579895, + "learning_rate": 4.588838144628241e-06, + "loss": 0.646, + "step": 4092 + }, + { + "epoch": 1.1337950138504156, + "grad_norm": 0.6587649583816528, + "learning_rate": 4.588637970932332e-06, + "loss": 0.6249, + "step": 4093 + }, + { + "epoch": 1.1340720221606648, + "grad_norm": 0.690669596195221, + "learning_rate": 4.588437752889305e-06, + "loss": 0.6517, + "step": 4094 + }, + { + "epoch": 1.134349030470914, + "grad_norm": 0.6745144724845886, + "learning_rate": 4.588237490503409e-06, + "loss": 0.6255, + "step": 4095 + }, + { + "epoch": 1.1346260387811635, + "grad_norm": 0.6996001601219177, + "learning_rate": 4.5880371837789e-06, + "loss": 0.6747, + "step": 4096 + }, + { + "epoch": 1.1349030470914128, + "grad_norm": 0.7193899154663086, + "learning_rate": 4.587836832720029e-06, + "loss": 0.6316, + "step": 4097 + }, + { + "epoch": 1.135180055401662, + "grad_norm": 0.716244637966156, + "learning_rate": 4.587636437331051e-06, + "loss": 0.6515, + "step": 4098 + }, + { + "epoch": 1.1354570637119115, + "grad_norm": 0.6850689053535461, + "learning_rate": 4.587435997616219e-06, + "loss": 0.6456, + "step": 4099 + }, + { + "epoch": 1.1357340720221607, + "grad_norm": 0.6978815197944641, + "learning_rate": 4.587235513579791e-06, + "loss": 0.6909, + "step": 4100 + }, + { + "epoch": 1.13601108033241, + "grad_norm": 0.692414402961731, + "learning_rate": 4.587034985226022e-06, + "loss": 0.6706, + "step": 4101 + }, + { + "epoch": 1.1362880886426594, + "grad_norm": 0.7044286131858826, + "learning_rate": 4.586834412559171e-06, + "loss": 0.651, + "step": 4102 + }, + { + "epoch": 1.1365650969529086, + "grad_norm": 0.7050232887268066, + "learning_rate": 4.586633795583496e-06, + "loss": 0.6158, + "step": 4103 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 0.7236217856407166, + "learning_rate": 4.586433134303257e-06, + "loss": 0.6257, + "step": 4104 + }, + { + "epoch": 1.1371191135734071, + "grad_norm": 0.7170664668083191, + "learning_rate": 4.5862324287227145e-06, + "loss": 0.6654, + "step": 4105 + }, + { + "epoch": 1.1373961218836566, + "grad_norm": 0.6874409317970276, + "learning_rate": 4.58603167884613e-06, + "loss": 0.5662, + "step": 4106 + }, + { + "epoch": 1.1376731301939058, + "grad_norm": 0.6953471302986145, + "learning_rate": 4.585830884677765e-06, + "loss": 0.6646, + "step": 4107 + }, + { + "epoch": 1.137950138504155, + "grad_norm": 0.7238764762878418, + "learning_rate": 4.585630046221884e-06, + "loss": 0.6542, + "step": 4108 + }, + { + "epoch": 1.1382271468144045, + "grad_norm": 0.6525726914405823, + "learning_rate": 4.585429163482752e-06, + "loss": 0.6734, + "step": 4109 + }, + { + "epoch": 1.1385041551246537, + "grad_norm": 0.682162880897522, + "learning_rate": 4.585228236464632e-06, + "loss": 0.614, + "step": 4110 + }, + { + "epoch": 1.138781163434903, + "grad_norm": 0.6903233528137207, + "learning_rate": 4.585027265171793e-06, + "loss": 0.6457, + "step": 4111 + }, + { + "epoch": 1.1390581717451524, + "grad_norm": 0.713100016117096, + "learning_rate": 4.5848262496084995e-06, + "loss": 0.6154, + "step": 4112 + }, + { + "epoch": 1.1393351800554017, + "grad_norm": 0.6778297424316406, + "learning_rate": 4.58462518977902e-06, + "loss": 0.5965, + "step": 4113 + }, + { + "epoch": 1.139612188365651, + "grad_norm": 0.6337829232215881, + "learning_rate": 4.584424085687625e-06, + "loss": 0.5248, + "step": 4114 + }, + { + "epoch": 1.1398891966759002, + "grad_norm": 0.693355143070221, + "learning_rate": 4.584222937338584e-06, + "loss": 0.6749, + "step": 4115 + }, + { + "epoch": 1.1401662049861496, + "grad_norm": 0.7208781838417053, + "learning_rate": 4.584021744736167e-06, + "loss": 0.6706, + "step": 4116 + }, + { + "epoch": 1.1404432132963989, + "grad_norm": 0.7035034894943237, + "learning_rate": 4.583820507884646e-06, + "loss": 0.5913, + "step": 4117 + }, + { + "epoch": 1.140720221606648, + "grad_norm": 0.7289274334907532, + "learning_rate": 4.583619226788294e-06, + "loss": 0.6392, + "step": 4118 + }, + { + "epoch": 1.1409972299168976, + "grad_norm": 0.7026898264884949, + "learning_rate": 4.583417901451386e-06, + "loss": 0.6338, + "step": 4119 + }, + { + "epoch": 1.1412742382271468, + "grad_norm": 0.6910617351531982, + "learning_rate": 4.583216531878195e-06, + "loss": 0.6512, + "step": 4120 + }, + { + "epoch": 1.141551246537396, + "grad_norm": 0.6606723666191101, + "learning_rate": 4.5830151180729964e-06, + "loss": 0.6401, + "step": 4121 + }, + { + "epoch": 1.1418282548476455, + "grad_norm": 0.7039617896080017, + "learning_rate": 4.582813660040068e-06, + "loss": 0.6573, + "step": 4122 + }, + { + "epoch": 1.1421052631578947, + "grad_norm": 0.7446121573448181, + "learning_rate": 4.582612157783687e-06, + "loss": 0.6593, + "step": 4123 + }, + { + "epoch": 1.142382271468144, + "grad_norm": 0.6769001483917236, + "learning_rate": 4.58241061130813e-06, + "loss": 0.6238, + "step": 4124 + }, + { + "epoch": 1.1426592797783934, + "grad_norm": 0.7253504395484924, + "learning_rate": 4.582209020617679e-06, + "loss": 0.6597, + "step": 4125 + }, + { + "epoch": 1.1429362880886427, + "grad_norm": 0.7634405493736267, + "learning_rate": 4.582007385716614e-06, + "loss": 0.6808, + "step": 4126 + }, + { + "epoch": 1.143213296398892, + "grad_norm": 0.7294143438339233, + "learning_rate": 4.581805706609213e-06, + "loss": 0.614, + "step": 4127 + }, + { + "epoch": 1.1434903047091414, + "grad_norm": 0.700397253036499, + "learning_rate": 4.5816039832997625e-06, + "loss": 0.662, + "step": 4128 + }, + { + "epoch": 1.1437673130193906, + "grad_norm": 0.747981071472168, + "learning_rate": 4.581402215792543e-06, + "loss": 0.6671, + "step": 4129 + }, + { + "epoch": 1.1440443213296398, + "grad_norm": 0.6845051646232605, + "learning_rate": 4.581200404091839e-06, + "loss": 0.6468, + "step": 4130 + }, + { + "epoch": 1.1443213296398893, + "grad_norm": 0.6928357481956482, + "learning_rate": 4.580998548201936e-06, + "loss": 0.674, + "step": 4131 + }, + { + "epoch": 1.1445983379501385, + "grad_norm": 0.6433817744255066, + "learning_rate": 4.5807966481271184e-06, + "loss": 0.6253, + "step": 4132 + }, + { + "epoch": 1.1448753462603878, + "grad_norm": 0.6889892816543579, + "learning_rate": 4.5805947038716754e-06, + "loss": 0.6596, + "step": 4133 + }, + { + "epoch": 1.1451523545706372, + "grad_norm": 0.6575981974601746, + "learning_rate": 4.580392715439893e-06, + "loss": 0.633, + "step": 4134 + }, + { + "epoch": 1.1454293628808865, + "grad_norm": 0.6686013340950012, + "learning_rate": 4.580190682836061e-06, + "loss": 0.6355, + "step": 4135 + }, + { + "epoch": 1.1457063711911357, + "grad_norm": 0.6737796068191528, + "learning_rate": 4.579988606064468e-06, + "loss": 0.6032, + "step": 4136 + }, + { + "epoch": 1.145983379501385, + "grad_norm": 0.7231971621513367, + "learning_rate": 4.579786485129405e-06, + "loss": 0.6805, + "step": 4137 + }, + { + "epoch": 1.1462603878116344, + "grad_norm": 0.6811356544494629, + "learning_rate": 4.579584320035164e-06, + "loss": 0.6046, + "step": 4138 + }, + { + "epoch": 1.1465373961218837, + "grad_norm": 0.7172678112983704, + "learning_rate": 4.579382110786038e-06, + "loss": 0.6553, + "step": 4139 + }, + { + "epoch": 1.146814404432133, + "grad_norm": 0.6764629483222961, + "learning_rate": 4.579179857386319e-06, + "loss": 0.6216, + "step": 4140 + }, + { + "epoch": 1.1470914127423824, + "grad_norm": 0.6475736498832703, + "learning_rate": 4.578977559840302e-06, + "loss": 0.6237, + "step": 4141 + }, + { + "epoch": 1.1473684210526316, + "grad_norm": 0.7342414855957031, + "learning_rate": 4.578775218152282e-06, + "loss": 0.6655, + "step": 4142 + }, + { + "epoch": 1.1476454293628808, + "grad_norm": 0.7185686826705933, + "learning_rate": 4.578572832326555e-06, + "loss": 0.6126, + "step": 4143 + }, + { + "epoch": 1.14792243767313, + "grad_norm": 0.7499321699142456, + "learning_rate": 4.57837040236742e-06, + "loss": 0.6875, + "step": 4144 + }, + { + "epoch": 1.1481994459833795, + "grad_norm": 0.683867335319519, + "learning_rate": 4.578167928279174e-06, + "loss": 0.6397, + "step": 4145 + }, + { + "epoch": 1.1484764542936288, + "grad_norm": 0.6831228137016296, + "learning_rate": 4.577965410066114e-06, + "loss": 0.662, + "step": 4146 + }, + { + "epoch": 1.148753462603878, + "grad_norm": 0.686525821685791, + "learning_rate": 4.577762847732543e-06, + "loss": 0.6128, + "step": 4147 + }, + { + "epoch": 1.1490304709141275, + "grad_norm": 0.6958313584327698, + "learning_rate": 4.57756024128276e-06, + "loss": 0.6187, + "step": 4148 + }, + { + "epoch": 1.1493074792243767, + "grad_norm": 0.6785929799079895, + "learning_rate": 4.577357590721069e-06, + "loss": 0.6062, + "step": 4149 + }, + { + "epoch": 1.149584487534626, + "grad_norm": 0.7083978056907654, + "learning_rate": 4.57715489605177e-06, + "loss": 0.6558, + "step": 4150 + }, + { + "epoch": 1.1498614958448754, + "grad_norm": 0.6996467113494873, + "learning_rate": 4.576952157279169e-06, + "loss": 0.6411, + "step": 4151 + }, + { + "epoch": 1.1501385041551246, + "grad_norm": 0.7247840166091919, + "learning_rate": 4.576749374407569e-06, + "loss": 0.6825, + "step": 4152 + }, + { + "epoch": 1.1504155124653739, + "grad_norm": 0.712798535823822, + "learning_rate": 4.576546547441276e-06, + "loss": 0.6215, + "step": 4153 + }, + { + "epoch": 1.1506925207756233, + "grad_norm": 0.7341089248657227, + "learning_rate": 4.576343676384598e-06, + "loss": 0.6737, + "step": 4154 + }, + { + "epoch": 1.1509695290858726, + "grad_norm": 0.6770998239517212, + "learning_rate": 4.576140761241841e-06, + "loss": 0.5728, + "step": 4155 + }, + { + "epoch": 1.1512465373961218, + "grad_norm": 0.7066571712493896, + "learning_rate": 4.575937802017313e-06, + "loss": 0.6122, + "step": 4156 + }, + { + "epoch": 1.1515235457063713, + "grad_norm": 0.7351990938186646, + "learning_rate": 4.575734798715325e-06, + "loss": 0.6717, + "step": 4157 + }, + { + "epoch": 1.1518005540166205, + "grad_norm": 0.6787825226783752, + "learning_rate": 4.5755317513401855e-06, + "loss": 0.6366, + "step": 4158 + }, + { + "epoch": 1.1520775623268698, + "grad_norm": 0.6576148271560669, + "learning_rate": 4.575328659896207e-06, + "loss": 0.6444, + "step": 4159 + }, + { + "epoch": 1.1523545706371192, + "grad_norm": 0.6681926846504211, + "learning_rate": 4.575125524387702e-06, + "loss": 0.6042, + "step": 4160 + }, + { + "epoch": 1.1526315789473685, + "grad_norm": 0.67631596326828, + "learning_rate": 4.574922344818982e-06, + "loss": 0.6265, + "step": 4161 + }, + { + "epoch": 1.1529085872576177, + "grad_norm": 0.6682254076004028, + "learning_rate": 4.574719121194362e-06, + "loss": 0.6007, + "step": 4162 + }, + { + "epoch": 1.1531855955678671, + "grad_norm": 0.6621627807617188, + "learning_rate": 4.574515853518157e-06, + "loss": 0.6509, + "step": 4163 + }, + { + "epoch": 1.1534626038781164, + "grad_norm": 0.7396254539489746, + "learning_rate": 4.574312541794683e-06, + "loss": 0.6482, + "step": 4164 + }, + { + "epoch": 1.1537396121883656, + "grad_norm": 0.7040379643440247, + "learning_rate": 4.574109186028255e-06, + "loss": 0.6774, + "step": 4165 + }, + { + "epoch": 1.1540166204986149, + "grad_norm": 0.7313653230667114, + "learning_rate": 4.573905786223194e-06, + "loss": 0.6583, + "step": 4166 + }, + { + "epoch": 1.1542936288088643, + "grad_norm": 0.6954412460327148, + "learning_rate": 4.573702342383816e-06, + "loss": 0.6682, + "step": 4167 + }, + { + "epoch": 1.1545706371191136, + "grad_norm": 0.7022963166236877, + "learning_rate": 4.573498854514442e-06, + "loss": 0.6149, + "step": 4168 + }, + { + "epoch": 1.1548476454293628, + "grad_norm": 0.6929730176925659, + "learning_rate": 4.5732953226193925e-06, + "loss": 0.5669, + "step": 4169 + }, + { + "epoch": 1.1551246537396123, + "grad_norm": 0.688026487827301, + "learning_rate": 4.573091746702988e-06, + "loss": 0.6446, + "step": 4170 + }, + { + "epoch": 1.1554016620498615, + "grad_norm": 0.7175707221031189, + "learning_rate": 4.572888126769552e-06, + "loss": 0.6663, + "step": 4171 + }, + { + "epoch": 1.1556786703601107, + "grad_norm": 0.6707695126533508, + "learning_rate": 4.5726844628234075e-06, + "loss": 0.6774, + "step": 4172 + }, + { + "epoch": 1.1559556786703602, + "grad_norm": 0.6873745322227478, + "learning_rate": 4.572480754868879e-06, + "loss": 0.6272, + "step": 4173 + }, + { + "epoch": 1.1562326869806094, + "grad_norm": 0.6898880004882812, + "learning_rate": 4.572277002910291e-06, + "loss": 0.5811, + "step": 4174 + }, + { + "epoch": 1.1565096952908587, + "grad_norm": 0.7220594882965088, + "learning_rate": 4.57207320695197e-06, + "loss": 0.6973, + "step": 4175 + }, + { + "epoch": 1.156786703601108, + "grad_norm": 0.6856008172035217, + "learning_rate": 4.571869366998244e-06, + "loss": 0.6165, + "step": 4176 + }, + { + "epoch": 1.1570637119113574, + "grad_norm": 0.6618354916572571, + "learning_rate": 4.57166548305344e-06, + "loss": 0.5833, + "step": 4177 + }, + { + "epoch": 1.1573407202216066, + "grad_norm": 0.7045774459838867, + "learning_rate": 4.571461555121888e-06, + "loss": 0.6329, + "step": 4178 + }, + { + "epoch": 1.1576177285318558, + "grad_norm": 0.7029802799224854, + "learning_rate": 4.571257583207917e-06, + "loss": 0.637, + "step": 4179 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.6747250556945801, + "learning_rate": 4.571053567315858e-06, + "loss": 0.6293, + "step": 4180 + }, + { + "epoch": 1.1581717451523545, + "grad_norm": 0.6726357340812683, + "learning_rate": 4.570849507450042e-06, + "loss": 0.6595, + "step": 4181 + }, + { + "epoch": 1.1584487534626038, + "grad_norm": 0.7162138223648071, + "learning_rate": 4.570645403614803e-06, + "loss": 0.677, + "step": 4182 + }, + { + "epoch": 1.1587257617728532, + "grad_norm": 0.6681135892868042, + "learning_rate": 4.5704412558144744e-06, + "loss": 0.6358, + "step": 4183 + }, + { + "epoch": 1.1590027700831025, + "grad_norm": 0.7122220993041992, + "learning_rate": 4.570237064053391e-06, + "loss": 0.6678, + "step": 4184 + }, + { + "epoch": 1.1592797783933517, + "grad_norm": 0.6930856704711914, + "learning_rate": 4.570032828335887e-06, + "loss": 0.6539, + "step": 4185 + }, + { + "epoch": 1.1595567867036012, + "grad_norm": 0.7084271311759949, + "learning_rate": 4.569828548666301e-06, + "loss": 0.6265, + "step": 4186 + }, + { + "epoch": 1.1598337950138504, + "grad_norm": 0.6743333339691162, + "learning_rate": 4.569624225048968e-06, + "loss": 0.5993, + "step": 4187 + }, + { + "epoch": 1.1601108033240997, + "grad_norm": 0.7010241746902466, + "learning_rate": 4.569419857488228e-06, + "loss": 0.6733, + "step": 4188 + }, + { + "epoch": 1.1603878116343491, + "grad_norm": 0.656864583492279, + "learning_rate": 4.56921544598842e-06, + "loss": 0.5831, + "step": 4189 + }, + { + "epoch": 1.1606648199445984, + "grad_norm": 0.6931899189949036, + "learning_rate": 4.569010990553883e-06, + "loss": 0.6362, + "step": 4190 + }, + { + "epoch": 1.1609418282548476, + "grad_norm": 0.7144986987113953, + "learning_rate": 4.5688064911889605e-06, + "loss": 0.635, + "step": 4191 + }, + { + "epoch": 1.161218836565097, + "grad_norm": 0.6765926480293274, + "learning_rate": 4.5686019478979915e-06, + "loss": 0.6082, + "step": 4192 + }, + { + "epoch": 1.1614958448753463, + "grad_norm": 0.7012190222740173, + "learning_rate": 4.5683973606853216e-06, + "loss": 0.6636, + "step": 4193 + }, + { + "epoch": 1.1617728531855955, + "grad_norm": 0.6823146939277649, + "learning_rate": 4.568192729555293e-06, + "loss": 0.6071, + "step": 4194 + }, + { + "epoch": 1.162049861495845, + "grad_norm": 0.6781933903694153, + "learning_rate": 4.567988054512251e-06, + "loss": 0.6544, + "step": 4195 + }, + { + "epoch": 1.1623268698060942, + "grad_norm": 0.6953796744346619, + "learning_rate": 4.567783335560542e-06, + "loss": 0.6036, + "step": 4196 + }, + { + "epoch": 1.1626038781163435, + "grad_norm": 0.7140665650367737, + "learning_rate": 4.567578572704512e-06, + "loss": 0.6562, + "step": 4197 + }, + { + "epoch": 1.1628808864265927, + "grad_norm": 0.7328235507011414, + "learning_rate": 4.567373765948509e-06, + "loss": 0.6365, + "step": 4198 + }, + { + "epoch": 1.1631578947368422, + "grad_norm": 0.7307348847389221, + "learning_rate": 4.567168915296881e-06, + "loss": 0.6155, + "step": 4199 + }, + { + "epoch": 1.1634349030470914, + "grad_norm": 0.6891008019447327, + "learning_rate": 4.566964020753979e-06, + "loss": 0.6396, + "step": 4200 + }, + { + "epoch": 1.1637119113573406, + "grad_norm": 0.7145599126815796, + "learning_rate": 4.5667590823241515e-06, + "loss": 0.6082, + "step": 4201 + }, + { + "epoch": 1.16398891966759, + "grad_norm": 0.6793602705001831, + "learning_rate": 4.5665541000117504e-06, + "loss": 0.6393, + "step": 4202 + }, + { + "epoch": 1.1642659279778393, + "grad_norm": 0.6986454129219055, + "learning_rate": 4.566349073821129e-06, + "loss": 0.6861, + "step": 4203 + }, + { + "epoch": 1.1645429362880886, + "grad_norm": 0.6601034998893738, + "learning_rate": 4.566144003756641e-06, + "loss": 0.6556, + "step": 4204 + }, + { + "epoch": 1.164819944598338, + "grad_norm": 0.7221328020095825, + "learning_rate": 4.565938889822637e-06, + "loss": 0.6427, + "step": 4205 + }, + { + "epoch": 1.1650969529085873, + "grad_norm": 0.6929381489753723, + "learning_rate": 4.5657337320234765e-06, + "loss": 0.6507, + "step": 4206 + }, + { + "epoch": 1.1653739612188365, + "grad_norm": 0.716958224773407, + "learning_rate": 4.565528530363513e-06, + "loss": 0.6626, + "step": 4207 + }, + { + "epoch": 1.1656509695290858, + "grad_norm": 0.6676911115646362, + "learning_rate": 4.5653232848471044e-06, + "loss": 0.6325, + "step": 4208 + }, + { + "epoch": 1.1659279778393352, + "grad_norm": 0.6989991068840027, + "learning_rate": 4.565117995478607e-06, + "loss": 0.6409, + "step": 4209 + }, + { + "epoch": 1.1662049861495845, + "grad_norm": 0.6747390627861023, + "learning_rate": 4.564912662262382e-06, + "loss": 0.6289, + "step": 4210 + }, + { + "epoch": 1.1664819944598337, + "grad_norm": 0.7510681748390198, + "learning_rate": 4.564707285202787e-06, + "loss": 0.6718, + "step": 4211 + }, + { + "epoch": 1.1667590027700832, + "grad_norm": 0.6956333518028259, + "learning_rate": 4.564501864304184e-06, + "loss": 0.6466, + "step": 4212 + }, + { + "epoch": 1.1670360110803324, + "grad_norm": 0.6801543831825256, + "learning_rate": 4.564296399570935e-06, + "loss": 0.6505, + "step": 4213 + }, + { + "epoch": 1.1673130193905816, + "grad_norm": 0.6749382019042969, + "learning_rate": 4.564090891007401e-06, + "loss": 0.6512, + "step": 4214 + }, + { + "epoch": 1.167590027700831, + "grad_norm": 0.6772417426109314, + "learning_rate": 4.563885338617946e-06, + "loss": 0.6516, + "step": 4215 + }, + { + "epoch": 1.1678670360110803, + "grad_norm": 0.709536612033844, + "learning_rate": 4.563679742406935e-06, + "loss": 0.6672, + "step": 4216 + }, + { + "epoch": 1.1681440443213296, + "grad_norm": 0.6753586530685425, + "learning_rate": 4.563474102378733e-06, + "loss": 0.5978, + "step": 4217 + }, + { + "epoch": 1.168421052631579, + "grad_norm": 0.6711388230323792, + "learning_rate": 4.563268418537707e-06, + "loss": 0.6517, + "step": 4218 + }, + { + "epoch": 1.1686980609418283, + "grad_norm": 0.710477352142334, + "learning_rate": 4.563062690888223e-06, + "loss": 0.5786, + "step": 4219 + }, + { + "epoch": 1.1689750692520775, + "grad_norm": 0.7528419494628906, + "learning_rate": 4.562856919434649e-06, + "loss": 0.6847, + "step": 4220 + }, + { + "epoch": 1.169252077562327, + "grad_norm": 0.6778903603553772, + "learning_rate": 4.562651104181355e-06, + "loss": 0.6534, + "step": 4221 + }, + { + "epoch": 1.1695290858725762, + "grad_norm": 0.7068110704421997, + "learning_rate": 4.562445245132711e-06, + "loss": 0.6424, + "step": 4222 + }, + { + "epoch": 1.1698060941828254, + "grad_norm": 0.7029364705085754, + "learning_rate": 4.562239342293087e-06, + "loss": 0.604, + "step": 4223 + }, + { + "epoch": 1.170083102493075, + "grad_norm": 0.6927694082260132, + "learning_rate": 4.562033395666856e-06, + "loss": 0.631, + "step": 4224 + }, + { + "epoch": 1.1703601108033241, + "grad_norm": 0.6557900905609131, + "learning_rate": 4.56182740525839e-06, + "loss": 0.5642, + "step": 4225 + }, + { + "epoch": 1.1706371191135734, + "grad_norm": 0.7124279141426086, + "learning_rate": 4.561621371072062e-06, + "loss": 0.616, + "step": 4226 + }, + { + "epoch": 1.1709141274238228, + "grad_norm": 0.687511146068573, + "learning_rate": 4.561415293112248e-06, + "loss": 0.6791, + "step": 4227 + }, + { + "epoch": 1.171191135734072, + "grad_norm": 0.735288679599762, + "learning_rate": 4.561209171383323e-06, + "loss": 0.6629, + "step": 4228 + }, + { + "epoch": 1.1714681440443213, + "grad_norm": 0.7048251032829285, + "learning_rate": 4.561003005889664e-06, + "loss": 0.6281, + "step": 4229 + }, + { + "epoch": 1.1717451523545706, + "grad_norm": 0.7248666882514954, + "learning_rate": 4.5607967966356485e-06, + "loss": 0.6081, + "step": 4230 + }, + { + "epoch": 1.17202216066482, + "grad_norm": 0.7453956604003906, + "learning_rate": 4.5605905436256525e-06, + "loss": 0.6674, + "step": 4231 + }, + { + "epoch": 1.1722991689750693, + "grad_norm": 0.7330853343009949, + "learning_rate": 4.560384246864058e-06, + "loss": 0.6476, + "step": 4232 + }, + { + "epoch": 1.1725761772853185, + "grad_norm": 0.6860072016716003, + "learning_rate": 4.560177906355244e-06, + "loss": 0.6248, + "step": 4233 + }, + { + "epoch": 1.172853185595568, + "grad_norm": 0.7117403149604797, + "learning_rate": 4.559971522103593e-06, + "loss": 0.6479, + "step": 4234 + }, + { + "epoch": 1.1731301939058172, + "grad_norm": 0.7106106877326965, + "learning_rate": 4.559765094113485e-06, + "loss": 0.6356, + "step": 4235 + }, + { + "epoch": 1.1734072022160664, + "grad_norm": 0.7224343419075012, + "learning_rate": 4.559558622389304e-06, + "loss": 0.6826, + "step": 4236 + }, + { + "epoch": 1.1736842105263159, + "grad_norm": 0.6818200945854187, + "learning_rate": 4.559352106935435e-06, + "loss": 0.665, + "step": 4237 + }, + { + "epoch": 1.1739612188365651, + "grad_norm": 0.6833382844924927, + "learning_rate": 4.559145547756261e-06, + "loss": 0.626, + "step": 4238 + }, + { + "epoch": 1.1742382271468144, + "grad_norm": 0.6887573599815369, + "learning_rate": 4.5589389448561685e-06, + "loss": 0.5914, + "step": 4239 + }, + { + "epoch": 1.1745152354570636, + "grad_norm": 0.6644566059112549, + "learning_rate": 4.558732298239544e-06, + "loss": 0.6298, + "step": 4240 + }, + { + "epoch": 1.174792243767313, + "grad_norm": 0.6625338792800903, + "learning_rate": 4.5585256079107754e-06, + "loss": 0.6454, + "step": 4241 + }, + { + "epoch": 1.1750692520775623, + "grad_norm": 0.693747878074646, + "learning_rate": 4.558318873874251e-06, + "loss": 0.636, + "step": 4242 + }, + { + "epoch": 1.1753462603878115, + "grad_norm": 0.664527952671051, + "learning_rate": 4.558112096134361e-06, + "loss": 0.6218, + "step": 4243 + }, + { + "epoch": 1.175623268698061, + "grad_norm": 0.7000016570091248, + "learning_rate": 4.557905274695496e-06, + "loss": 0.5644, + "step": 4244 + }, + { + "epoch": 1.1759002770083102, + "grad_norm": 0.679958701133728, + "learning_rate": 4.557698409562045e-06, + "loss": 0.6628, + "step": 4245 + }, + { + "epoch": 1.1761772853185595, + "grad_norm": 0.7532631158828735, + "learning_rate": 4.557491500738404e-06, + "loss": 0.6723, + "step": 4246 + }, + { + "epoch": 1.176454293628809, + "grad_norm": 0.717473566532135, + "learning_rate": 4.557284548228962e-06, + "loss": 0.6573, + "step": 4247 + }, + { + "epoch": 1.1767313019390582, + "grad_norm": 0.6726430058479309, + "learning_rate": 4.557077552038117e-06, + "loss": 0.641, + "step": 4248 + }, + { + "epoch": 1.1770083102493074, + "grad_norm": 0.6901071667671204, + "learning_rate": 4.556870512170261e-06, + "loss": 0.6325, + "step": 4249 + }, + { + "epoch": 1.1772853185595569, + "grad_norm": 0.743581235408783, + "learning_rate": 4.5566634286297925e-06, + "loss": 0.6499, + "step": 4250 + }, + { + "epoch": 1.177562326869806, + "grad_norm": 0.6675125360488892, + "learning_rate": 4.556456301421107e-06, + "loss": 0.6098, + "step": 4251 + }, + { + "epoch": 1.1778393351800553, + "grad_norm": 0.7380349636077881, + "learning_rate": 4.556249130548603e-06, + "loss": 0.6552, + "step": 4252 + }, + { + "epoch": 1.1781163434903048, + "grad_norm": 0.6815469264984131, + "learning_rate": 4.556041916016678e-06, + "loss": 0.6356, + "step": 4253 + }, + { + "epoch": 1.178393351800554, + "grad_norm": 0.7289326190948486, + "learning_rate": 4.555834657829733e-06, + "loss": 0.6627, + "step": 4254 + }, + { + "epoch": 1.1786703601108033, + "grad_norm": 0.6645580530166626, + "learning_rate": 4.555627355992168e-06, + "loss": 0.6166, + "step": 4255 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 0.69098961353302, + "learning_rate": 4.555420010508386e-06, + "loss": 0.6364, + "step": 4256 + }, + { + "epoch": 1.179224376731302, + "grad_norm": 0.701954185962677, + "learning_rate": 4.555212621382786e-06, + "loss": 0.6556, + "step": 4257 + }, + { + "epoch": 1.1795013850415512, + "grad_norm": 0.7003589868545532, + "learning_rate": 4.555005188619776e-06, + "loss": 0.6613, + "step": 4258 + }, + { + "epoch": 1.1797783933518005, + "grad_norm": 0.6646780967712402, + "learning_rate": 4.554797712223756e-06, + "loss": 0.637, + "step": 4259 + }, + { + "epoch": 1.18005540166205, + "grad_norm": 0.6946302652359009, + "learning_rate": 4.554590192199134e-06, + "loss": 0.6475, + "step": 4260 + }, + { + "epoch": 1.1803324099722992, + "grad_norm": 0.6732128858566284, + "learning_rate": 4.554382628550315e-06, + "loss": 0.6355, + "step": 4261 + }, + { + "epoch": 1.1806094182825484, + "grad_norm": 0.7041682004928589, + "learning_rate": 4.5541750212817065e-06, + "loss": 0.6679, + "step": 4262 + }, + { + "epoch": 1.1808864265927979, + "grad_norm": 0.6972526907920837, + "learning_rate": 4.553967370397717e-06, + "loss": 0.6495, + "step": 4263 + }, + { + "epoch": 1.181163434903047, + "grad_norm": 0.6706414222717285, + "learning_rate": 4.553759675902754e-06, + "loss": 0.648, + "step": 4264 + }, + { + "epoch": 1.1814404432132963, + "grad_norm": 0.6960473656654358, + "learning_rate": 4.553551937801229e-06, + "loss": 0.597, + "step": 4265 + }, + { + "epoch": 1.1817174515235458, + "grad_norm": 0.7602198123931885, + "learning_rate": 4.5533441560975525e-06, + "loss": 0.6275, + "step": 4266 + }, + { + "epoch": 1.181994459833795, + "grad_norm": 0.7271232008934021, + "learning_rate": 4.5531363307961355e-06, + "loss": 0.6614, + "step": 4267 + }, + { + "epoch": 1.1822714681440443, + "grad_norm": 0.6794989109039307, + "learning_rate": 4.552928461901391e-06, + "loss": 0.6522, + "step": 4268 + }, + { + "epoch": 1.1825484764542935, + "grad_norm": 0.6836474537849426, + "learning_rate": 4.552720549417733e-06, + "loss": 0.6167, + "step": 4269 + }, + { + "epoch": 1.182825484764543, + "grad_norm": 0.7154577970504761, + "learning_rate": 4.552512593349575e-06, + "loss": 0.6265, + "step": 4270 + }, + { + "epoch": 1.1831024930747922, + "grad_norm": 0.7174842953681946, + "learning_rate": 4.552304593701334e-06, + "loss": 0.6631, + "step": 4271 + }, + { + "epoch": 1.1833795013850414, + "grad_norm": 0.702748715877533, + "learning_rate": 4.5520965504774254e-06, + "loss": 0.6121, + "step": 4272 + }, + { + "epoch": 1.183656509695291, + "grad_norm": 0.7215221524238586, + "learning_rate": 4.551888463682266e-06, + "loss": 0.6173, + "step": 4273 + }, + { + "epoch": 1.1839335180055401, + "grad_norm": 0.6870599985122681, + "learning_rate": 4.551680333320274e-06, + "loss": 0.632, + "step": 4274 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.7133249640464783, + "learning_rate": 4.55147215939587e-06, + "loss": 0.6538, + "step": 4275 + }, + { + "epoch": 1.1844875346260388, + "grad_norm": 0.7360088229179382, + "learning_rate": 4.551263941913473e-06, + "loss": 0.6628, + "step": 4276 + }, + { + "epoch": 1.184764542936288, + "grad_norm": 0.6721140742301941, + "learning_rate": 4.551055680877504e-06, + "loss": 0.6347, + "step": 4277 + }, + { + "epoch": 1.1850415512465373, + "grad_norm": 0.6926547884941101, + "learning_rate": 4.550847376292385e-06, + "loss": 0.6544, + "step": 4278 + }, + { + "epoch": 1.1853185595567868, + "grad_norm": 0.7329055070877075, + "learning_rate": 4.550639028162539e-06, + "loss": 0.669, + "step": 4279 + }, + { + "epoch": 1.185595567867036, + "grad_norm": 0.7039414644241333, + "learning_rate": 4.55043063649239e-06, + "loss": 0.6161, + "step": 4280 + }, + { + "epoch": 1.1858725761772853, + "grad_norm": 0.7472238540649414, + "learning_rate": 4.550222201286362e-06, + "loss": 0.657, + "step": 4281 + }, + { + "epoch": 1.1861495844875347, + "grad_norm": 0.7102679014205933, + "learning_rate": 4.550013722548882e-06, + "loss": 0.6748, + "step": 4282 + }, + { + "epoch": 1.186426592797784, + "grad_norm": 0.6597498655319214, + "learning_rate": 4.549805200284374e-06, + "loss": 0.6175, + "step": 4283 + }, + { + "epoch": 1.1867036011080332, + "grad_norm": 0.7128549814224243, + "learning_rate": 4.5495966344972675e-06, + "loss": 0.6658, + "step": 4284 + }, + { + "epoch": 1.1869806094182827, + "grad_norm": 0.6920933723449707, + "learning_rate": 4.54938802519199e-06, + "loss": 0.6846, + "step": 4285 + }, + { + "epoch": 1.1872576177285319, + "grad_norm": 0.7122675776481628, + "learning_rate": 4.549179372372971e-06, + "loss": 0.598, + "step": 4286 + }, + { + "epoch": 1.1875346260387811, + "grad_norm": 0.6924179792404175, + "learning_rate": 4.548970676044642e-06, + "loss": 0.6782, + "step": 4287 + }, + { + "epoch": 1.1878116343490306, + "grad_norm": 0.6880548596382141, + "learning_rate": 4.548761936211433e-06, + "loss": 0.6455, + "step": 4288 + }, + { + "epoch": 1.1880886426592798, + "grad_norm": 0.6973972320556641, + "learning_rate": 4.5485531528777754e-06, + "loss": 0.6654, + "step": 4289 + }, + { + "epoch": 1.188365650969529, + "grad_norm": 0.7235435843467712, + "learning_rate": 4.548344326048103e-06, + "loss": 0.6529, + "step": 4290 + }, + { + "epoch": 1.1886426592797783, + "grad_norm": 0.7232111096382141, + "learning_rate": 4.54813545572685e-06, + "loss": 0.6503, + "step": 4291 + }, + { + "epoch": 1.1889196675900278, + "grad_norm": 0.7104105353355408, + "learning_rate": 4.547926541918451e-06, + "loss": 0.617, + "step": 4292 + }, + { + "epoch": 1.189196675900277, + "grad_norm": 0.7474246025085449, + "learning_rate": 4.547717584627341e-06, + "loss": 0.6935, + "step": 4293 + }, + { + "epoch": 1.1894736842105262, + "grad_norm": 0.666694164276123, + "learning_rate": 4.547508583857958e-06, + "loss": 0.6286, + "step": 4294 + }, + { + "epoch": 1.1897506925207757, + "grad_norm": 0.7198405861854553, + "learning_rate": 4.547299539614739e-06, + "loss": 0.6494, + "step": 4295 + }, + { + "epoch": 1.190027700831025, + "grad_norm": 0.6890691518783569, + "learning_rate": 4.547090451902122e-06, + "loss": 0.6632, + "step": 4296 + }, + { + "epoch": 1.1903047091412742, + "grad_norm": 0.7404274940490723, + "learning_rate": 4.546881320724547e-06, + "loss": 0.6546, + "step": 4297 + }, + { + "epoch": 1.1905817174515236, + "grad_norm": 0.7668154835700989, + "learning_rate": 4.5466721460864545e-06, + "loss": 0.6071, + "step": 4298 + }, + { + "epoch": 1.1908587257617729, + "grad_norm": 0.6996212601661682, + "learning_rate": 4.546462927992287e-06, + "loss": 0.5975, + "step": 4299 + }, + { + "epoch": 1.1911357340720221, + "grad_norm": 0.7173477411270142, + "learning_rate": 4.546253666446484e-06, + "loss": 0.6498, + "step": 4300 + }, + { + "epoch": 1.1914127423822714, + "grad_norm": 0.6888937950134277, + "learning_rate": 4.54604436145349e-06, + "loss": 0.6264, + "step": 4301 + }, + { + "epoch": 1.1916897506925208, + "grad_norm": 0.7148381471633911, + "learning_rate": 4.54583501301775e-06, + "loss": 0.6041, + "step": 4302 + }, + { + "epoch": 1.19196675900277, + "grad_norm": 0.714459240436554, + "learning_rate": 4.545625621143707e-06, + "loss": 0.6545, + "step": 4303 + }, + { + "epoch": 1.1922437673130193, + "grad_norm": 0.6980823874473572, + "learning_rate": 4.54541618583581e-06, + "loss": 0.6409, + "step": 4304 + }, + { + "epoch": 1.1925207756232687, + "grad_norm": 0.70297771692276, + "learning_rate": 4.545206707098503e-06, + "loss": 0.696, + "step": 4305 + }, + { + "epoch": 1.192797783933518, + "grad_norm": 0.6873321533203125, + "learning_rate": 4.5449971849362345e-06, + "loss": 0.6614, + "step": 4306 + }, + { + "epoch": 1.1930747922437672, + "grad_norm": 0.6853544116020203, + "learning_rate": 4.5447876193534535e-06, + "loss": 0.6193, + "step": 4307 + }, + { + "epoch": 1.1933518005540167, + "grad_norm": 0.7040695548057556, + "learning_rate": 4.54457801035461e-06, + "loss": 0.65, + "step": 4308 + }, + { + "epoch": 1.193628808864266, + "grad_norm": 0.7038838863372803, + "learning_rate": 4.544368357944153e-06, + "loss": 0.5984, + "step": 4309 + }, + { + "epoch": 1.1939058171745152, + "grad_norm": 0.7128795385360718, + "learning_rate": 4.5441586621265355e-06, + "loss": 0.6457, + "step": 4310 + }, + { + "epoch": 1.1941828254847646, + "grad_norm": 0.6804617643356323, + "learning_rate": 4.543948922906209e-06, + "loss": 0.6331, + "step": 4311 + }, + { + "epoch": 1.1944598337950139, + "grad_norm": 0.7417746186256409, + "learning_rate": 4.543739140287628e-06, + "loss": 0.6492, + "step": 4312 + }, + { + "epoch": 1.194736842105263, + "grad_norm": 0.6482412815093994, + "learning_rate": 4.543529314275246e-06, + "loss": 0.6603, + "step": 4313 + }, + { + "epoch": 1.1950138504155126, + "grad_norm": 0.718251645565033, + "learning_rate": 4.543319444873517e-06, + "loss": 0.6177, + "step": 4314 + }, + { + "epoch": 1.1952908587257618, + "grad_norm": 0.701597273349762, + "learning_rate": 4.5431095320869e-06, + "loss": 0.646, + "step": 4315 + }, + { + "epoch": 1.195567867036011, + "grad_norm": 0.6754698753356934, + "learning_rate": 4.542899575919848e-06, + "loss": 0.6194, + "step": 4316 + }, + { + "epoch": 1.1958448753462605, + "grad_norm": 0.7565904855728149, + "learning_rate": 4.542689576376822e-06, + "loss": 0.6346, + "step": 4317 + }, + { + "epoch": 1.1961218836565097, + "grad_norm": 0.6827279329299927, + "learning_rate": 4.5424795334622796e-06, + "loss": 0.5896, + "step": 4318 + }, + { + "epoch": 1.196398891966759, + "grad_norm": 0.6626576781272888, + "learning_rate": 4.542269447180682e-06, + "loss": 0.5791, + "step": 4319 + }, + { + "epoch": 1.1966759002770084, + "grad_norm": 0.6990970969200134, + "learning_rate": 4.542059317536487e-06, + "loss": 0.6393, + "step": 4320 + }, + { + "epoch": 1.1969529085872577, + "grad_norm": 0.6745477318763733, + "learning_rate": 4.541849144534158e-06, + "loss": 0.63, + "step": 4321 + }, + { + "epoch": 1.197229916897507, + "grad_norm": 0.6842837929725647, + "learning_rate": 4.541638928178158e-06, + "loss": 0.6406, + "step": 4322 + }, + { + "epoch": 1.1975069252077561, + "grad_norm": 0.7515599727630615, + "learning_rate": 4.54142866847295e-06, + "loss": 0.7141, + "step": 4323 + }, + { + "epoch": 1.1977839335180056, + "grad_norm": 0.6852219104766846, + "learning_rate": 4.541218365422997e-06, + "loss": 0.6726, + "step": 4324 + }, + { + "epoch": 1.1980609418282548, + "grad_norm": 0.692903995513916, + "learning_rate": 4.541008019032766e-06, + "loss": 0.6796, + "step": 4325 + }, + { + "epoch": 1.198337950138504, + "grad_norm": 0.6800821423530579, + "learning_rate": 4.540797629306722e-06, + "loss": 0.6205, + "step": 4326 + }, + { + "epoch": 1.1986149584487535, + "grad_norm": 0.6892538070678711, + "learning_rate": 4.540587196249334e-06, + "loss": 0.6659, + "step": 4327 + }, + { + "epoch": 1.1988919667590028, + "grad_norm": 0.6764261722564697, + "learning_rate": 4.5403767198650685e-06, + "loss": 0.6432, + "step": 4328 + }, + { + "epoch": 1.199168975069252, + "grad_norm": 0.6850671768188477, + "learning_rate": 4.540166200158394e-06, + "loss": 0.6389, + "step": 4329 + }, + { + "epoch": 1.1994459833795015, + "grad_norm": 0.7733122706413269, + "learning_rate": 4.539955637133781e-06, + "loss": 0.6897, + "step": 4330 + }, + { + "epoch": 1.1997229916897507, + "grad_norm": 0.7287300825119019, + "learning_rate": 4.539745030795701e-06, + "loss": 0.6121, + "step": 4331 + }, + { + "epoch": 1.2, + "grad_norm": 0.7426024079322815, + "learning_rate": 4.539534381148625e-06, + "loss": 0.6242, + "step": 4332 + }, + { + "epoch": 1.2002770083102492, + "grad_norm": 0.6916001439094543, + "learning_rate": 4.539323688197025e-06, + "loss": 0.6173, + "step": 4333 + }, + { + "epoch": 1.2005540166204987, + "grad_norm": 0.6753976941108704, + "learning_rate": 4.5391129519453755e-06, + "loss": 0.5832, + "step": 4334 + }, + { + "epoch": 1.200831024930748, + "grad_norm": 0.7031921148300171, + "learning_rate": 4.538902172398151e-06, + "loss": 0.7186, + "step": 4335 + }, + { + "epoch": 1.2011080332409971, + "grad_norm": 0.6825315952301025, + "learning_rate": 4.538691349559826e-06, + "loss": 0.6368, + "step": 4336 + }, + { + "epoch": 1.2013850415512466, + "grad_norm": 0.7218365669250488, + "learning_rate": 4.538480483434878e-06, + "loss": 0.6509, + "step": 4337 + }, + { + "epoch": 1.2016620498614958, + "grad_norm": 0.6999350786209106, + "learning_rate": 4.538269574027784e-06, + "loss": 0.6409, + "step": 4338 + }, + { + "epoch": 1.201939058171745, + "grad_norm": 0.6903960108757019, + "learning_rate": 4.538058621343021e-06, + "loss": 0.5946, + "step": 4339 + }, + { + "epoch": 1.2022160664819945, + "grad_norm": 0.7130528688430786, + "learning_rate": 4.537847625385069e-06, + "loss": 0.6628, + "step": 4340 + }, + { + "epoch": 1.2024930747922438, + "grad_norm": 0.6918487548828125, + "learning_rate": 4.537636586158408e-06, + "loss": 0.6841, + "step": 4341 + }, + { + "epoch": 1.202770083102493, + "grad_norm": 0.7330001592636108, + "learning_rate": 4.5374255036675185e-06, + "loss": 0.6573, + "step": 4342 + }, + { + "epoch": 1.2030470914127425, + "grad_norm": 0.6802187561988831, + "learning_rate": 4.537214377916882e-06, + "loss": 0.6342, + "step": 4343 + }, + { + "epoch": 1.2033240997229917, + "grad_norm": 0.6788448691368103, + "learning_rate": 4.537003208910983e-06, + "loss": 0.6528, + "step": 4344 + }, + { + "epoch": 1.203601108033241, + "grad_norm": 0.6833314299583435, + "learning_rate": 4.536791996654303e-06, + "loss": 0.6437, + "step": 4345 + }, + { + "epoch": 1.2038781163434904, + "grad_norm": 0.6793355345726013, + "learning_rate": 4.536580741151328e-06, + "loss": 0.6384, + "step": 4346 + }, + { + "epoch": 1.2041551246537396, + "grad_norm": 0.7549615502357483, + "learning_rate": 4.536369442406543e-06, + "loss": 0.6657, + "step": 4347 + }, + { + "epoch": 1.2044321329639889, + "grad_norm": 0.6814740896224976, + "learning_rate": 4.536158100424434e-06, + "loss": 0.6212, + "step": 4348 + }, + { + "epoch": 1.2047091412742383, + "grad_norm": 0.7178139686584473, + "learning_rate": 4.535946715209488e-06, + "loss": 0.6641, + "step": 4349 + }, + { + "epoch": 1.2049861495844876, + "grad_norm": 0.7358893752098083, + "learning_rate": 4.535735286766196e-06, + "loss": 0.6558, + "step": 4350 + }, + { + "epoch": 1.2052631578947368, + "grad_norm": 0.6513990759849548, + "learning_rate": 4.535523815099044e-06, + "loss": 0.6421, + "step": 4351 + }, + { + "epoch": 1.2055401662049863, + "grad_norm": 0.7175055146217346, + "learning_rate": 4.535312300212523e-06, + "loss": 0.6676, + "step": 4352 + }, + { + "epoch": 1.2058171745152355, + "grad_norm": 0.6629988551139832, + "learning_rate": 4.535100742111125e-06, + "loss": 0.5907, + "step": 4353 + }, + { + "epoch": 1.2060941828254848, + "grad_norm": 0.7017353177070618, + "learning_rate": 4.534889140799341e-06, + "loss": 0.6615, + "step": 4354 + }, + { + "epoch": 1.206371191135734, + "grad_norm": 0.7436637878417969, + "learning_rate": 4.5346774962816635e-06, + "loss": 0.681, + "step": 4355 + }, + { + "epoch": 1.2066481994459834, + "grad_norm": 0.6942512392997742, + "learning_rate": 4.534465808562587e-06, + "loss": 0.6619, + "step": 4356 + }, + { + "epoch": 1.2069252077562327, + "grad_norm": 0.6875748038291931, + "learning_rate": 4.534254077646606e-06, + "loss": 0.664, + "step": 4357 + }, + { + "epoch": 1.207202216066482, + "grad_norm": 0.6879202127456665, + "learning_rate": 4.5340423035382165e-06, + "loss": 0.6087, + "step": 4358 + }, + { + "epoch": 1.2074792243767314, + "grad_norm": 0.6686105132102966, + "learning_rate": 4.533830486241914e-06, + "loss": 0.6218, + "step": 4359 + }, + { + "epoch": 1.2077562326869806, + "grad_norm": 0.6839017868041992, + "learning_rate": 4.533618625762196e-06, + "loss": 0.6579, + "step": 4360 + }, + { + "epoch": 1.2080332409972299, + "grad_norm": 0.6563047170639038, + "learning_rate": 4.533406722103562e-06, + "loss": 0.6073, + "step": 4361 + }, + { + "epoch": 1.208310249307479, + "grad_norm": 0.7116596102714539, + "learning_rate": 4.53319477527051e-06, + "loss": 0.6353, + "step": 4362 + }, + { + "epoch": 1.2085872576177286, + "grad_norm": 0.6606056690216064, + "learning_rate": 4.5329827852675415e-06, + "loss": 0.5481, + "step": 4363 + }, + { + "epoch": 1.2088642659279778, + "grad_norm": 0.7383421659469604, + "learning_rate": 4.532770752099156e-06, + "loss": 0.6742, + "step": 4364 + }, + { + "epoch": 1.209141274238227, + "grad_norm": 0.6783084273338318, + "learning_rate": 4.532558675769857e-06, + "loss": 0.6384, + "step": 4365 + }, + { + "epoch": 1.2094182825484765, + "grad_norm": 0.6640183329582214, + "learning_rate": 4.532346556284146e-06, + "loss": 0.6, + "step": 4366 + }, + { + "epoch": 1.2096952908587257, + "grad_norm": 0.6977359652519226, + "learning_rate": 4.5321343936465275e-06, + "loss": 0.5709, + "step": 4367 + }, + { + "epoch": 1.209972299168975, + "grad_norm": 0.6893478631973267, + "learning_rate": 4.531922187861507e-06, + "loss": 0.6805, + "step": 4368 + }, + { + "epoch": 1.2102493074792244, + "grad_norm": 0.7190584540367126, + "learning_rate": 4.531709938933589e-06, + "loss": 0.6757, + "step": 4369 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.7101868391036987, + "learning_rate": 4.531497646867281e-06, + "loss": 0.647, + "step": 4370 + }, + { + "epoch": 1.210803324099723, + "grad_norm": 0.7230154871940613, + "learning_rate": 4.53128531166709e-06, + "loss": 0.6387, + "step": 4371 + }, + { + "epoch": 1.2110803324099724, + "grad_norm": 0.6733981966972351, + "learning_rate": 4.5310729333375245e-06, + "loss": 0.6259, + "step": 4372 + }, + { + "epoch": 1.2113573407202216, + "grad_norm": 0.7233732342720032, + "learning_rate": 4.530860511883093e-06, + "loss": 0.6457, + "step": 4373 + }, + { + "epoch": 1.2116343490304708, + "grad_norm": 0.6873233318328857, + "learning_rate": 4.530648047308308e-06, + "loss": 0.6845, + "step": 4374 + }, + { + "epoch": 1.2119113573407203, + "grad_norm": 0.6871516108512878, + "learning_rate": 4.530435539617678e-06, + "loss": 0.6357, + "step": 4375 + }, + { + "epoch": 1.2121883656509695, + "grad_norm": 0.6907857656478882, + "learning_rate": 4.530222988815717e-06, + "loss": 0.6181, + "step": 4376 + }, + { + "epoch": 1.2124653739612188, + "grad_norm": 0.680697500705719, + "learning_rate": 4.530010394906938e-06, + "loss": 0.6281, + "step": 4377 + }, + { + "epoch": 1.2127423822714682, + "grad_norm": 0.6937142014503479, + "learning_rate": 4.529797757895852e-06, + "loss": 0.6251, + "step": 4378 + }, + { + "epoch": 1.2130193905817175, + "grad_norm": 0.6891060471534729, + "learning_rate": 4.529585077786977e-06, + "loss": 0.5744, + "step": 4379 + }, + { + "epoch": 1.2132963988919667, + "grad_norm": 0.6695826053619385, + "learning_rate": 4.529372354584829e-06, + "loss": 0.6819, + "step": 4380 + }, + { + "epoch": 1.2135734072022162, + "grad_norm": 0.7129724621772766, + "learning_rate": 4.529159588293922e-06, + "loss": 0.6574, + "step": 4381 + }, + { + "epoch": 1.2138504155124654, + "grad_norm": 0.7002400159835815, + "learning_rate": 4.528946778918777e-06, + "loss": 0.646, + "step": 4382 + }, + { + "epoch": 1.2141274238227147, + "grad_norm": 0.7137943506240845, + "learning_rate": 4.528733926463909e-06, + "loss": 0.6311, + "step": 4383 + }, + { + "epoch": 1.214404432132964, + "grad_norm": 0.6910654306411743, + "learning_rate": 4.528521030933839e-06, + "loss": 0.6364, + "step": 4384 + }, + { + "epoch": 1.2146814404432134, + "grad_norm": 0.7344447374343872, + "learning_rate": 4.528308092333087e-06, + "loss": 0.6668, + "step": 4385 + }, + { + "epoch": 1.2149584487534626, + "grad_norm": 0.7259987592697144, + "learning_rate": 4.5280951106661755e-06, + "loss": 0.6296, + "step": 4386 + }, + { + "epoch": 1.2152354570637118, + "grad_norm": 0.7200382351875305, + "learning_rate": 4.527882085937625e-06, + "loss": 0.6375, + "step": 4387 + }, + { + "epoch": 1.2155124653739613, + "grad_norm": 0.7581033110618591, + "learning_rate": 4.527669018151959e-06, + "loss": 0.6453, + "step": 4388 + }, + { + "epoch": 1.2157894736842105, + "grad_norm": 0.6925063133239746, + "learning_rate": 4.527455907313702e-06, + "loss": 0.6299, + "step": 4389 + }, + { + "epoch": 1.2160664819944598, + "grad_norm": 0.7005012631416321, + "learning_rate": 4.527242753427378e-06, + "loss": 0.6334, + "step": 4390 + }, + { + "epoch": 1.2163434903047092, + "grad_norm": 0.6756923198699951, + "learning_rate": 4.527029556497513e-06, + "loss": 0.6363, + "step": 4391 + }, + { + "epoch": 1.2166204986149585, + "grad_norm": 0.7093691825866699, + "learning_rate": 4.526816316528635e-06, + "loss": 0.6279, + "step": 4392 + }, + { + "epoch": 1.2168975069252077, + "grad_norm": 0.7289251089096069, + "learning_rate": 4.526603033525271e-06, + "loss": 0.6371, + "step": 4393 + }, + { + "epoch": 1.217174515235457, + "grad_norm": 0.6732731461524963, + "learning_rate": 4.526389707491948e-06, + "loss": 0.6124, + "step": 4394 + }, + { + "epoch": 1.2174515235457064, + "grad_norm": 0.6666122674942017, + "learning_rate": 4.5261763384331975e-06, + "loss": 0.6345, + "step": 4395 + }, + { + "epoch": 1.2177285318559556, + "grad_norm": 0.6899173855781555, + "learning_rate": 4.525962926353549e-06, + "loss": 0.6911, + "step": 4396 + }, + { + "epoch": 1.2180055401662049, + "grad_norm": 0.673346996307373, + "learning_rate": 4.525749471257534e-06, + "loss": 0.6024, + "step": 4397 + }, + { + "epoch": 1.2182825484764543, + "grad_norm": 0.695513904094696, + "learning_rate": 4.5255359731496835e-06, + "loss": 0.6529, + "step": 4398 + }, + { + "epoch": 1.2185595567867036, + "grad_norm": 0.6898539066314697, + "learning_rate": 4.5253224320345324e-06, + "loss": 0.673, + "step": 4399 + }, + { + "epoch": 1.2188365650969528, + "grad_norm": 0.6731051206588745, + "learning_rate": 4.525108847916614e-06, + "loss": 0.6152, + "step": 4400 + }, + { + "epoch": 1.2191135734072023, + "grad_norm": 0.6579740047454834, + "learning_rate": 4.524895220800463e-06, + "loss": 0.6188, + "step": 4401 + }, + { + "epoch": 1.2193905817174515, + "grad_norm": 0.6864678263664246, + "learning_rate": 4.524681550690615e-06, + "loss": 0.6384, + "step": 4402 + }, + { + "epoch": 1.2196675900277008, + "grad_norm": 0.7058321237564087, + "learning_rate": 4.524467837591608e-06, + "loss": 0.6311, + "step": 4403 + }, + { + "epoch": 1.2199445983379502, + "grad_norm": 0.7127742171287537, + "learning_rate": 4.524254081507978e-06, + "loss": 0.646, + "step": 4404 + }, + { + "epoch": 1.2202216066481995, + "grad_norm": 0.6770905256271362, + "learning_rate": 4.524040282444265e-06, + "loss": 0.6787, + "step": 4405 + }, + { + "epoch": 1.2204986149584487, + "grad_norm": 0.6978400349617004, + "learning_rate": 4.523826440405009e-06, + "loss": 0.6089, + "step": 4406 + }, + { + "epoch": 1.2207756232686982, + "grad_norm": 0.6935478448867798, + "learning_rate": 4.523612555394748e-06, + "loss": 0.6656, + "step": 4407 + }, + { + "epoch": 1.2210526315789474, + "grad_norm": 0.6988148093223572, + "learning_rate": 4.523398627418025e-06, + "loss": 0.6174, + "step": 4408 + }, + { + "epoch": 1.2213296398891966, + "grad_norm": 0.67249596118927, + "learning_rate": 4.523184656479382e-06, + "loss": 0.5974, + "step": 4409 + }, + { + "epoch": 1.221606648199446, + "grad_norm": 0.6931259632110596, + "learning_rate": 4.522970642583362e-06, + "loss": 0.6077, + "step": 4410 + }, + { + "epoch": 1.2218836565096953, + "grad_norm": 0.6827836632728577, + "learning_rate": 4.522756585734509e-06, + "loss": 0.6427, + "step": 4411 + }, + { + "epoch": 1.2221606648199446, + "grad_norm": 0.7271913886070251, + "learning_rate": 4.522542485937369e-06, + "loss": 0.6013, + "step": 4412 + }, + { + "epoch": 1.222437673130194, + "grad_norm": 0.7043451070785522, + "learning_rate": 4.522328343196486e-06, + "loss": 0.6546, + "step": 4413 + }, + { + "epoch": 1.2227146814404433, + "grad_norm": 0.7179906964302063, + "learning_rate": 4.522114157516409e-06, + "loss": 0.6769, + "step": 4414 + }, + { + "epoch": 1.2229916897506925, + "grad_norm": 0.7093173265457153, + "learning_rate": 4.521899928901683e-06, + "loss": 0.6705, + "step": 4415 + }, + { + "epoch": 1.2232686980609417, + "grad_norm": 0.6971815228462219, + "learning_rate": 4.5216856573568594e-06, + "loss": 0.6267, + "step": 4416 + }, + { + "epoch": 1.2235457063711912, + "grad_norm": 0.7456490993499756, + "learning_rate": 4.521471342886486e-06, + "loss": 0.6268, + "step": 4417 + }, + { + "epoch": 1.2238227146814404, + "grad_norm": 0.7302700877189636, + "learning_rate": 4.521256985495114e-06, + "loss": 0.6061, + "step": 4418 + }, + { + "epoch": 1.2240997229916897, + "grad_norm": 0.6957641243934631, + "learning_rate": 4.5210425851872944e-06, + "loss": 0.6209, + "step": 4419 + }, + { + "epoch": 1.2243767313019391, + "grad_norm": 0.7002754807472229, + "learning_rate": 4.520828141967579e-06, + "loss": 0.6072, + "step": 4420 + }, + { + "epoch": 1.2246537396121884, + "grad_norm": 0.7033765316009521, + "learning_rate": 4.5206136558405214e-06, + "loss": 0.6292, + "step": 4421 + }, + { + "epoch": 1.2249307479224376, + "grad_norm": 0.7024313807487488, + "learning_rate": 4.520399126810676e-06, + "loss": 0.6282, + "step": 4422 + }, + { + "epoch": 1.225207756232687, + "grad_norm": 0.6639774441719055, + "learning_rate": 4.520184554882598e-06, + "loss": 0.6386, + "step": 4423 + }, + { + "epoch": 1.2254847645429363, + "grad_norm": 0.6910221576690674, + "learning_rate": 4.519969940060843e-06, + "loss": 0.6593, + "step": 4424 + }, + { + "epoch": 1.2257617728531855, + "grad_norm": 0.6906338930130005, + "learning_rate": 4.519755282349967e-06, + "loss": 0.6654, + "step": 4425 + }, + { + "epoch": 1.2260387811634348, + "grad_norm": 0.6946108937263489, + "learning_rate": 4.519540581754529e-06, + "loss": 0.6468, + "step": 4426 + }, + { + "epoch": 1.2263157894736842, + "grad_norm": 0.683337926864624, + "learning_rate": 4.519325838279087e-06, + "loss": 0.6455, + "step": 4427 + }, + { + "epoch": 1.2265927977839335, + "grad_norm": 0.7096542119979858, + "learning_rate": 4.519111051928201e-06, + "loss": 0.5897, + "step": 4428 + }, + { + "epoch": 1.2268698060941827, + "grad_norm": 0.7188792824745178, + "learning_rate": 4.518896222706431e-06, + "loss": 0.6503, + "step": 4429 + }, + { + "epoch": 1.2271468144044322, + "grad_norm": 0.6780487895011902, + "learning_rate": 4.518681350618338e-06, + "loss": 0.6146, + "step": 4430 + }, + { + "epoch": 1.2274238227146814, + "grad_norm": 0.7028935551643372, + "learning_rate": 4.518466435668485e-06, + "loss": 0.7035, + "step": 4431 + }, + { + "epoch": 1.2277008310249307, + "grad_norm": 0.673099935054779, + "learning_rate": 4.518251477861435e-06, + "loss": 0.6031, + "step": 4432 + }, + { + "epoch": 1.2279778393351801, + "grad_norm": 0.7044011354446411, + "learning_rate": 4.518036477201752e-06, + "loss": 0.6526, + "step": 4433 + }, + { + "epoch": 1.2282548476454294, + "grad_norm": 0.7253800630569458, + "learning_rate": 4.5178214336940015e-06, + "loss": 0.6041, + "step": 4434 + }, + { + "epoch": 1.2285318559556786, + "grad_norm": 0.7392945885658264, + "learning_rate": 4.5176063473427485e-06, + "loss": 0.607, + "step": 4435 + }, + { + "epoch": 1.228808864265928, + "grad_norm": 0.6777207255363464, + "learning_rate": 4.51739121815256e-06, + "loss": 0.6576, + "step": 4436 + }, + { + "epoch": 1.2290858725761773, + "grad_norm": 0.7337944507598877, + "learning_rate": 4.517176046128005e-06, + "loss": 0.6573, + "step": 4437 + }, + { + "epoch": 1.2293628808864265, + "grad_norm": 0.6642884612083435, + "learning_rate": 4.516960831273651e-06, + "loss": 0.5576, + "step": 4438 + }, + { + "epoch": 1.229639889196676, + "grad_norm": 0.7272876501083374, + "learning_rate": 4.516745573594068e-06, + "loss": 0.6473, + "step": 4439 + }, + { + "epoch": 1.2299168975069252, + "grad_norm": 0.6578417420387268, + "learning_rate": 4.516530273093826e-06, + "loss": 0.648, + "step": 4440 + }, + { + "epoch": 1.2301939058171745, + "grad_norm": 0.6466295123100281, + "learning_rate": 4.516314929777496e-06, + "loss": 0.6004, + "step": 4441 + }, + { + "epoch": 1.230470914127424, + "grad_norm": 0.6575599908828735, + "learning_rate": 4.516099543649651e-06, + "loss": 0.5946, + "step": 4442 + }, + { + "epoch": 1.2307479224376732, + "grad_norm": 0.7217696905136108, + "learning_rate": 4.515884114714865e-06, + "loss": 0.6447, + "step": 4443 + }, + { + "epoch": 1.2310249307479224, + "grad_norm": 0.7224985957145691, + "learning_rate": 4.515668642977711e-06, + "loss": 0.6679, + "step": 4444 + }, + { + "epoch": 1.2313019390581719, + "grad_norm": 0.7255591750144958, + "learning_rate": 4.515453128442764e-06, + "loss": 0.6278, + "step": 4445 + }, + { + "epoch": 1.231578947368421, + "grad_norm": 0.6969859004020691, + "learning_rate": 4.5152375711146e-06, + "loss": 0.6217, + "step": 4446 + }, + { + "epoch": 1.2318559556786703, + "grad_norm": 0.731724739074707, + "learning_rate": 4.515021970997797e-06, + "loss": 0.6845, + "step": 4447 + }, + { + "epoch": 1.2321329639889196, + "grad_norm": 0.7150566577911377, + "learning_rate": 4.5148063280969315e-06, + "loss": 0.6456, + "step": 4448 + }, + { + "epoch": 1.232409972299169, + "grad_norm": 0.6822801232337952, + "learning_rate": 4.5145906424165815e-06, + "loss": 0.6243, + "step": 4449 + }, + { + "epoch": 1.2326869806094183, + "grad_norm": 0.7677393555641174, + "learning_rate": 4.514374913961328e-06, + "loss": 0.6645, + "step": 4450 + }, + { + "epoch": 1.2329639889196675, + "grad_norm": 0.71131432056427, + "learning_rate": 4.514159142735751e-06, + "loss": 0.6702, + "step": 4451 + }, + { + "epoch": 1.233240997229917, + "grad_norm": 0.7009198069572449, + "learning_rate": 4.5139433287444316e-06, + "loss": 0.6511, + "step": 4452 + }, + { + "epoch": 1.2335180055401662, + "grad_norm": 0.6575670838356018, + "learning_rate": 4.5137274719919526e-06, + "loss": 0.6374, + "step": 4453 + }, + { + "epoch": 1.2337950138504155, + "grad_norm": 0.721043586730957, + "learning_rate": 4.513511572482896e-06, + "loss": 0.609, + "step": 4454 + }, + { + "epoch": 1.234072022160665, + "grad_norm": 0.7106706500053406, + "learning_rate": 4.513295630221847e-06, + "loss": 0.6202, + "step": 4455 + }, + { + "epoch": 1.2343490304709142, + "grad_norm": 0.6895164251327515, + "learning_rate": 4.513079645213391e-06, + "loss": 0.6248, + "step": 4456 + }, + { + "epoch": 1.2346260387811634, + "grad_norm": 0.7164800763130188, + "learning_rate": 4.512863617462113e-06, + "loss": 0.6702, + "step": 4457 + }, + { + "epoch": 1.2349030470914126, + "grad_norm": 0.6920846700668335, + "learning_rate": 4.512647546972601e-06, + "loss": 0.6542, + "step": 4458 + }, + { + "epoch": 1.235180055401662, + "grad_norm": 0.6987187266349792, + "learning_rate": 4.5124314337494405e-06, + "loss": 0.6786, + "step": 4459 + }, + { + "epoch": 1.2354570637119113, + "grad_norm": 0.7604243159294128, + "learning_rate": 4.5122152777972215e-06, + "loss": 0.654, + "step": 4460 + }, + { + "epoch": 1.2357340720221606, + "grad_norm": 0.6927931308746338, + "learning_rate": 4.5119990791205335e-06, + "loss": 0.6272, + "step": 4461 + }, + { + "epoch": 1.23601108033241, + "grad_norm": 0.7042927742004395, + "learning_rate": 4.511782837723968e-06, + "loss": 0.6764, + "step": 4462 + }, + { + "epoch": 1.2362880886426593, + "grad_norm": 0.7222568988800049, + "learning_rate": 4.5115665536121145e-06, + "loss": 0.6519, + "step": 4463 + }, + { + "epoch": 1.2365650969529085, + "grad_norm": 0.7020226120948792, + "learning_rate": 4.511350226789566e-06, + "loss": 0.6217, + "step": 4464 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.7381118535995483, + "learning_rate": 4.511133857260917e-06, + "loss": 0.6656, + "step": 4465 + }, + { + "epoch": 1.2371191135734072, + "grad_norm": 0.7156130075454712, + "learning_rate": 4.510917445030759e-06, + "loss": 0.5877, + "step": 4466 + }, + { + "epoch": 1.2373961218836564, + "grad_norm": 0.682116687297821, + "learning_rate": 4.510700990103689e-06, + "loss": 0.5929, + "step": 4467 + }, + { + "epoch": 1.237673130193906, + "grad_norm": 0.7004262208938599, + "learning_rate": 4.510484492484302e-06, + "loss": 0.6637, + "step": 4468 + }, + { + "epoch": 1.2379501385041551, + "grad_norm": 0.7237050533294678, + "learning_rate": 4.510267952177195e-06, + "loss": 0.6004, + "step": 4469 + }, + { + "epoch": 1.2382271468144044, + "grad_norm": 0.7384737730026245, + "learning_rate": 4.510051369186965e-06, + "loss": 0.6523, + "step": 4470 + }, + { + "epoch": 1.2385041551246538, + "grad_norm": 0.6756117343902588, + "learning_rate": 4.509834743518212e-06, + "loss": 0.6028, + "step": 4471 + }, + { + "epoch": 1.238781163434903, + "grad_norm": 0.656095564365387, + "learning_rate": 4.5096180751755345e-06, + "loss": 0.5923, + "step": 4472 + }, + { + "epoch": 1.2390581717451523, + "grad_norm": 0.7115722894668579, + "learning_rate": 4.509401364163534e-06, + "loss": 0.6651, + "step": 4473 + }, + { + "epoch": 1.2393351800554018, + "grad_norm": 0.694269597530365, + "learning_rate": 4.509184610486811e-06, + "loss": 0.6496, + "step": 4474 + }, + { + "epoch": 1.239612188365651, + "grad_norm": 0.8465580940246582, + "learning_rate": 4.508967814149967e-06, + "loss": 0.6187, + "step": 4475 + }, + { + "epoch": 1.2398891966759003, + "grad_norm": 0.7109991312026978, + "learning_rate": 4.508750975157606e-06, + "loss": 0.6393, + "step": 4476 + }, + { + "epoch": 1.2401662049861495, + "grad_norm": 0.6904207468032837, + "learning_rate": 4.508534093514333e-06, + "loss": 0.6075, + "step": 4477 + }, + { + "epoch": 1.240443213296399, + "grad_norm": 0.7130043506622314, + "learning_rate": 4.508317169224752e-06, + "loss": 0.6706, + "step": 4478 + }, + { + "epoch": 1.2407202216066482, + "grad_norm": 0.6982183456420898, + "learning_rate": 4.508100202293468e-06, + "loss": 0.6405, + "step": 4479 + }, + { + "epoch": 1.2409972299168974, + "grad_norm": 0.697863757610321, + "learning_rate": 4.507883192725089e-06, + "loss": 0.6528, + "step": 4480 + }, + { + "epoch": 1.2412742382271469, + "grad_norm": 0.7692074179649353, + "learning_rate": 4.5076661405242215e-06, + "loss": 0.6911, + "step": 4481 + }, + { + "epoch": 1.2415512465373961, + "grad_norm": 0.6971391439437866, + "learning_rate": 4.507449045695475e-06, + "loss": 0.6221, + "step": 4482 + }, + { + "epoch": 1.2418282548476454, + "grad_norm": 0.7123902440071106, + "learning_rate": 4.507231908243459e-06, + "loss": 0.6344, + "step": 4483 + }, + { + "epoch": 1.2421052631578948, + "grad_norm": 0.6307297348976135, + "learning_rate": 4.507014728172784e-06, + "loss": 0.5585, + "step": 4484 + }, + { + "epoch": 1.242382271468144, + "grad_norm": 0.6647037863731384, + "learning_rate": 4.506797505488061e-06, + "loss": 0.611, + "step": 4485 + }, + { + "epoch": 1.2426592797783933, + "grad_norm": 0.7097634673118591, + "learning_rate": 4.506580240193902e-06, + "loss": 0.6426, + "step": 4486 + }, + { + "epoch": 1.2429362880886425, + "grad_norm": 0.7197310924530029, + "learning_rate": 4.50636293229492e-06, + "loss": 0.6631, + "step": 4487 + }, + { + "epoch": 1.243213296398892, + "grad_norm": 0.733557403087616, + "learning_rate": 4.50614558179573e-06, + "loss": 0.7106, + "step": 4488 + }, + { + "epoch": 1.2434903047091412, + "grad_norm": 0.6763700246810913, + "learning_rate": 4.505928188700946e-06, + "loss": 0.641, + "step": 4489 + }, + { + "epoch": 1.2437673130193905, + "grad_norm": 0.6792541146278381, + "learning_rate": 4.505710753015183e-06, + "loss": 0.6299, + "step": 4490 + }, + { + "epoch": 1.24404432132964, + "grad_norm": 0.6903266310691833, + "learning_rate": 4.505493274743059e-06, + "loss": 0.6145, + "step": 4491 + }, + { + "epoch": 1.2443213296398892, + "grad_norm": 0.683650553226471, + "learning_rate": 4.505275753889193e-06, + "loss": 0.6278, + "step": 4492 + }, + { + "epoch": 1.2445983379501384, + "grad_norm": 0.6768408417701721, + "learning_rate": 4.5050581904581995e-06, + "loss": 0.6286, + "step": 4493 + }, + { + "epoch": 1.2448753462603879, + "grad_norm": 0.7079578042030334, + "learning_rate": 4.5048405844547e-06, + "loss": 0.677, + "step": 4494 + }, + { + "epoch": 1.245152354570637, + "grad_norm": 0.7316173315048218, + "learning_rate": 4.504622935883317e-06, + "loss": 0.6535, + "step": 4495 + }, + { + "epoch": 1.2454293628808863, + "grad_norm": 0.710331380367279, + "learning_rate": 4.504405244748669e-06, + "loss": 0.6437, + "step": 4496 + }, + { + "epoch": 1.2457063711911358, + "grad_norm": 0.7169858813285828, + "learning_rate": 4.504187511055379e-06, + "loss": 0.6394, + "step": 4497 + }, + { + "epoch": 1.245983379501385, + "grad_norm": 0.7219961285591125, + "learning_rate": 4.50396973480807e-06, + "loss": 0.6137, + "step": 4498 + }, + { + "epoch": 1.2462603878116343, + "grad_norm": 0.6625384092330933, + "learning_rate": 4.503751916011366e-06, + "loss": 0.6166, + "step": 4499 + }, + { + "epoch": 1.2465373961218837, + "grad_norm": 0.6969285011291504, + "learning_rate": 4.5035340546698915e-06, + "loss": 0.6553, + "step": 4500 + }, + { + "epoch": 1.246814404432133, + "grad_norm": 0.7164677381515503, + "learning_rate": 4.5033161507882735e-06, + "loss": 0.6162, + "step": 4501 + }, + { + "epoch": 1.2470914127423822, + "grad_norm": 0.6780024766921997, + "learning_rate": 4.503098204371137e-06, + "loss": 0.6477, + "step": 4502 + }, + { + "epoch": 1.2473684210526317, + "grad_norm": 0.6793859601020813, + "learning_rate": 4.50288021542311e-06, + "loss": 0.639, + "step": 4503 + }, + { + "epoch": 1.247645429362881, + "grad_norm": 0.6692623496055603, + "learning_rate": 4.5026621839488216e-06, + "loss": 0.6533, + "step": 4504 + }, + { + "epoch": 1.2479224376731302, + "grad_norm": 0.6580060720443726, + "learning_rate": 4.502444109952901e-06, + "loss": 0.635, + "step": 4505 + }, + { + "epoch": 1.2481994459833796, + "grad_norm": 0.6871946454048157, + "learning_rate": 4.502225993439978e-06, + "loss": 0.6506, + "step": 4506 + }, + { + "epoch": 1.2484764542936289, + "grad_norm": 0.6828311085700989, + "learning_rate": 4.502007834414684e-06, + "loss": 0.6284, + "step": 4507 + }, + { + "epoch": 1.248753462603878, + "grad_norm": 0.6815675497055054, + "learning_rate": 4.5017896328816515e-06, + "loss": 0.6438, + "step": 4508 + }, + { + "epoch": 1.2490304709141273, + "grad_norm": 0.6794790029525757, + "learning_rate": 4.501571388845513e-06, + "loss": 0.6353, + "step": 4509 + }, + { + "epoch": 1.2493074792243768, + "grad_norm": 0.6996496319770813, + "learning_rate": 4.501353102310901e-06, + "loss": 0.5925, + "step": 4510 + }, + { + "epoch": 1.249584487534626, + "grad_norm": 0.7551298141479492, + "learning_rate": 4.501134773282453e-06, + "loss": 0.6096, + "step": 4511 + }, + { + "epoch": 1.2498614958448753, + "grad_norm": 0.6573173403739929, + "learning_rate": 4.500916401764803e-06, + "loss": 0.6343, + "step": 4512 + }, + { + "epoch": 1.2501385041551247, + "grad_norm": 0.7132793068885803, + "learning_rate": 4.500697987762589e-06, + "loss": 0.6373, + "step": 4513 + }, + { + "epoch": 1.250415512465374, + "grad_norm": 0.6720200181007385, + "learning_rate": 4.500479531280447e-06, + "loss": 0.628, + "step": 4514 + }, + { + "epoch": 1.2506925207756232, + "grad_norm": 0.6969121098518372, + "learning_rate": 4.500261032323015e-06, + "loss": 0.655, + "step": 4515 + }, + { + "epoch": 1.2509695290858724, + "grad_norm": 0.6909615993499756, + "learning_rate": 4.500042490894935e-06, + "loss": 0.6342, + "step": 4516 + }, + { + "epoch": 1.251246537396122, + "grad_norm": 0.7056781053543091, + "learning_rate": 4.499823907000844e-06, + "loss": 0.6741, + "step": 4517 + }, + { + "epoch": 1.2515235457063711, + "grad_norm": 0.6950377821922302, + "learning_rate": 4.499605280645385e-06, + "loss": 0.6234, + "step": 4518 + }, + { + "epoch": 1.2518005540166204, + "grad_norm": 0.7027127146720886, + "learning_rate": 4.499386611833199e-06, + "loss": 0.6298, + "step": 4519 + }, + { + "epoch": 1.2520775623268698, + "grad_norm": 0.7132726311683655, + "learning_rate": 4.499167900568929e-06, + "loss": 0.6417, + "step": 4520 + }, + { + "epoch": 1.252354570637119, + "grad_norm": 0.7127083539962769, + "learning_rate": 4.49894914685722e-06, + "loss": 0.6289, + "step": 4521 + }, + { + "epoch": 1.2526315789473683, + "grad_norm": 0.6999812126159668, + "learning_rate": 4.4987303507027155e-06, + "loss": 0.6629, + "step": 4522 + }, + { + "epoch": 1.2529085872576178, + "grad_norm": 0.7140102982521057, + "learning_rate": 4.4985115121100605e-06, + "loss": 0.6367, + "step": 4523 + }, + { + "epoch": 1.253185595567867, + "grad_norm": 0.6769750714302063, + "learning_rate": 4.4982926310839035e-06, + "loss": 0.6438, + "step": 4524 + }, + { + "epoch": 1.2534626038781163, + "grad_norm": 0.6684278249740601, + "learning_rate": 4.498073707628891e-06, + "loss": 0.6069, + "step": 4525 + }, + { + "epoch": 1.2537396121883657, + "grad_norm": 0.6692405939102173, + "learning_rate": 4.497854741749671e-06, + "loss": 0.615, + "step": 4526 + }, + { + "epoch": 1.254016620498615, + "grad_norm": 0.7372734546661377, + "learning_rate": 4.497635733450893e-06, + "loss": 0.5969, + "step": 4527 + }, + { + "epoch": 1.2542936288088642, + "grad_norm": 0.7145940661430359, + "learning_rate": 4.497416682737207e-06, + "loss": 0.6445, + "step": 4528 + }, + { + "epoch": 1.2545706371191137, + "grad_norm": 0.7145653963088989, + "learning_rate": 4.497197589613263e-06, + "loss": 0.6326, + "step": 4529 + }, + { + "epoch": 1.254847645429363, + "grad_norm": 0.6717661619186401, + "learning_rate": 4.496978454083716e-06, + "loss": 0.7085, + "step": 4530 + }, + { + "epoch": 1.2551246537396121, + "grad_norm": 0.6757134199142456, + "learning_rate": 4.496759276153215e-06, + "loss": 0.6329, + "step": 4531 + }, + { + "epoch": 1.2554016620498616, + "grad_norm": 0.7108446955680847, + "learning_rate": 4.4965400558264155e-06, + "loss": 0.6525, + "step": 4532 + }, + { + "epoch": 1.2556786703601108, + "grad_norm": 0.7189573049545288, + "learning_rate": 4.496320793107972e-06, + "loss": 0.6302, + "step": 4533 + }, + { + "epoch": 1.25595567867036, + "grad_norm": 0.7077795267105103, + "learning_rate": 4.496101488002541e-06, + "loss": 0.6495, + "step": 4534 + }, + { + "epoch": 1.2562326869806095, + "grad_norm": 0.722058892250061, + "learning_rate": 4.495882140514777e-06, + "loss": 0.6663, + "step": 4535 + }, + { + "epoch": 1.2565096952908588, + "grad_norm": 0.7322894334793091, + "learning_rate": 4.495662750649339e-06, + "loss": 0.6126, + "step": 4536 + }, + { + "epoch": 1.256786703601108, + "grad_norm": 0.7008985877037048, + "learning_rate": 4.495443318410883e-06, + "loss": 0.6237, + "step": 4537 + }, + { + "epoch": 1.2570637119113575, + "grad_norm": 0.7410510778427124, + "learning_rate": 4.4952238438040705e-06, + "loss": 0.6669, + "step": 4538 + }, + { + "epoch": 1.2573407202216067, + "grad_norm": 0.7088368535041809, + "learning_rate": 4.495004326833561e-06, + "loss": 0.5999, + "step": 4539 + }, + { + "epoch": 1.257617728531856, + "grad_norm": 0.712836742401123, + "learning_rate": 4.4947847675040135e-06, + "loss": 0.6145, + "step": 4540 + }, + { + "epoch": 1.2578947368421054, + "grad_norm": 0.6875491142272949, + "learning_rate": 4.494565165820093e-06, + "loss": 0.6499, + "step": 4541 + }, + { + "epoch": 1.2581717451523546, + "grad_norm": 0.7181865572929382, + "learning_rate": 4.49434552178646e-06, + "loss": 0.6741, + "step": 4542 + }, + { + "epoch": 1.2584487534626039, + "grad_norm": 0.639896810054779, + "learning_rate": 4.4941258354077774e-06, + "loss": 0.5928, + "step": 4543 + }, + { + "epoch": 1.2587257617728531, + "grad_norm": 0.7280617952346802, + "learning_rate": 4.493906106688712e-06, + "loss": 0.6431, + "step": 4544 + }, + { + "epoch": 1.2590027700831026, + "grad_norm": 0.6944310665130615, + "learning_rate": 4.493686335633928e-06, + "loss": 0.614, + "step": 4545 + }, + { + "epoch": 1.2592797783933518, + "grad_norm": 0.6686841249465942, + "learning_rate": 4.4934665222480914e-06, + "loss": 0.5946, + "step": 4546 + }, + { + "epoch": 1.259556786703601, + "grad_norm": 0.711683452129364, + "learning_rate": 4.49324666653587e-06, + "loss": 0.6002, + "step": 4547 + }, + { + "epoch": 1.2598337950138503, + "grad_norm": 0.7249209880828857, + "learning_rate": 4.493026768501931e-06, + "loss": 0.6424, + "step": 4548 + }, + { + "epoch": 1.2601108033240997, + "grad_norm": 0.6927335858345032, + "learning_rate": 4.492806828150945e-06, + "loss": 0.6605, + "step": 4549 + }, + { + "epoch": 1.260387811634349, + "grad_norm": 0.689573347568512, + "learning_rate": 4.4925868454875806e-06, + "loss": 0.6113, + "step": 4550 + }, + { + "epoch": 1.2606648199445982, + "grad_norm": 0.7189673781394958, + "learning_rate": 4.492366820516509e-06, + "loss": 0.6747, + "step": 4551 + }, + { + "epoch": 1.2609418282548477, + "grad_norm": 0.7115404009819031, + "learning_rate": 4.492146753242402e-06, + "loss": 0.6049, + "step": 4552 + }, + { + "epoch": 1.261218836565097, + "grad_norm": 0.6188634037971497, + "learning_rate": 4.491926643669931e-06, + "loss": 0.5518, + "step": 4553 + }, + { + "epoch": 1.2614958448753462, + "grad_norm": 0.7529398798942566, + "learning_rate": 4.491706491803772e-06, + "loss": 0.7053, + "step": 4554 + }, + { + "epoch": 1.2617728531855956, + "grad_norm": 0.6784997582435608, + "learning_rate": 4.4914862976485975e-06, + "loss": 0.6631, + "step": 4555 + }, + { + "epoch": 1.2620498614958449, + "grad_norm": 0.6835280656814575, + "learning_rate": 4.491266061209083e-06, + "loss": 0.6227, + "step": 4556 + }, + { + "epoch": 1.262326869806094, + "grad_norm": 0.7191388010978699, + "learning_rate": 4.4910457824899045e-06, + "loss": 0.6378, + "step": 4557 + }, + { + "epoch": 1.2626038781163436, + "grad_norm": 0.6956838965415955, + "learning_rate": 4.49082546149574e-06, + "loss": 0.6674, + "step": 4558 + }, + { + "epoch": 1.2628808864265928, + "grad_norm": 0.7452521324157715, + "learning_rate": 4.490605098231267e-06, + "loss": 0.6648, + "step": 4559 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.7133474946022034, + "learning_rate": 4.490384692701164e-06, + "loss": 0.5917, + "step": 4560 + }, + { + "epoch": 1.2634349030470915, + "grad_norm": 1.4470252990722656, + "learning_rate": 4.490164244910112e-06, + "loss": 0.6172, + "step": 4561 + }, + { + "epoch": 1.2637119113573407, + "grad_norm": 0.6451750993728638, + "learning_rate": 4.489943754862789e-06, + "loss": 0.5528, + "step": 4562 + }, + { + "epoch": 1.26398891966759, + "grad_norm": 0.7071599960327148, + "learning_rate": 4.489723222563879e-06, + "loss": 0.6244, + "step": 4563 + }, + { + "epoch": 1.2642659279778394, + "grad_norm": 0.716210663318634, + "learning_rate": 4.489502648018064e-06, + "loss": 0.6003, + "step": 4564 + }, + { + "epoch": 1.2645429362880887, + "grad_norm": 0.7129285335540771, + "learning_rate": 4.489282031230028e-06, + "loss": 0.6613, + "step": 4565 + }, + { + "epoch": 1.264819944598338, + "grad_norm": 0.7196843028068542, + "learning_rate": 4.4890613722044526e-06, + "loss": 0.626, + "step": 4566 + }, + { + "epoch": 1.2650969529085874, + "grad_norm": 0.6882171034812927, + "learning_rate": 4.488840670946026e-06, + "loss": 0.6542, + "step": 4567 + }, + { + "epoch": 1.2653739612188366, + "grad_norm": 0.7489578127861023, + "learning_rate": 4.488619927459433e-06, + "loss": 0.6923, + "step": 4568 + }, + { + "epoch": 1.2656509695290858, + "grad_norm": 0.6784943342208862, + "learning_rate": 4.48839914174936e-06, + "loss": 0.6528, + "step": 4569 + }, + { + "epoch": 1.2659279778393353, + "grad_norm": 0.7308769822120667, + "learning_rate": 4.488178313820496e-06, + "loss": 0.6789, + "step": 4570 + }, + { + "epoch": 1.2662049861495845, + "grad_norm": 0.7080146074295044, + "learning_rate": 4.487957443677529e-06, + "loss": 0.5962, + "step": 4571 + }, + { + "epoch": 1.2664819944598338, + "grad_norm": 0.7015159726142883, + "learning_rate": 4.487736531325149e-06, + "loss": 0.6787, + "step": 4572 + }, + { + "epoch": 1.266759002770083, + "grad_norm": 0.768822431564331, + "learning_rate": 4.487515576768046e-06, + "loss": 0.6674, + "step": 4573 + }, + { + "epoch": 1.2670360110803325, + "grad_norm": 0.6876410841941833, + "learning_rate": 4.487294580010911e-06, + "loss": 0.6382, + "step": 4574 + }, + { + "epoch": 1.2673130193905817, + "grad_norm": 0.6838921308517456, + "learning_rate": 4.487073541058438e-06, + "loss": 0.6155, + "step": 4575 + }, + { + "epoch": 1.267590027700831, + "grad_norm": 0.6943739652633667, + "learning_rate": 4.486852459915319e-06, + "loss": 0.6397, + "step": 4576 + }, + { + "epoch": 1.2678670360110802, + "grad_norm": 0.6962082386016846, + "learning_rate": 4.486631336586248e-06, + "loss": 0.6101, + "step": 4577 + }, + { + "epoch": 1.2681440443213297, + "grad_norm": 0.6831097602844238, + "learning_rate": 4.486410171075921e-06, + "loss": 0.6382, + "step": 4578 + }, + { + "epoch": 1.268421052631579, + "grad_norm": 0.6977545022964478, + "learning_rate": 4.486188963389034e-06, + "loss": 0.625, + "step": 4579 + }, + { + "epoch": 1.2686980609418281, + "grad_norm": 0.7188796997070312, + "learning_rate": 4.485967713530282e-06, + "loss": 0.6152, + "step": 4580 + }, + { + "epoch": 1.2689750692520776, + "grad_norm": 0.7160911560058594, + "learning_rate": 4.485746421504365e-06, + "loss": 0.6692, + "step": 4581 + }, + { + "epoch": 1.2692520775623268, + "grad_norm": 0.7020512223243713, + "learning_rate": 4.485525087315979e-06, + "loss": 0.6369, + "step": 4582 + }, + { + "epoch": 1.269529085872576, + "grad_norm": 0.7281891107559204, + "learning_rate": 4.485303710969825e-06, + "loss": 0.6518, + "step": 4583 + }, + { + "epoch": 1.2698060941828255, + "grad_norm": 0.6871817111968994, + "learning_rate": 4.485082292470604e-06, + "loss": 0.6156, + "step": 4584 + }, + { + "epoch": 1.2700831024930748, + "grad_norm": 0.7031907439231873, + "learning_rate": 4.484860831823017e-06, + "loss": 0.6422, + "step": 4585 + }, + { + "epoch": 1.270360110803324, + "grad_norm": 0.7040826678276062, + "learning_rate": 4.484639329031765e-06, + "loss": 0.6438, + "step": 4586 + }, + { + "epoch": 1.2706371191135735, + "grad_norm": 0.691525399684906, + "learning_rate": 4.4844177841015516e-06, + "loss": 0.6267, + "step": 4587 + }, + { + "epoch": 1.2709141274238227, + "grad_norm": 0.683026134967804, + "learning_rate": 4.484196197037082e-06, + "loss": 0.6341, + "step": 4588 + }, + { + "epoch": 1.271191135734072, + "grad_norm": 0.696869969367981, + "learning_rate": 4.483974567843059e-06, + "loss": 0.6229, + "step": 4589 + }, + { + "epoch": 1.2714681440443214, + "grad_norm": 0.765889048576355, + "learning_rate": 4.48375289652419e-06, + "loss": 0.6088, + "step": 4590 + }, + { + "epoch": 1.2717451523545706, + "grad_norm": 0.6921378374099731, + "learning_rate": 4.483531183085181e-06, + "loss": 0.5697, + "step": 4591 + }, + { + "epoch": 1.2720221606648199, + "grad_norm": 0.7679482698440552, + "learning_rate": 4.483309427530739e-06, + "loss": 0.6527, + "step": 4592 + }, + { + "epoch": 1.2722991689750693, + "grad_norm": 0.6497597098350525, + "learning_rate": 4.483087629865573e-06, + "loss": 0.526, + "step": 4593 + }, + { + "epoch": 1.2725761772853186, + "grad_norm": 0.7268680334091187, + "learning_rate": 4.482865790094394e-06, + "loss": 0.687, + "step": 4594 + }, + { + "epoch": 1.2728531855955678, + "grad_norm": 0.6968439221382141, + "learning_rate": 4.48264390822191e-06, + "loss": 0.6423, + "step": 4595 + }, + { + "epoch": 1.2731301939058173, + "grad_norm": 0.6854841709136963, + "learning_rate": 4.4824219842528315e-06, + "loss": 0.6677, + "step": 4596 + }, + { + "epoch": 1.2734072022160665, + "grad_norm": 0.6899726390838623, + "learning_rate": 4.482200018191872e-06, + "loss": 0.6266, + "step": 4597 + }, + { + "epoch": 1.2736842105263158, + "grad_norm": 0.7343031764030457, + "learning_rate": 4.481978010043745e-06, + "loss": 0.6459, + "step": 4598 + }, + { + "epoch": 1.2739612188365652, + "grad_norm": 0.6642977595329285, + "learning_rate": 4.4817559598131635e-06, + "loss": 0.6194, + "step": 4599 + }, + { + "epoch": 1.2742382271468145, + "grad_norm": 0.7220334410667419, + "learning_rate": 4.481533867504841e-06, + "loss": 0.6689, + "step": 4600 + }, + { + "epoch": 1.2745152354570637, + "grad_norm": 0.6452382206916809, + "learning_rate": 4.481311733123495e-06, + "loss": 0.6143, + "step": 4601 + }, + { + "epoch": 1.2747922437673131, + "grad_norm": 0.6728619337081909, + "learning_rate": 4.481089556673841e-06, + "loss": 0.6198, + "step": 4602 + }, + { + "epoch": 1.2750692520775624, + "grad_norm": 0.6677701473236084, + "learning_rate": 4.480867338160596e-06, + "loss": 0.5959, + "step": 4603 + }, + { + "epoch": 1.2753462603878116, + "grad_norm": 0.7178294062614441, + "learning_rate": 4.480645077588479e-06, + "loss": 0.6414, + "step": 4604 + }, + { + "epoch": 1.2756232686980609, + "grad_norm": 0.7325197458267212, + "learning_rate": 4.48042277496221e-06, + "loss": 0.6589, + "step": 4605 + }, + { + "epoch": 1.2759002770083103, + "grad_norm": 0.7167461514472961, + "learning_rate": 4.480200430286508e-06, + "loss": 0.6728, + "step": 4606 + }, + { + "epoch": 1.2761772853185596, + "grad_norm": 0.6576178073883057, + "learning_rate": 4.479978043566094e-06, + "loss": 0.655, + "step": 4607 + }, + { + "epoch": 1.2764542936288088, + "grad_norm": 0.6865749359130859, + "learning_rate": 4.4797556148056884e-06, + "loss": 0.6594, + "step": 4608 + }, + { + "epoch": 1.276731301939058, + "grad_norm": 0.7056397795677185, + "learning_rate": 4.479533144010016e-06, + "loss": 0.6249, + "step": 4609 + }, + { + "epoch": 1.2770083102493075, + "grad_norm": 0.6812729239463806, + "learning_rate": 4.4793106311838e-06, + "loss": 0.6174, + "step": 4610 + }, + { + "epoch": 1.2772853185595567, + "grad_norm": 0.6877213716506958, + "learning_rate": 4.4790880763317646e-06, + "loss": 0.6007, + "step": 4611 + }, + { + "epoch": 1.277562326869806, + "grad_norm": 0.7136980295181274, + "learning_rate": 4.478865479458635e-06, + "loss": 0.6883, + "step": 4612 + }, + { + "epoch": 1.2778393351800554, + "grad_norm": 0.7126525640487671, + "learning_rate": 4.4786428405691385e-06, + "loss": 0.6294, + "step": 4613 + }, + { + "epoch": 1.2781163434903047, + "grad_norm": 0.6921951770782471, + "learning_rate": 4.478420159668001e-06, + "loss": 0.6899, + "step": 4614 + }, + { + "epoch": 1.278393351800554, + "grad_norm": 0.7217085361480713, + "learning_rate": 4.478197436759951e-06, + "loss": 0.6774, + "step": 4615 + }, + { + "epoch": 1.2786703601108034, + "grad_norm": 0.6517031788825989, + "learning_rate": 4.477974671849718e-06, + "loss": 0.575, + "step": 4616 + }, + { + "epoch": 1.2789473684210526, + "grad_norm": 0.680118203163147, + "learning_rate": 4.477751864942032e-06, + "loss": 0.6299, + "step": 4617 + }, + { + "epoch": 1.2792243767313018, + "grad_norm": 0.7211265563964844, + "learning_rate": 4.477529016041623e-06, + "loss": 0.6159, + "step": 4618 + }, + { + "epoch": 1.2795013850415513, + "grad_norm": 0.7222548127174377, + "learning_rate": 4.477306125153223e-06, + "loss": 0.6827, + "step": 4619 + }, + { + "epoch": 1.2797783933518005, + "grad_norm": 0.7041400074958801, + "learning_rate": 4.477083192281564e-06, + "loss": 0.5943, + "step": 4620 + }, + { + "epoch": 1.2800554016620498, + "grad_norm": 0.6987934708595276, + "learning_rate": 4.476860217431381e-06, + "loss": 0.6259, + "step": 4621 + }, + { + "epoch": 1.2803324099722992, + "grad_norm": 0.7352740168571472, + "learning_rate": 4.4766372006074055e-06, + "loss": 0.6532, + "step": 4622 + }, + { + "epoch": 1.2806094182825485, + "grad_norm": 0.6798954010009766, + "learning_rate": 4.476414141814375e-06, + "loss": 0.6638, + "step": 4623 + }, + { + "epoch": 1.2808864265927977, + "grad_norm": 0.6747403740882874, + "learning_rate": 4.476191041057026e-06, + "loss": 0.6807, + "step": 4624 + }, + { + "epoch": 1.2811634349030472, + "grad_norm": 0.685394823551178, + "learning_rate": 4.475967898340094e-06, + "loss": 0.6137, + "step": 4625 + }, + { + "epoch": 1.2814404432132964, + "grad_norm": 0.7123876810073853, + "learning_rate": 4.4757447136683175e-06, + "loss": 0.5759, + "step": 4626 + }, + { + "epoch": 1.2817174515235457, + "grad_norm": 0.7212755084037781, + "learning_rate": 4.475521487046435e-06, + "loss": 0.6571, + "step": 4627 + }, + { + "epoch": 1.2819944598337951, + "grad_norm": 0.7282657027244568, + "learning_rate": 4.475298218479186e-06, + "loss": 0.6568, + "step": 4628 + }, + { + "epoch": 1.2822714681440444, + "grad_norm": 0.7013609409332275, + "learning_rate": 4.4750749079713124e-06, + "loss": 0.6175, + "step": 4629 + }, + { + "epoch": 1.2825484764542936, + "grad_norm": 0.6803429126739502, + "learning_rate": 4.474851555527554e-06, + "loss": 0.6389, + "step": 4630 + }, + { + "epoch": 1.282825484764543, + "grad_norm": 0.6631330251693726, + "learning_rate": 4.474628161152654e-06, + "loss": 0.6118, + "step": 4631 + }, + { + "epoch": 1.2831024930747923, + "grad_norm": 0.7086408734321594, + "learning_rate": 4.474404724851356e-06, + "loss": 0.612, + "step": 4632 + }, + { + "epoch": 1.2833795013850415, + "grad_norm": 0.7163555026054382, + "learning_rate": 4.474181246628404e-06, + "loss": 0.6794, + "step": 4633 + }, + { + "epoch": 1.283656509695291, + "grad_norm": 0.7133662104606628, + "learning_rate": 4.4739577264885415e-06, + "loss": 0.6819, + "step": 4634 + }, + { + "epoch": 1.2839335180055402, + "grad_norm": 0.7242811918258667, + "learning_rate": 4.473734164436516e-06, + "loss": 0.6825, + "step": 4635 + }, + { + "epoch": 1.2842105263157895, + "grad_norm": 0.6506803631782532, + "learning_rate": 4.473510560477073e-06, + "loss": 0.5928, + "step": 4636 + }, + { + "epoch": 1.2844875346260387, + "grad_norm": 0.6938612461090088, + "learning_rate": 4.473286914614962e-06, + "loss": 0.6575, + "step": 4637 + }, + { + "epoch": 1.2847645429362882, + "grad_norm": 0.679517924785614, + "learning_rate": 4.4730632268549305e-06, + "loss": 0.6791, + "step": 4638 + }, + { + "epoch": 1.2850415512465374, + "grad_norm": 0.6626391410827637, + "learning_rate": 4.472839497201728e-06, + "loss": 0.5879, + "step": 4639 + }, + { + "epoch": 1.2853185595567866, + "grad_norm": 0.677143394947052, + "learning_rate": 4.472615725660104e-06, + "loss": 0.6404, + "step": 4640 + }, + { + "epoch": 1.2855955678670359, + "grad_norm": 0.6869742274284363, + "learning_rate": 4.472391912234812e-06, + "loss": 0.5803, + "step": 4641 + }, + { + "epoch": 1.2858725761772853, + "grad_norm": 0.7295581102371216, + "learning_rate": 4.472168056930602e-06, + "loss": 0.6676, + "step": 4642 + }, + { + "epoch": 1.2861495844875346, + "grad_norm": 0.6919876337051392, + "learning_rate": 4.471944159752228e-06, + "loss": 0.6327, + "step": 4643 + }, + { + "epoch": 1.2864265927977838, + "grad_norm": 0.6825493574142456, + "learning_rate": 4.471720220704443e-06, + "loss": 0.6457, + "step": 4644 + }, + { + "epoch": 1.2867036011080333, + "grad_norm": 0.6969654560089111, + "learning_rate": 4.471496239792004e-06, + "loss": 0.6363, + "step": 4645 + }, + { + "epoch": 1.2869806094182825, + "grad_norm": 0.7185841798782349, + "learning_rate": 4.471272217019664e-06, + "loss": 0.6138, + "step": 4646 + }, + { + "epoch": 1.2872576177285318, + "grad_norm": 0.6602373719215393, + "learning_rate": 4.4710481523921816e-06, + "loss": 0.6046, + "step": 4647 + }, + { + "epoch": 1.2875346260387812, + "grad_norm": 0.7047215104103088, + "learning_rate": 4.470824045914312e-06, + "loss": 0.6184, + "step": 4648 + }, + { + "epoch": 1.2878116343490305, + "grad_norm": 0.7656547427177429, + "learning_rate": 4.470599897590816e-06, + "loss": 0.6555, + "step": 4649 + }, + { + "epoch": 1.2880886426592797, + "grad_norm": 0.7046623826026917, + "learning_rate": 4.470375707426452e-06, + "loss": 0.6694, + "step": 4650 + }, + { + "epoch": 1.2883656509695292, + "grad_norm": 0.6936684846878052, + "learning_rate": 4.470151475425979e-06, + "loss": 0.6186, + "step": 4651 + }, + { + "epoch": 1.2886426592797784, + "grad_norm": 0.7161130905151367, + "learning_rate": 4.46992720159416e-06, + "loss": 0.6827, + "step": 4652 + }, + { + "epoch": 1.2889196675900276, + "grad_norm": 0.670766294002533, + "learning_rate": 4.469702885935755e-06, + "loss": 0.5889, + "step": 4653 + }, + { + "epoch": 1.289196675900277, + "grad_norm": 0.6470814347267151, + "learning_rate": 4.469478528455529e-06, + "loss": 0.6004, + "step": 4654 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.7229974865913391, + "learning_rate": 4.469254129158242e-06, + "loss": 0.6121, + "step": 4655 + }, + { + "epoch": 1.2897506925207756, + "grad_norm": 0.6630874276161194, + "learning_rate": 4.469029688048663e-06, + "loss": 0.6844, + "step": 4656 + }, + { + "epoch": 1.290027700831025, + "grad_norm": 0.6928765773773193, + "learning_rate": 4.468805205131555e-06, + "loss": 0.6489, + "step": 4657 + }, + { + "epoch": 1.2903047091412743, + "grad_norm": 0.7021420001983643, + "learning_rate": 4.4685806804116835e-06, + "loss": 0.624, + "step": 4658 + }, + { + "epoch": 1.2905817174515235, + "grad_norm": 0.7199434041976929, + "learning_rate": 4.468356113893818e-06, + "loss": 0.6424, + "step": 4659 + }, + { + "epoch": 1.290858725761773, + "grad_norm": 0.678230881690979, + "learning_rate": 4.4681315055827255e-06, + "loss": 0.5948, + "step": 4660 + }, + { + "epoch": 1.2911357340720222, + "grad_norm": 0.6820054650306702, + "learning_rate": 4.467906855483175e-06, + "loss": 0.648, + "step": 4661 + }, + { + "epoch": 1.2914127423822714, + "grad_norm": 0.7020586729049683, + "learning_rate": 4.467682163599937e-06, + "loss": 0.648, + "step": 4662 + }, + { + "epoch": 1.291689750692521, + "grad_norm": 0.7439424991607666, + "learning_rate": 4.467457429937781e-06, + "loss": 0.6649, + "step": 4663 + }, + { + "epoch": 1.2919667590027701, + "grad_norm": 0.7275158166885376, + "learning_rate": 4.46723265450148e-06, + "loss": 0.6516, + "step": 4664 + }, + { + "epoch": 1.2922437673130194, + "grad_norm": 0.6814894676208496, + "learning_rate": 4.467007837295805e-06, + "loss": 0.6106, + "step": 4665 + }, + { + "epoch": 1.2925207756232688, + "grad_norm": 0.721579372882843, + "learning_rate": 4.466782978325531e-06, + "loss": 0.6157, + "step": 4666 + }, + { + "epoch": 1.292797783933518, + "grad_norm": 0.7090845108032227, + "learning_rate": 4.466558077595432e-06, + "loss": 0.685, + "step": 4667 + }, + { + "epoch": 1.2930747922437673, + "grad_norm": 0.661973237991333, + "learning_rate": 4.466333135110283e-06, + "loss": 0.6404, + "step": 4668 + }, + { + "epoch": 1.2933518005540166, + "grad_norm": 0.7091084718704224, + "learning_rate": 4.466108150874859e-06, + "loss": 0.6804, + "step": 4669 + }, + { + "epoch": 1.293628808864266, + "grad_norm": 0.6906969547271729, + "learning_rate": 4.465883124893939e-06, + "loss": 0.6756, + "step": 4670 + }, + { + "epoch": 1.2939058171745152, + "grad_norm": 0.6625245809555054, + "learning_rate": 4.465658057172301e-06, + "loss": 0.6563, + "step": 4671 + }, + { + "epoch": 1.2941828254847645, + "grad_norm": 0.6626020669937134, + "learning_rate": 4.46543294771472e-06, + "loss": 0.6524, + "step": 4672 + }, + { + "epoch": 1.2944598337950137, + "grad_norm": 0.7468387484550476, + "learning_rate": 4.465207796525981e-06, + "loss": 0.6784, + "step": 4673 + }, + { + "epoch": 1.2947368421052632, + "grad_norm": 0.7022821307182312, + "learning_rate": 4.46498260361086e-06, + "loss": 0.6629, + "step": 4674 + }, + { + "epoch": 1.2950138504155124, + "grad_norm": 0.7802258729934692, + "learning_rate": 4.464757368974142e-06, + "loss": 0.6862, + "step": 4675 + }, + { + "epoch": 1.2952908587257617, + "grad_norm": 0.7289448976516724, + "learning_rate": 4.464532092620607e-06, + "loss": 0.6343, + "step": 4676 + }, + { + "epoch": 1.2955678670360111, + "grad_norm": 0.68182373046875, + "learning_rate": 4.464306774555038e-06, + "loss": 0.6641, + "step": 4677 + }, + { + "epoch": 1.2958448753462604, + "grad_norm": 0.6709579229354858, + "learning_rate": 4.46408141478222e-06, + "loss": 0.6494, + "step": 4678 + }, + { + "epoch": 1.2961218836565096, + "grad_norm": 0.724901556968689, + "learning_rate": 4.463856013306939e-06, + "loss": 0.6506, + "step": 4679 + }, + { + "epoch": 1.296398891966759, + "grad_norm": 0.7291810512542725, + "learning_rate": 4.463630570133978e-06, + "loss": 0.6156, + "step": 4680 + }, + { + "epoch": 1.2966759002770083, + "grad_norm": 0.7094226479530334, + "learning_rate": 4.463405085268127e-06, + "loss": 0.6355, + "step": 4681 + }, + { + "epoch": 1.2969529085872575, + "grad_norm": 0.7184739112854004, + "learning_rate": 4.463179558714171e-06, + "loss": 0.6385, + "step": 4682 + }, + { + "epoch": 1.297229916897507, + "grad_norm": 0.6918147802352905, + "learning_rate": 4.4629539904769e-06, + "loss": 0.6432, + "step": 4683 + }, + { + "epoch": 1.2975069252077562, + "grad_norm": 0.6969414949417114, + "learning_rate": 4.462728380561103e-06, + "loss": 0.621, + "step": 4684 + }, + { + "epoch": 1.2977839335180055, + "grad_norm": 0.7299537062644958, + "learning_rate": 4.46250272897157e-06, + "loss": 0.7035, + "step": 4685 + }, + { + "epoch": 1.298060941828255, + "grad_norm": 0.7747761607170105, + "learning_rate": 4.462277035713093e-06, + "loss": 0.587, + "step": 4686 + }, + { + "epoch": 1.2983379501385042, + "grad_norm": 0.7294924855232239, + "learning_rate": 4.462051300790463e-06, + "loss": 0.6202, + "step": 4687 + }, + { + "epoch": 1.2986149584487534, + "grad_norm": 0.7223609685897827, + "learning_rate": 4.461825524208473e-06, + "loss": 0.6515, + "step": 4688 + }, + { + "epoch": 1.2988919667590029, + "grad_norm": 0.6905951499938965, + "learning_rate": 4.461599705971918e-06, + "loss": 0.6466, + "step": 4689 + }, + { + "epoch": 1.299168975069252, + "grad_norm": 0.7002838253974915, + "learning_rate": 4.4613738460855905e-06, + "loss": 0.6579, + "step": 4690 + }, + { + "epoch": 1.2994459833795013, + "grad_norm": 0.6855634450912476, + "learning_rate": 4.461147944554288e-06, + "loss": 0.6083, + "step": 4691 + }, + { + "epoch": 1.2997229916897508, + "grad_norm": 0.6717777848243713, + "learning_rate": 4.460922001382807e-06, + "loss": 0.6412, + "step": 4692 + }, + { + "epoch": 1.3, + "grad_norm": 0.685459554195404, + "learning_rate": 4.460696016575944e-06, + "loss": 0.6333, + "step": 4693 + }, + { + "epoch": 1.3002770083102493, + "grad_norm": 0.6794373393058777, + "learning_rate": 4.460469990138497e-06, + "loss": 0.6755, + "step": 4694 + }, + { + "epoch": 1.3005540166204987, + "grad_norm": 0.6875943541526794, + "learning_rate": 4.4602439220752665e-06, + "loss": 0.6295, + "step": 4695 + }, + { + "epoch": 1.300831024930748, + "grad_norm": 0.6423923373222351, + "learning_rate": 4.460017812391051e-06, + "loss": 0.6523, + "step": 4696 + }, + { + "epoch": 1.3011080332409972, + "grad_norm": 0.6820371150970459, + "learning_rate": 4.459791661090652e-06, + "loss": 0.5964, + "step": 4697 + }, + { + "epoch": 1.3013850415512465, + "grad_norm": 0.6558367013931274, + "learning_rate": 4.4595654681788715e-06, + "loss": 0.6141, + "step": 4698 + }, + { + "epoch": 1.301662049861496, + "grad_norm": 0.7163952589035034, + "learning_rate": 4.459339233660512e-06, + "loss": 0.6544, + "step": 4699 + }, + { + "epoch": 1.3019390581717452, + "grad_norm": 0.7021366357803345, + "learning_rate": 4.459112957540377e-06, + "loss": 0.6427, + "step": 4700 + }, + { + "epoch": 1.3022160664819944, + "grad_norm": 0.7018139362335205, + "learning_rate": 4.45888663982327e-06, + "loss": 0.6041, + "step": 4701 + }, + { + "epoch": 1.3024930747922436, + "grad_norm": 0.6925555467605591, + "learning_rate": 4.458660280513998e-06, + "loss": 0.6071, + "step": 4702 + }, + { + "epoch": 1.302770083102493, + "grad_norm": 0.7855971455574036, + "learning_rate": 4.458433879617367e-06, + "loss": 0.6324, + "step": 4703 + }, + { + "epoch": 1.3030470914127423, + "grad_norm": 0.6972799301147461, + "learning_rate": 4.458207437138182e-06, + "loss": 0.6331, + "step": 4704 + }, + { + "epoch": 1.3033240997229916, + "grad_norm": 0.6723865866661072, + "learning_rate": 4.4579809530812535e-06, + "loss": 0.6135, + "step": 4705 + }, + { + "epoch": 1.303601108033241, + "grad_norm": 0.7214810848236084, + "learning_rate": 4.457754427451389e-06, + "loss": 0.6299, + "step": 4706 + }, + { + "epoch": 1.3038781163434903, + "grad_norm": 0.7141785621643066, + "learning_rate": 4.4575278602533975e-06, + "loss": 0.6764, + "step": 4707 + }, + { + "epoch": 1.3041551246537395, + "grad_norm": 0.6679520010948181, + "learning_rate": 4.457301251492092e-06, + "loss": 0.636, + "step": 4708 + }, + { + "epoch": 1.304432132963989, + "grad_norm": 0.695136308670044, + "learning_rate": 4.457074601172282e-06, + "loss": 0.6668, + "step": 4709 + }, + { + "epoch": 1.3047091412742382, + "grad_norm": 0.7452983260154724, + "learning_rate": 4.45684790929878e-06, + "loss": 0.63, + "step": 4710 + }, + { + "epoch": 1.3049861495844874, + "grad_norm": 0.673240602016449, + "learning_rate": 4.4566211758764e-06, + "loss": 0.6409, + "step": 4711 + }, + { + "epoch": 1.305263157894737, + "grad_norm": 0.6913855671882629, + "learning_rate": 4.456394400909956e-06, + "loss": 0.6333, + "step": 4712 + }, + { + "epoch": 1.3055401662049861, + "grad_norm": 0.6479515433311462, + "learning_rate": 4.456167584404262e-06, + "loss": 0.5998, + "step": 4713 + }, + { + "epoch": 1.3058171745152354, + "grad_norm": 0.7309516072273254, + "learning_rate": 4.455940726364135e-06, + "loss": 0.6845, + "step": 4714 + }, + { + "epoch": 1.3060941828254848, + "grad_norm": 0.6839296817779541, + "learning_rate": 4.455713826794391e-06, + "loss": 0.6567, + "step": 4715 + }, + { + "epoch": 1.306371191135734, + "grad_norm": 0.6930821537971497, + "learning_rate": 4.455486885699849e-06, + "loss": 0.5875, + "step": 4716 + }, + { + "epoch": 1.3066481994459833, + "grad_norm": 0.7218079566955566, + "learning_rate": 4.455259903085326e-06, + "loss": 0.6062, + "step": 4717 + }, + { + "epoch": 1.3069252077562328, + "grad_norm": 0.6791582107543945, + "learning_rate": 4.455032878955643e-06, + "loss": 0.6485, + "step": 4718 + }, + { + "epoch": 1.307202216066482, + "grad_norm": 0.690004825592041, + "learning_rate": 4.454805813315619e-06, + "loss": 0.6284, + "step": 4719 + }, + { + "epoch": 1.3074792243767313, + "grad_norm": 0.7091121673583984, + "learning_rate": 4.454578706170075e-06, + "loss": 0.5778, + "step": 4720 + }, + { + "epoch": 1.3077562326869807, + "grad_norm": 0.6884284615516663, + "learning_rate": 4.4543515575238336e-06, + "loss": 0.6224, + "step": 4721 + }, + { + "epoch": 1.30803324099723, + "grad_norm": 0.7247439622879028, + "learning_rate": 4.454124367381718e-06, + "loss": 0.6518, + "step": 4722 + }, + { + "epoch": 1.3083102493074792, + "grad_norm": 0.7225871682167053, + "learning_rate": 4.453897135748552e-06, + "loss": 0.6391, + "step": 4723 + }, + { + "epoch": 1.3085872576177286, + "grad_norm": 0.7072814702987671, + "learning_rate": 4.45366986262916e-06, + "loss": 0.6758, + "step": 4724 + }, + { + "epoch": 1.3088642659279779, + "grad_norm": 0.7352576851844788, + "learning_rate": 4.453442548028367e-06, + "loss": 0.6547, + "step": 4725 + }, + { + "epoch": 1.3091412742382271, + "grad_norm": 0.7073747515678406, + "learning_rate": 4.453215191951002e-06, + "loss": 0.594, + "step": 4726 + }, + { + "epoch": 1.3094182825484766, + "grad_norm": 0.6708406805992126, + "learning_rate": 4.452987794401889e-06, + "loss": 0.6303, + "step": 4727 + }, + { + "epoch": 1.3096952908587258, + "grad_norm": 0.7301291823387146, + "learning_rate": 4.452760355385858e-06, + "loss": 0.6228, + "step": 4728 + }, + { + "epoch": 1.309972299168975, + "grad_norm": 0.7150014638900757, + "learning_rate": 4.452532874907738e-06, + "loss": 0.6325, + "step": 4729 + }, + { + "epoch": 1.3102493074792243, + "grad_norm": 0.6688306331634521, + "learning_rate": 4.452305352972359e-06, + "loss": 0.6629, + "step": 4730 + }, + { + "epoch": 1.3105263157894738, + "grad_norm": 0.6686022877693176, + "learning_rate": 4.452077789584551e-06, + "loss": 0.6015, + "step": 4731 + }, + { + "epoch": 1.310803324099723, + "grad_norm": 0.728378176689148, + "learning_rate": 4.451850184749147e-06, + "loss": 0.6728, + "step": 4732 + }, + { + "epoch": 1.3110803324099722, + "grad_norm": 0.6999015212059021, + "learning_rate": 4.451622538470979e-06, + "loss": 0.6272, + "step": 4733 + }, + { + "epoch": 1.3113573407202215, + "grad_norm": 0.7011626362800598, + "learning_rate": 4.451394850754881e-06, + "loss": 0.6667, + "step": 4734 + }, + { + "epoch": 1.311634349030471, + "grad_norm": 0.7036128044128418, + "learning_rate": 4.4511671216056865e-06, + "loss": 0.6376, + "step": 4735 + }, + { + "epoch": 1.3119113573407202, + "grad_norm": 0.6627241373062134, + "learning_rate": 4.450939351028231e-06, + "loss": 0.6393, + "step": 4736 + }, + { + "epoch": 1.3121883656509694, + "grad_norm": 0.692579448223114, + "learning_rate": 4.450711539027352e-06, + "loss": 0.6152, + "step": 4737 + }, + { + "epoch": 1.3124653739612189, + "grad_norm": 0.6422987580299377, + "learning_rate": 4.450483685607884e-06, + "loss": 0.6209, + "step": 4738 + }, + { + "epoch": 1.3127423822714681, + "grad_norm": 0.6677369475364685, + "learning_rate": 4.450255790774667e-06, + "loss": 0.6189, + "step": 4739 + }, + { + "epoch": 1.3130193905817173, + "grad_norm": 0.696083664894104, + "learning_rate": 4.450027854532539e-06, + "loss": 0.6347, + "step": 4740 + }, + { + "epoch": 1.3132963988919668, + "grad_norm": 0.702476441860199, + "learning_rate": 4.449799876886341e-06, + "loss": 0.6398, + "step": 4741 + }, + { + "epoch": 1.313573407202216, + "grad_norm": 0.7123687267303467, + "learning_rate": 4.449571857840911e-06, + "loss": 0.6288, + "step": 4742 + }, + { + "epoch": 1.3138504155124653, + "grad_norm": 0.7451086044311523, + "learning_rate": 4.449343797401092e-06, + "loss": 0.656, + "step": 4743 + }, + { + "epoch": 1.3141274238227147, + "grad_norm": 0.7208288311958313, + "learning_rate": 4.449115695571727e-06, + "loss": 0.6844, + "step": 4744 + }, + { + "epoch": 1.314404432132964, + "grad_norm": 0.6745233535766602, + "learning_rate": 4.448887552357658e-06, + "loss": 0.6571, + "step": 4745 + }, + { + "epoch": 1.3146814404432132, + "grad_norm": 0.7078210115432739, + "learning_rate": 4.448659367763729e-06, + "loss": 0.6656, + "step": 4746 + }, + { + "epoch": 1.3149584487534627, + "grad_norm": 0.6784594058990479, + "learning_rate": 4.448431141794784e-06, + "loss": 0.6252, + "step": 4747 + }, + { + "epoch": 1.315235457063712, + "grad_norm": 0.7477366328239441, + "learning_rate": 4.4482028744556725e-06, + "loss": 0.647, + "step": 4748 + }, + { + "epoch": 1.3155124653739612, + "grad_norm": 0.7212636470794678, + "learning_rate": 4.447974565751237e-06, + "loss": 0.6541, + "step": 4749 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.7099882960319519, + "learning_rate": 4.447746215686328e-06, + "loss": 0.625, + "step": 4750 + }, + { + "epoch": 1.3160664819944599, + "grad_norm": 0.6715225577354431, + "learning_rate": 4.447517824265793e-06, + "loss": 0.6326, + "step": 4751 + }, + { + "epoch": 1.316343490304709, + "grad_norm": 0.7152224779129028, + "learning_rate": 4.44728939149448e-06, + "loss": 0.6436, + "step": 4752 + }, + { + "epoch": 1.3166204986149586, + "grad_norm": 0.6906647086143494, + "learning_rate": 4.44706091737724e-06, + "loss": 0.6627, + "step": 4753 + }, + { + "epoch": 1.3168975069252078, + "grad_norm": 0.7126035094261169, + "learning_rate": 4.446832401918927e-06, + "loss": 0.6472, + "step": 4754 + }, + { + "epoch": 1.317174515235457, + "grad_norm": 0.6664754748344421, + "learning_rate": 4.446603845124388e-06, + "loss": 0.6275, + "step": 4755 + }, + { + "epoch": 1.3174515235457065, + "grad_norm": 0.654424786567688, + "learning_rate": 4.4463752469984796e-06, + "loss": 0.6258, + "step": 4756 + }, + { + "epoch": 1.3177285318559557, + "grad_norm": 0.7223296165466309, + "learning_rate": 4.446146607546053e-06, + "loss": 0.6377, + "step": 4757 + }, + { + "epoch": 1.318005540166205, + "grad_norm": 0.7383649349212646, + "learning_rate": 4.445917926771966e-06, + "loss": 0.6745, + "step": 4758 + }, + { + "epoch": 1.3182825484764544, + "grad_norm": 0.6735358834266663, + "learning_rate": 4.445689204681071e-06, + "loss": 0.6315, + "step": 4759 + }, + { + "epoch": 1.3185595567867037, + "grad_norm": 0.6980936527252197, + "learning_rate": 4.445460441278225e-06, + "loss": 0.6214, + "step": 4760 + }, + { + "epoch": 1.318836565096953, + "grad_norm": 0.7276691794395447, + "learning_rate": 4.445231636568286e-06, + "loss": 0.671, + "step": 4761 + }, + { + "epoch": 1.3191135734072021, + "grad_norm": 0.7002887725830078, + "learning_rate": 4.445002790556111e-06, + "loss": 0.6264, + "step": 4762 + }, + { + "epoch": 1.3193905817174516, + "grad_norm": 0.7233803868293762, + "learning_rate": 4.444773903246561e-06, + "loss": 0.6311, + "step": 4763 + }, + { + "epoch": 1.3196675900277008, + "grad_norm": 0.7100148797035217, + "learning_rate": 4.444544974644493e-06, + "loss": 0.6274, + "step": 4764 + }, + { + "epoch": 1.31994459833795, + "grad_norm": 0.6861152648925781, + "learning_rate": 4.4443160047547716e-06, + "loss": 0.6194, + "step": 4765 + }, + { + "epoch": 1.3202216066481993, + "grad_norm": 0.7061241865158081, + "learning_rate": 4.444086993582255e-06, + "loss": 0.6851, + "step": 4766 + }, + { + "epoch": 1.3204986149584488, + "grad_norm": 0.6619032621383667, + "learning_rate": 4.443857941131807e-06, + "loss": 0.6126, + "step": 4767 + }, + { + "epoch": 1.320775623268698, + "grad_norm": 0.7326112985610962, + "learning_rate": 4.4436288474082905e-06, + "loss": 0.6274, + "step": 4768 + }, + { + "epoch": 1.3210526315789473, + "grad_norm": 0.6847018599510193, + "learning_rate": 4.44339971241657e-06, + "loss": 0.6524, + "step": 4769 + }, + { + "epoch": 1.3213296398891967, + "grad_norm": 0.6778106689453125, + "learning_rate": 4.443170536161512e-06, + "loss": 0.6166, + "step": 4770 + }, + { + "epoch": 1.321606648199446, + "grad_norm": 0.6772767901420593, + "learning_rate": 4.44294131864798e-06, + "loss": 0.6339, + "step": 4771 + }, + { + "epoch": 1.3218836565096952, + "grad_norm": 0.6837650537490845, + "learning_rate": 4.442712059880842e-06, + "loss": 0.6382, + "step": 4772 + }, + { + "epoch": 1.3221606648199447, + "grad_norm": 0.6830680966377258, + "learning_rate": 4.442482759864966e-06, + "loss": 0.6558, + "step": 4773 + }, + { + "epoch": 1.322437673130194, + "grad_norm": 0.7215142250061035, + "learning_rate": 4.442253418605221e-06, + "loss": 0.634, + "step": 4774 + }, + { + "epoch": 1.3227146814404431, + "grad_norm": 0.6660498380661011, + "learning_rate": 4.442024036106476e-06, + "loss": 0.6656, + "step": 4775 + }, + { + "epoch": 1.3229916897506926, + "grad_norm": 0.6940247416496277, + "learning_rate": 4.441794612373601e-06, + "loss": 0.6765, + "step": 4776 + }, + { + "epoch": 1.3232686980609418, + "grad_norm": 0.7326346635818481, + "learning_rate": 4.4415651474114664e-06, + "loss": 0.6547, + "step": 4777 + }, + { + "epoch": 1.323545706371191, + "grad_norm": 0.6846688389778137, + "learning_rate": 4.441335641224947e-06, + "loss": 0.6461, + "step": 4778 + }, + { + "epoch": 1.3238227146814405, + "grad_norm": 0.696536660194397, + "learning_rate": 4.441106093818914e-06, + "loss": 0.6428, + "step": 4779 + }, + { + "epoch": 1.3240997229916898, + "grad_norm": 0.7244516015052795, + "learning_rate": 4.440876505198242e-06, + "loss": 0.6311, + "step": 4780 + }, + { + "epoch": 1.324376731301939, + "grad_norm": 0.7271976470947266, + "learning_rate": 4.440646875367805e-06, + "loss": 0.6445, + "step": 4781 + }, + { + "epoch": 1.3246537396121885, + "grad_norm": 0.6603023409843445, + "learning_rate": 4.440417204332477e-06, + "loss": 0.5799, + "step": 4782 + }, + { + "epoch": 1.3249307479224377, + "grad_norm": 0.6802365183830261, + "learning_rate": 4.440187492097139e-06, + "loss": 0.6408, + "step": 4783 + }, + { + "epoch": 1.325207756232687, + "grad_norm": 0.685371458530426, + "learning_rate": 4.4399577386666645e-06, + "loss": 0.6284, + "step": 4784 + }, + { + "epoch": 1.3254847645429364, + "grad_norm": 0.752849280834198, + "learning_rate": 4.439727944045934e-06, + "loss": 0.6522, + "step": 4785 + }, + { + "epoch": 1.3257617728531856, + "grad_norm": 0.7088627815246582, + "learning_rate": 4.4394981082398254e-06, + "loss": 0.6221, + "step": 4786 + }, + { + "epoch": 1.3260387811634349, + "grad_norm": 0.6534282565116882, + "learning_rate": 4.439268231253219e-06, + "loss": 0.6515, + "step": 4787 + }, + { + "epoch": 1.3263157894736843, + "grad_norm": 0.7163136005401611, + "learning_rate": 4.439038313090996e-06, + "loss": 0.6562, + "step": 4788 + }, + { + "epoch": 1.3265927977839336, + "grad_norm": 0.7224626541137695, + "learning_rate": 4.438808353758038e-06, + "loss": 0.6291, + "step": 4789 + }, + { + "epoch": 1.3268698060941828, + "grad_norm": 0.7121604084968567, + "learning_rate": 4.438578353259227e-06, + "loss": 0.6915, + "step": 4790 + }, + { + "epoch": 1.327146814404432, + "grad_norm": 0.7014841437339783, + "learning_rate": 4.438348311599446e-06, + "loss": 0.6353, + "step": 4791 + }, + { + "epoch": 1.3274238227146815, + "grad_norm": 0.7054080367088318, + "learning_rate": 4.438118228783582e-06, + "loss": 0.6322, + "step": 4792 + }, + { + "epoch": 1.3277008310249307, + "grad_norm": 0.7404279708862305, + "learning_rate": 4.437888104816518e-06, + "loss": 0.6305, + "step": 4793 + }, + { + "epoch": 1.32797783933518, + "grad_norm": 0.7308083772659302, + "learning_rate": 4.4376579397031406e-06, + "loss": 0.6363, + "step": 4794 + }, + { + "epoch": 1.3282548476454292, + "grad_norm": 0.702226996421814, + "learning_rate": 4.437427733448337e-06, + "loss": 0.5752, + "step": 4795 + }, + { + "epoch": 1.3285318559556787, + "grad_norm": 0.6674501895904541, + "learning_rate": 4.437197486056994e-06, + "loss": 0.6565, + "step": 4796 + }, + { + "epoch": 1.328808864265928, + "grad_norm": 0.6679547429084778, + "learning_rate": 4.436967197534003e-06, + "loss": 0.6591, + "step": 4797 + }, + { + "epoch": 1.3290858725761772, + "grad_norm": 0.6631932854652405, + "learning_rate": 4.436736867884252e-06, + "loss": 0.6463, + "step": 4798 + }, + { + "epoch": 1.3293628808864266, + "grad_norm": 0.7053595185279846, + "learning_rate": 4.436506497112629e-06, + "loss": 0.6494, + "step": 4799 + }, + { + "epoch": 1.3296398891966759, + "grad_norm": 0.7127822637557983, + "learning_rate": 4.43627608522403e-06, + "loss": 0.6781, + "step": 4800 + }, + { + "epoch": 1.329916897506925, + "grad_norm": 0.667141318321228, + "learning_rate": 4.436045632223344e-06, + "loss": 0.617, + "step": 4801 + }, + { + "epoch": 1.3301939058171746, + "grad_norm": 0.6789475083351135, + "learning_rate": 4.4358151381154654e-06, + "loss": 0.6538, + "step": 4802 + }, + { + "epoch": 1.3304709141274238, + "grad_norm": 0.6923264861106873, + "learning_rate": 4.435584602905289e-06, + "loss": 0.6294, + "step": 4803 + }, + { + "epoch": 1.330747922437673, + "grad_norm": 0.7080187797546387, + "learning_rate": 4.435354026597707e-06, + "loss": 0.625, + "step": 4804 + }, + { + "epoch": 1.3310249307479225, + "grad_norm": 0.7222117185592651, + "learning_rate": 4.435123409197617e-06, + "loss": 0.6589, + "step": 4805 + }, + { + "epoch": 1.3313019390581717, + "grad_norm": 0.6662753820419312, + "learning_rate": 4.434892750709915e-06, + "loss": 0.5916, + "step": 4806 + }, + { + "epoch": 1.331578947368421, + "grad_norm": 0.730897068977356, + "learning_rate": 4.434662051139499e-06, + "loss": 0.6537, + "step": 4807 + }, + { + "epoch": 1.3318559556786704, + "grad_norm": 0.719174861907959, + "learning_rate": 4.434431310491267e-06, + "loss": 0.6498, + "step": 4808 + }, + { + "epoch": 1.3321329639889197, + "grad_norm": 0.6620305776596069, + "learning_rate": 4.434200528770118e-06, + "loss": 0.6247, + "step": 4809 + }, + { + "epoch": 1.332409972299169, + "grad_norm": 0.6843666434288025, + "learning_rate": 4.433969705980953e-06, + "loss": 0.6083, + "step": 4810 + }, + { + "epoch": 1.3326869806094184, + "grad_norm": 0.7134045362472534, + "learning_rate": 4.4337388421286724e-06, + "loss": 0.6956, + "step": 4811 + }, + { + "epoch": 1.3329639889196676, + "grad_norm": 0.6772432327270508, + "learning_rate": 4.433507937218177e-06, + "loss": 0.652, + "step": 4812 + }, + { + "epoch": 1.3332409972299168, + "grad_norm": 0.6777689456939697, + "learning_rate": 4.433276991254371e-06, + "loss": 0.6256, + "step": 4813 + }, + { + "epoch": 1.3335180055401663, + "grad_norm": 0.7466452717781067, + "learning_rate": 4.4330460042421584e-06, + "loss": 0.6231, + "step": 4814 + }, + { + "epoch": 1.3337950138504155, + "grad_norm": 0.6792182326316833, + "learning_rate": 4.4328149761864415e-06, + "loss": 0.6025, + "step": 4815 + }, + { + "epoch": 1.3340720221606648, + "grad_norm": 0.6836180090904236, + "learning_rate": 4.432583907092127e-06, + "loss": 0.6066, + "step": 4816 + }, + { + "epoch": 1.3343490304709142, + "grad_norm": 0.7040029168128967, + "learning_rate": 4.432352796964121e-06, + "loss": 0.662, + "step": 4817 + }, + { + "epoch": 1.3346260387811635, + "grad_norm": 0.6517881751060486, + "learning_rate": 4.432121645807331e-06, + "loss": 0.5577, + "step": 4818 + }, + { + "epoch": 1.3349030470914127, + "grad_norm": 0.6596935987472534, + "learning_rate": 4.431890453626664e-06, + "loss": 0.6183, + "step": 4819 + }, + { + "epoch": 1.3351800554016622, + "grad_norm": 0.6841338276863098, + "learning_rate": 4.4316592204270284e-06, + "loss": 0.6077, + "step": 4820 + }, + { + "epoch": 1.3354570637119114, + "grad_norm": 0.6922977566719055, + "learning_rate": 4.4314279462133345e-06, + "loss": 0.6343, + "step": 4821 + }, + { + "epoch": 1.3357340720221607, + "grad_norm": 0.7033997178077698, + "learning_rate": 4.431196630990494e-06, + "loss": 0.6277, + "step": 4822 + }, + { + "epoch": 1.33601108033241, + "grad_norm": 0.7048894762992859, + "learning_rate": 4.430965274763417e-06, + "loss": 0.6105, + "step": 4823 + }, + { + "epoch": 1.3362880886426594, + "grad_norm": 0.6983682513237, + "learning_rate": 4.430733877537016e-06, + "loss": 0.6152, + "step": 4824 + }, + { + "epoch": 1.3365650969529086, + "grad_norm": 0.7080015540122986, + "learning_rate": 4.430502439316204e-06, + "loss": 0.6336, + "step": 4825 + }, + { + "epoch": 1.3368421052631578, + "grad_norm": 0.6375042796134949, + "learning_rate": 4.4302709601058955e-06, + "loss": 0.6112, + "step": 4826 + }, + { + "epoch": 1.337119113573407, + "grad_norm": 0.6347265243530273, + "learning_rate": 4.4300394399110055e-06, + "loss": 0.6033, + "step": 4827 + }, + { + "epoch": 1.3373961218836565, + "grad_norm": 0.7151878476142883, + "learning_rate": 4.429807878736448e-06, + "loss": 0.6392, + "step": 4828 + }, + { + "epoch": 1.3376731301939058, + "grad_norm": 0.6534563899040222, + "learning_rate": 4.429576276587143e-06, + "loss": 0.645, + "step": 4829 + }, + { + "epoch": 1.337950138504155, + "grad_norm": 0.7110674977302551, + "learning_rate": 4.429344633468005e-06, + "loss": 0.6271, + "step": 4830 + }, + { + "epoch": 1.3382271468144045, + "grad_norm": 0.6874481439590454, + "learning_rate": 4.429112949383953e-06, + "loss": 0.6397, + "step": 4831 + }, + { + "epoch": 1.3385041551246537, + "grad_norm": 0.6635070443153381, + "learning_rate": 4.428881224339907e-06, + "loss": 0.6297, + "step": 4832 + }, + { + "epoch": 1.338781163434903, + "grad_norm": 0.6539145112037659, + "learning_rate": 4.4286494583407875e-06, + "loss": 0.585, + "step": 4833 + }, + { + "epoch": 1.3390581717451524, + "grad_norm": 0.7016680836677551, + "learning_rate": 4.428417651391514e-06, + "loss": 0.575, + "step": 4834 + }, + { + "epoch": 1.3393351800554016, + "grad_norm": 0.7028341889381409, + "learning_rate": 4.428185803497009e-06, + "loss": 0.5788, + "step": 4835 + }, + { + "epoch": 1.3396121883656509, + "grad_norm": 0.6498736143112183, + "learning_rate": 4.427953914662196e-06, + "loss": 0.6205, + "step": 4836 + }, + { + "epoch": 1.3398891966759003, + "grad_norm": 0.6495695114135742, + "learning_rate": 4.427721984891998e-06, + "loss": 0.5968, + "step": 4837 + }, + { + "epoch": 1.3401662049861496, + "grad_norm": 0.6960228681564331, + "learning_rate": 4.427490014191339e-06, + "loss": 0.6364, + "step": 4838 + }, + { + "epoch": 1.3404432132963988, + "grad_norm": 0.681462287902832, + "learning_rate": 4.4272580025651445e-06, + "loss": 0.6549, + "step": 4839 + }, + { + "epoch": 1.3407202216066483, + "grad_norm": 0.6907786130905151, + "learning_rate": 4.427025950018341e-06, + "loss": 0.6461, + "step": 4840 + }, + { + "epoch": 1.3409972299168975, + "grad_norm": 0.7233545780181885, + "learning_rate": 4.4267938565558555e-06, + "loss": 0.6735, + "step": 4841 + }, + { + "epoch": 1.3412742382271468, + "grad_norm": 0.6877539157867432, + "learning_rate": 4.426561722182616e-06, + "loss": 0.6173, + "step": 4842 + }, + { + "epoch": 1.3415512465373962, + "grad_norm": 0.7026717662811279, + "learning_rate": 4.426329546903552e-06, + "loss": 0.6466, + "step": 4843 + }, + { + "epoch": 1.3418282548476455, + "grad_norm": 0.7012684941291809, + "learning_rate": 4.426097330723591e-06, + "loss": 0.6601, + "step": 4844 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.6691379547119141, + "learning_rate": 4.425865073647666e-06, + "loss": 0.6278, + "step": 4845 + }, + { + "epoch": 1.3423822714681442, + "grad_norm": 0.7390052676200867, + "learning_rate": 4.4256327756807075e-06, + "loss": 0.664, + "step": 4846 + }, + { + "epoch": 1.3426592797783934, + "grad_norm": 0.6644761562347412, + "learning_rate": 4.425400436827647e-06, + "loss": 0.6076, + "step": 4847 + }, + { + "epoch": 1.3429362880886426, + "grad_norm": 0.705655038356781, + "learning_rate": 4.425168057093418e-06, + "loss": 0.6452, + "step": 4848 + }, + { + "epoch": 1.343213296398892, + "grad_norm": 0.7211757898330688, + "learning_rate": 4.424935636482955e-06, + "loss": 0.6577, + "step": 4849 + }, + { + "epoch": 1.3434903047091413, + "grad_norm": 0.668464720249176, + "learning_rate": 4.424703175001194e-06, + "loss": 0.5973, + "step": 4850 + }, + { + "epoch": 1.3437673130193906, + "grad_norm": 0.6970594525337219, + "learning_rate": 4.4244706726530676e-06, + "loss": 0.6887, + "step": 4851 + }, + { + "epoch": 1.34404432132964, + "grad_norm": 0.6972764134407043, + "learning_rate": 4.424238129443515e-06, + "loss": 0.6646, + "step": 4852 + }, + { + "epoch": 1.3443213296398893, + "grad_norm": 0.6718432307243347, + "learning_rate": 4.424005545377473e-06, + "loss": 0.5996, + "step": 4853 + }, + { + "epoch": 1.3445983379501385, + "grad_norm": 0.6845982074737549, + "learning_rate": 4.42377292045988e-06, + "loss": 0.6474, + "step": 4854 + }, + { + "epoch": 1.3448753462603877, + "grad_norm": 0.6861254572868347, + "learning_rate": 4.4235402546956756e-06, + "loss": 0.6814, + "step": 4855 + }, + { + "epoch": 1.3451523545706372, + "grad_norm": 0.6887468099594116, + "learning_rate": 4.423307548089798e-06, + "loss": 0.6538, + "step": 4856 + }, + { + "epoch": 1.3454293628808864, + "grad_norm": 0.6625999212265015, + "learning_rate": 4.423074800647191e-06, + "loss": 0.6455, + "step": 4857 + }, + { + "epoch": 1.3457063711911357, + "grad_norm": 0.7102819681167603, + "learning_rate": 4.422842012372794e-06, + "loss": 0.6903, + "step": 4858 + }, + { + "epoch": 1.345983379501385, + "grad_norm": 0.687042236328125, + "learning_rate": 4.422609183271551e-06, + "loss": 0.6013, + "step": 4859 + }, + { + "epoch": 1.3462603878116344, + "grad_norm": 0.7196336984634399, + "learning_rate": 4.422376313348405e-06, + "loss": 0.6243, + "step": 4860 + }, + { + "epoch": 1.3465373961218836, + "grad_norm": 0.6799708008766174, + "learning_rate": 4.422143402608302e-06, + "loss": 0.6289, + "step": 4861 + }, + { + "epoch": 1.3468144044321329, + "grad_norm": 0.7131449580192566, + "learning_rate": 4.421910451056184e-06, + "loss": 0.6736, + "step": 4862 + }, + { + "epoch": 1.3470914127423823, + "grad_norm": 0.6889610290527344, + "learning_rate": 4.421677458697e-06, + "loss": 0.6175, + "step": 4863 + }, + { + "epoch": 1.3473684210526315, + "grad_norm": 0.6625426411628723, + "learning_rate": 4.421444425535696e-06, + "loss": 0.597, + "step": 4864 + }, + { + "epoch": 1.3476454293628808, + "grad_norm": 0.7058026790618896, + "learning_rate": 4.42121135157722e-06, + "loss": 0.6549, + "step": 4865 + }, + { + "epoch": 1.3479224376731302, + "grad_norm": 0.6828805208206177, + "learning_rate": 4.420978236826521e-06, + "loss": 0.6195, + "step": 4866 + }, + { + "epoch": 1.3481994459833795, + "grad_norm": 0.7049974203109741, + "learning_rate": 4.420745081288549e-06, + "loss": 0.6275, + "step": 4867 + }, + { + "epoch": 1.3484764542936287, + "grad_norm": 0.7258075475692749, + "learning_rate": 4.4205118849682524e-06, + "loss": 0.6864, + "step": 4868 + }, + { + "epoch": 1.3487534626038782, + "grad_norm": 0.721996545791626, + "learning_rate": 4.420278647870585e-06, + "loss": 0.6157, + "step": 4869 + }, + { + "epoch": 1.3490304709141274, + "grad_norm": 0.7096882462501526, + "learning_rate": 4.420045370000497e-06, + "loss": 0.6405, + "step": 4870 + }, + { + "epoch": 1.3493074792243767, + "grad_norm": 0.71756511926651, + "learning_rate": 4.419812051362944e-06, + "loss": 0.5924, + "step": 4871 + }, + { + "epoch": 1.3495844875346261, + "grad_norm": 0.7162507176399231, + "learning_rate": 4.419578691962878e-06, + "loss": 0.6705, + "step": 4872 + }, + { + "epoch": 1.3498614958448754, + "grad_norm": 0.7038041949272156, + "learning_rate": 4.419345291805254e-06, + "loss": 0.6778, + "step": 4873 + }, + { + "epoch": 1.3501385041551246, + "grad_norm": 0.6972736716270447, + "learning_rate": 4.4191118508950286e-06, + "loss": 0.6447, + "step": 4874 + }, + { + "epoch": 1.350415512465374, + "grad_norm": 0.7007998824119568, + "learning_rate": 4.418878369237157e-06, + "loss": 0.5849, + "step": 4875 + }, + { + "epoch": 1.3506925207756233, + "grad_norm": 0.6676672697067261, + "learning_rate": 4.418644846836597e-06, + "loss": 0.6379, + "step": 4876 + }, + { + "epoch": 1.3509695290858725, + "grad_norm": 0.6992767453193665, + "learning_rate": 4.418411283698309e-06, + "loss": 0.7062, + "step": 4877 + }, + { + "epoch": 1.351246537396122, + "grad_norm": 0.722588062286377, + "learning_rate": 4.418177679827248e-06, + "loss": 0.6823, + "step": 4878 + }, + { + "epoch": 1.3515235457063712, + "grad_norm": 0.676636815071106, + "learning_rate": 4.417944035228377e-06, + "loss": 0.6406, + "step": 4879 + }, + { + "epoch": 1.3518005540166205, + "grad_norm": 0.7693878412246704, + "learning_rate": 4.4177103499066575e-06, + "loss": 0.6849, + "step": 4880 + }, + { + "epoch": 1.35207756232687, + "grad_norm": 0.8314934968948364, + "learning_rate": 4.41747662386705e-06, + "loss": 0.6413, + "step": 4881 + }, + { + "epoch": 1.3523545706371192, + "grad_norm": 0.6808057427406311, + "learning_rate": 4.417242857114515e-06, + "loss": 0.6657, + "step": 4882 + }, + { + "epoch": 1.3526315789473684, + "grad_norm": 0.6932368278503418, + "learning_rate": 4.41700904965402e-06, + "loss": 0.6405, + "step": 4883 + }, + { + "epoch": 1.3529085872576179, + "grad_norm": 0.683689296245575, + "learning_rate": 4.416775201490527e-06, + "loss": 0.5958, + "step": 4884 + }, + { + "epoch": 1.353185595567867, + "grad_norm": 0.7213323712348938, + "learning_rate": 4.4165413126290015e-06, + "loss": 0.6122, + "step": 4885 + }, + { + "epoch": 1.3534626038781163, + "grad_norm": 0.6757032871246338, + "learning_rate": 4.416307383074408e-06, + "loss": 0.5911, + "step": 4886 + }, + { + "epoch": 1.3537396121883656, + "grad_norm": 0.6949636340141296, + "learning_rate": 4.416073412831717e-06, + "loss": 0.6261, + "step": 4887 + }, + { + "epoch": 1.3540166204986148, + "grad_norm": 0.7025136947631836, + "learning_rate": 4.415839401905894e-06, + "loss": 0.6555, + "step": 4888 + }, + { + "epoch": 1.3542936288088643, + "grad_norm": 0.6896710991859436, + "learning_rate": 4.415605350301908e-06, + "loss": 0.6389, + "step": 4889 + }, + { + "epoch": 1.3545706371191135, + "grad_norm": 0.6909484267234802, + "learning_rate": 4.415371258024727e-06, + "loss": 0.6401, + "step": 4890 + }, + { + "epoch": 1.3548476454293628, + "grad_norm": 0.7090306282043457, + "learning_rate": 4.415137125079325e-06, + "loss": 0.6582, + "step": 4891 + }, + { + "epoch": 1.3551246537396122, + "grad_norm": 0.685055136680603, + "learning_rate": 4.41490295147067e-06, + "loss": 0.5861, + "step": 4892 + }, + { + "epoch": 1.3554016620498615, + "grad_norm": 0.7134220600128174, + "learning_rate": 4.414668737203736e-06, + "loss": 0.6362, + "step": 4893 + }, + { + "epoch": 1.3556786703601107, + "grad_norm": 0.6816451549530029, + "learning_rate": 4.414434482283494e-06, + "loss": 0.6156, + "step": 4894 + }, + { + "epoch": 1.3559556786703602, + "grad_norm": 0.7040759921073914, + "learning_rate": 4.41420018671492e-06, + "loss": 0.6265, + "step": 4895 + }, + { + "epoch": 1.3562326869806094, + "grad_norm": 0.722031831741333, + "learning_rate": 4.413965850502987e-06, + "loss": 0.6878, + "step": 4896 + }, + { + "epoch": 1.3565096952908586, + "grad_norm": 0.6951577663421631, + "learning_rate": 4.413731473652672e-06, + "loss": 0.5759, + "step": 4897 + }, + { + "epoch": 1.356786703601108, + "grad_norm": 0.6912435293197632, + "learning_rate": 4.41349705616895e-06, + "loss": 0.6396, + "step": 4898 + }, + { + "epoch": 1.3570637119113573, + "grad_norm": 0.7001351714134216, + "learning_rate": 4.413262598056799e-06, + "loss": 0.6035, + "step": 4899 + }, + { + "epoch": 1.3573407202216066, + "grad_norm": 0.7252200245857239, + "learning_rate": 4.413028099321198e-06, + "loss": 0.6015, + "step": 4900 + }, + { + "epoch": 1.357617728531856, + "grad_norm": 0.7011927962303162, + "learning_rate": 4.412793559967124e-06, + "loss": 0.6121, + "step": 4901 + }, + { + "epoch": 1.3578947368421053, + "grad_norm": 0.7054765224456787, + "learning_rate": 4.4125589799995585e-06, + "loss": 0.6563, + "step": 4902 + }, + { + "epoch": 1.3581717451523545, + "grad_norm": 0.6914619207382202, + "learning_rate": 4.412324359423482e-06, + "loss": 0.6285, + "step": 4903 + }, + { + "epoch": 1.358448753462604, + "grad_norm": 0.7494106888771057, + "learning_rate": 4.4120896982438745e-06, + "loss": 0.6738, + "step": 4904 + }, + { + "epoch": 1.3587257617728532, + "grad_norm": 0.6712677478790283, + "learning_rate": 4.411854996465721e-06, + "loss": 0.6406, + "step": 4905 + }, + { + "epoch": 1.3590027700831024, + "grad_norm": 0.7037259936332703, + "learning_rate": 4.411620254094003e-06, + "loss": 0.6761, + "step": 4906 + }, + { + "epoch": 1.359279778393352, + "grad_norm": 0.7152909636497498, + "learning_rate": 4.411385471133706e-06, + "loss": 0.6637, + "step": 4907 + }, + { + "epoch": 1.3595567867036011, + "grad_norm": 0.6827277541160583, + "learning_rate": 4.411150647589813e-06, + "loss": 0.6316, + "step": 4908 + }, + { + "epoch": 1.3598337950138504, + "grad_norm": 0.7183017134666443, + "learning_rate": 4.410915783467312e-06, + "loss": 0.6172, + "step": 4909 + }, + { + "epoch": 1.3601108033240998, + "grad_norm": 0.6997835636138916, + "learning_rate": 4.41068087877119e-06, + "loss": 0.6005, + "step": 4910 + }, + { + "epoch": 1.360387811634349, + "grad_norm": 0.6682953834533691, + "learning_rate": 4.410445933506432e-06, + "loss": 0.6269, + "step": 4911 + }, + { + "epoch": 1.3606648199445983, + "grad_norm": 0.7462157011032104, + "learning_rate": 4.4102109476780284e-06, + "loss": 0.6336, + "step": 4912 + }, + { + "epoch": 1.3609418282548478, + "grad_norm": 0.6756572723388672, + "learning_rate": 4.409975921290969e-06, + "loss": 0.6441, + "step": 4913 + }, + { + "epoch": 1.361218836565097, + "grad_norm": 0.7158071398735046, + "learning_rate": 4.409740854350243e-06, + "loss": 0.6703, + "step": 4914 + }, + { + "epoch": 1.3614958448753463, + "grad_norm": 0.7397752404212952, + "learning_rate": 4.409505746860841e-06, + "loss": 0.6339, + "step": 4915 + }, + { + "epoch": 1.3617728531855955, + "grad_norm": 0.7081032991409302, + "learning_rate": 4.409270598827756e-06, + "loss": 0.674, + "step": 4916 + }, + { + "epoch": 1.362049861495845, + "grad_norm": 0.7220388054847717, + "learning_rate": 4.409035410255981e-06, + "loss": 0.647, + "step": 4917 + }, + { + "epoch": 1.3623268698060942, + "grad_norm": 0.7010590434074402, + "learning_rate": 4.408800181150509e-06, + "loss": 0.6112, + "step": 4918 + }, + { + "epoch": 1.3626038781163434, + "grad_norm": 0.7032333016395569, + "learning_rate": 4.408564911516333e-06, + "loss": 0.6207, + "step": 4919 + }, + { + "epoch": 1.3628808864265927, + "grad_norm": 0.7119109630584717, + "learning_rate": 4.408329601358452e-06, + "loss": 0.6234, + "step": 4920 + }, + { + "epoch": 1.3631578947368421, + "grad_norm": 0.7243717908859253, + "learning_rate": 4.408094250681859e-06, + "loss": 0.649, + "step": 4921 + }, + { + "epoch": 1.3634349030470914, + "grad_norm": 0.6528921127319336, + "learning_rate": 4.407858859491552e-06, + "loss": 0.5989, + "step": 4922 + }, + { + "epoch": 1.3637119113573406, + "grad_norm": 0.693771243095398, + "learning_rate": 4.407623427792531e-06, + "loss": 0.6507, + "step": 4923 + }, + { + "epoch": 1.36398891966759, + "grad_norm": 0.6874661445617676, + "learning_rate": 4.407387955589792e-06, + "loss": 0.6474, + "step": 4924 + }, + { + "epoch": 1.3642659279778393, + "grad_norm": 0.7141904830932617, + "learning_rate": 4.407152442888336e-06, + "loss": 0.6593, + "step": 4925 + }, + { + "epoch": 1.3645429362880885, + "grad_norm": 0.7314815521240234, + "learning_rate": 4.406916889693163e-06, + "loss": 0.6326, + "step": 4926 + }, + { + "epoch": 1.364819944598338, + "grad_norm": 0.6970093846321106, + "learning_rate": 4.4066812960092754e-06, + "loss": 0.5605, + "step": 4927 + }, + { + "epoch": 1.3650969529085872, + "grad_norm": 0.7311210632324219, + "learning_rate": 4.406445661841674e-06, + "loss": 0.6857, + "step": 4928 + }, + { + "epoch": 1.3653739612188365, + "grad_norm": 0.6865612864494324, + "learning_rate": 4.406209987195363e-06, + "loss": 0.6316, + "step": 4929 + }, + { + "epoch": 1.365650969529086, + "grad_norm": 0.7145235538482666, + "learning_rate": 4.405974272075347e-06, + "loss": 0.6357, + "step": 4930 + }, + { + "epoch": 1.3659279778393352, + "grad_norm": 0.6490334272384644, + "learning_rate": 4.405738516486628e-06, + "loss": 0.625, + "step": 4931 + }, + { + "epoch": 1.3662049861495844, + "grad_norm": 0.6927633881568909, + "learning_rate": 4.405502720434215e-06, + "loss": 0.6267, + "step": 4932 + }, + { + "epoch": 1.3664819944598339, + "grad_norm": 0.6824126839637756, + "learning_rate": 4.405266883923112e-06, + "loss": 0.6357, + "step": 4933 + }, + { + "epoch": 1.366759002770083, + "grad_norm": 0.7101442813873291, + "learning_rate": 4.405031006958328e-06, + "loss": 0.6518, + "step": 4934 + }, + { + "epoch": 1.3670360110803323, + "grad_norm": 0.7349445223808289, + "learning_rate": 4.40479508954487e-06, + "loss": 0.6223, + "step": 4935 + }, + { + "epoch": 1.3673130193905818, + "grad_norm": 0.6581106781959534, + "learning_rate": 4.404559131687749e-06, + "loss": 0.6091, + "step": 4936 + }, + { + "epoch": 1.367590027700831, + "grad_norm": 0.692273736000061, + "learning_rate": 4.404323133391974e-06, + "loss": 0.6242, + "step": 4937 + }, + { + "epoch": 1.3678670360110803, + "grad_norm": 0.6735270619392395, + "learning_rate": 4.404087094662556e-06, + "loss": 0.5825, + "step": 4938 + }, + { + "epoch": 1.3681440443213297, + "grad_norm": 0.6877938508987427, + "learning_rate": 4.403851015504506e-06, + "loss": 0.6088, + "step": 4939 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.690207302570343, + "learning_rate": 4.4036148959228365e-06, + "loss": 0.6462, + "step": 4940 + }, + { + "epoch": 1.3686980609418282, + "grad_norm": 0.6809142827987671, + "learning_rate": 4.4033787359225626e-06, + "loss": 0.649, + "step": 4941 + }, + { + "epoch": 1.3689750692520777, + "grad_norm": 0.7366628646850586, + "learning_rate": 4.403142535508697e-06, + "loss": 0.6731, + "step": 4942 + }, + { + "epoch": 1.369252077562327, + "grad_norm": 0.7015873789787292, + "learning_rate": 4.402906294686254e-06, + "loss": 0.6381, + "step": 4943 + }, + { + "epoch": 1.3695290858725762, + "grad_norm": 0.6807177662849426, + "learning_rate": 4.402670013460252e-06, + "loss": 0.6652, + "step": 4944 + }, + { + "epoch": 1.3698060941828256, + "grad_norm": 0.6987523436546326, + "learning_rate": 4.402433691835707e-06, + "loss": 0.614, + "step": 4945 + }, + { + "epoch": 1.3700831024930749, + "grad_norm": 0.7272536754608154, + "learning_rate": 4.402197329817636e-06, + "loss": 0.5836, + "step": 4946 + }, + { + "epoch": 1.370360110803324, + "grad_norm": 0.7454519867897034, + "learning_rate": 4.401960927411057e-06, + "loss": 0.6685, + "step": 4947 + }, + { + "epoch": 1.3706371191135733, + "grad_norm": 0.6987824440002441, + "learning_rate": 4.401724484620993e-06, + "loss": 0.6276, + "step": 4948 + }, + { + "epoch": 1.3709141274238228, + "grad_norm": 0.7243875861167908, + "learning_rate": 4.40148800145246e-06, + "loss": 0.6288, + "step": 4949 + }, + { + "epoch": 1.371191135734072, + "grad_norm": 0.6841127872467041, + "learning_rate": 4.4012514779104805e-06, + "loss": 0.6113, + "step": 4950 + }, + { + "epoch": 1.3714681440443213, + "grad_norm": 0.663722574710846, + "learning_rate": 4.401014914000078e-06, + "loss": 0.6448, + "step": 4951 + }, + { + "epoch": 1.3717451523545705, + "grad_norm": 0.7879506349563599, + "learning_rate": 4.400778309726273e-06, + "loss": 0.6207, + "step": 4952 + }, + { + "epoch": 1.37202216066482, + "grad_norm": 0.7135480046272278, + "learning_rate": 4.400541665094091e-06, + "loss": 0.6194, + "step": 4953 + }, + { + "epoch": 1.3722991689750692, + "grad_norm": 0.6983644366264343, + "learning_rate": 4.4003049801085565e-06, + "loss": 0.6689, + "step": 4954 + }, + { + "epoch": 1.3725761772853184, + "grad_norm": 0.6594923734664917, + "learning_rate": 4.4000682547746945e-06, + "loss": 0.6202, + "step": 4955 + }, + { + "epoch": 1.372853185595568, + "grad_norm": 0.6694162487983704, + "learning_rate": 4.39983148909753e-06, + "loss": 0.6257, + "step": 4956 + }, + { + "epoch": 1.3731301939058171, + "grad_norm": 0.7033407688140869, + "learning_rate": 4.399594683082092e-06, + "loss": 0.626, + "step": 4957 + }, + { + "epoch": 1.3734072022160664, + "grad_norm": 0.6754797101020813, + "learning_rate": 4.399357836733408e-06, + "loss": 0.6152, + "step": 4958 + }, + { + "epoch": 1.3736842105263158, + "grad_norm": 0.6850981712341309, + "learning_rate": 4.399120950056507e-06, + "loss": 0.6074, + "step": 4959 + }, + { + "epoch": 1.373961218836565, + "grad_norm": 0.6460973024368286, + "learning_rate": 4.398884023056418e-06, + "loss": 0.6314, + "step": 4960 + }, + { + "epoch": 1.3742382271468143, + "grad_norm": 0.7158933281898499, + "learning_rate": 4.398647055738172e-06, + "loss": 0.6515, + "step": 4961 + }, + { + "epoch": 1.3745152354570638, + "grad_norm": 0.7266000509262085, + "learning_rate": 4.3984100481068e-06, + "loss": 0.639, + "step": 4962 + }, + { + "epoch": 1.374792243767313, + "grad_norm": 0.7677624225616455, + "learning_rate": 4.398173000167335e-06, + "loss": 0.6462, + "step": 4963 + }, + { + "epoch": 1.3750692520775623, + "grad_norm": 0.6976761817932129, + "learning_rate": 4.39793591192481e-06, + "loss": 0.6105, + "step": 4964 + }, + { + "epoch": 1.3753462603878117, + "grad_norm": 0.6852526664733887, + "learning_rate": 4.397698783384259e-06, + "loss": 0.6208, + "step": 4965 + }, + { + "epoch": 1.375623268698061, + "grad_norm": 0.7142199277877808, + "learning_rate": 4.397461614550716e-06, + "loss": 0.6435, + "step": 4966 + }, + { + "epoch": 1.3759002770083102, + "grad_norm": 0.6593324542045593, + "learning_rate": 4.397224405429218e-06, + "loss": 0.6297, + "step": 4967 + }, + { + "epoch": 1.3761772853185597, + "grad_norm": 0.6773996353149414, + "learning_rate": 4.3969871560248e-06, + "loss": 0.6538, + "step": 4968 + }, + { + "epoch": 1.376454293628809, + "grad_norm": 0.7089106440544128, + "learning_rate": 4.396749866342501e-06, + "loss": 0.6292, + "step": 4969 + }, + { + "epoch": 1.3767313019390581, + "grad_norm": 0.6807532906532288, + "learning_rate": 4.396512536387358e-06, + "loss": 0.632, + "step": 4970 + }, + { + "epoch": 1.3770083102493076, + "grad_norm": 0.6472201347351074, + "learning_rate": 4.396275166164411e-06, + "loss": 0.627, + "step": 4971 + }, + { + "epoch": 1.3772853185595568, + "grad_norm": 0.675743043422699, + "learning_rate": 4.3960377556787e-06, + "loss": 0.6061, + "step": 4972 + }, + { + "epoch": 1.377562326869806, + "grad_norm": 0.6863962411880493, + "learning_rate": 4.395800304935265e-06, + "loss": 0.6055, + "step": 4973 + }, + { + "epoch": 1.3778393351800555, + "grad_norm": 0.6849420666694641, + "learning_rate": 4.395562813939148e-06, + "loss": 0.6504, + "step": 4974 + }, + { + "epoch": 1.3781163434903048, + "grad_norm": 0.6862570643424988, + "learning_rate": 4.395325282695393e-06, + "loss": 0.5924, + "step": 4975 + }, + { + "epoch": 1.378393351800554, + "grad_norm": 0.7083377838134766, + "learning_rate": 4.395087711209041e-06, + "loss": 0.61, + "step": 4976 + }, + { + "epoch": 1.3786703601108035, + "grad_norm": 0.6750888228416443, + "learning_rate": 4.394850099485138e-06, + "loss": 0.6368, + "step": 4977 + }, + { + "epoch": 1.3789473684210527, + "grad_norm": 0.6690243482589722, + "learning_rate": 4.3946124475287275e-06, + "loss": 0.62, + "step": 4978 + }, + { + "epoch": 1.379224376731302, + "grad_norm": 0.6947623491287231, + "learning_rate": 4.3943747553448566e-06, + "loss": 0.6352, + "step": 4979 + }, + { + "epoch": 1.3795013850415512, + "grad_norm": 0.7098008394241333, + "learning_rate": 4.3941370229385725e-06, + "loss": 0.6206, + "step": 4980 + }, + { + "epoch": 1.3797783933518006, + "grad_norm": 0.7828469276428223, + "learning_rate": 4.393899250314923e-06, + "loss": 0.6318, + "step": 4981 + }, + { + "epoch": 1.3800554016620499, + "grad_norm": 0.6869782209396362, + "learning_rate": 4.3936614374789545e-06, + "loss": 0.651, + "step": 4982 + }, + { + "epoch": 1.3803324099722991, + "grad_norm": 0.6738064885139465, + "learning_rate": 4.393423584435718e-06, + "loss": 0.6511, + "step": 4983 + }, + { + "epoch": 1.3806094182825484, + "grad_norm": 0.7082775831222534, + "learning_rate": 4.3931856911902635e-06, + "loss": 0.6642, + "step": 4984 + }, + { + "epoch": 1.3808864265927978, + "grad_norm": 0.7249225378036499, + "learning_rate": 4.392947757747643e-06, + "loss": 0.618, + "step": 4985 + }, + { + "epoch": 1.381163434903047, + "grad_norm": 0.6885894536972046, + "learning_rate": 4.392709784112907e-06, + "loss": 0.6463, + "step": 4986 + }, + { + "epoch": 1.3814404432132963, + "grad_norm": 0.7157624959945679, + "learning_rate": 4.392471770291109e-06, + "loss": 0.6746, + "step": 4987 + }, + { + "epoch": 1.3817174515235457, + "grad_norm": 0.7281468510627747, + "learning_rate": 4.392233716287302e-06, + "loss": 0.6293, + "step": 4988 + }, + { + "epoch": 1.381994459833795, + "grad_norm": 0.7012090086936951, + "learning_rate": 4.3919956221065415e-06, + "loss": 0.653, + "step": 4989 + }, + { + "epoch": 1.3822714681440442, + "grad_norm": 0.6941935420036316, + "learning_rate": 4.391757487753882e-06, + "loss": 0.6138, + "step": 4990 + }, + { + "epoch": 1.3825484764542937, + "grad_norm": 0.703411340713501, + "learning_rate": 4.39151931323438e-06, + "loss": 0.6536, + "step": 4991 + }, + { + "epoch": 1.382825484764543, + "grad_norm": 0.6732279658317566, + "learning_rate": 4.391281098553093e-06, + "loss": 0.6215, + "step": 4992 + }, + { + "epoch": 1.3831024930747922, + "grad_norm": 0.7355413436889648, + "learning_rate": 4.3910428437150776e-06, + "loss": 0.6319, + "step": 4993 + }, + { + "epoch": 1.3833795013850416, + "grad_norm": 0.6964555978775024, + "learning_rate": 4.390804548725394e-06, + "loss": 0.6176, + "step": 4994 + }, + { + "epoch": 1.3836565096952909, + "grad_norm": 0.7024789452552795, + "learning_rate": 4.390566213589101e-06, + "loss": 0.6563, + "step": 4995 + }, + { + "epoch": 1.38393351800554, + "grad_norm": 0.6982277035713196, + "learning_rate": 4.3903278383112606e-06, + "loss": 0.6427, + "step": 4996 + }, + { + "epoch": 1.3842105263157896, + "grad_norm": 0.7301534414291382, + "learning_rate": 4.390089422896931e-06, + "loss": 0.6351, + "step": 4997 + }, + { + "epoch": 1.3844875346260388, + "grad_norm": 0.7176867127418518, + "learning_rate": 4.389850967351177e-06, + "loss": 0.6316, + "step": 4998 + }, + { + "epoch": 1.384764542936288, + "grad_norm": 0.729347825050354, + "learning_rate": 4.3896124716790615e-06, + "loss": 0.6598, + "step": 4999 + }, + { + "epoch": 1.3850415512465375, + "grad_norm": 0.6763535737991333, + "learning_rate": 4.3893739358856465e-06, + "loss": 0.658, + "step": 5000 + }, + { + "epoch": 1.3853185595567867, + "grad_norm": 0.6909424066543579, + "learning_rate": 4.389135359975998e-06, + "loss": 0.5952, + "step": 5001 + }, + { + "epoch": 1.385595567867036, + "grad_norm": 0.6764463186264038, + "learning_rate": 4.388896743955182e-06, + "loss": 0.6299, + "step": 5002 + }, + { + "epoch": 1.3858725761772854, + "grad_norm": 0.7030847668647766, + "learning_rate": 4.388658087828264e-06, + "loss": 0.6338, + "step": 5003 + }, + { + "epoch": 1.3861495844875347, + "grad_norm": 0.670483410358429, + "learning_rate": 4.388419391600312e-06, + "loss": 0.6809, + "step": 5004 + }, + { + "epoch": 1.386426592797784, + "grad_norm": 0.6741538643836975, + "learning_rate": 4.388180655276394e-06, + "loss": 0.5736, + "step": 5005 + }, + { + "epoch": 1.3867036011080334, + "grad_norm": 0.7062891125679016, + "learning_rate": 4.387941878861578e-06, + "loss": 0.6475, + "step": 5006 + }, + { + "epoch": 1.3869806094182826, + "grad_norm": 0.7449402213096619, + "learning_rate": 4.387703062360935e-06, + "loss": 0.6448, + "step": 5007 + }, + { + "epoch": 1.3872576177285318, + "grad_norm": 0.714369535446167, + "learning_rate": 4.387464205779536e-06, + "loss": 0.6513, + "step": 5008 + }, + { + "epoch": 1.387534626038781, + "grad_norm": 0.6852971911430359, + "learning_rate": 4.387225309122451e-06, + "loss": 0.61, + "step": 5009 + }, + { + "epoch": 1.3878116343490305, + "grad_norm": 0.7403523325920105, + "learning_rate": 4.3869863723947536e-06, + "loss": 0.6809, + "step": 5010 + }, + { + "epoch": 1.3880886426592798, + "grad_norm": 0.6950234770774841, + "learning_rate": 4.386747395601517e-06, + "loss": 0.6178, + "step": 5011 + }, + { + "epoch": 1.388365650969529, + "grad_norm": 0.7244662642478943, + "learning_rate": 4.386508378747815e-06, + "loss": 0.687, + "step": 5012 + }, + { + "epoch": 1.3886426592797783, + "grad_norm": 0.7085997462272644, + "learning_rate": 4.386269321838722e-06, + "loss": 0.5976, + "step": 5013 + }, + { + "epoch": 1.3889196675900277, + "grad_norm": 0.7164833545684814, + "learning_rate": 4.386030224879314e-06, + "loss": 0.6344, + "step": 5014 + }, + { + "epoch": 1.389196675900277, + "grad_norm": 0.6968429088592529, + "learning_rate": 4.385791087874668e-06, + "loss": 0.6052, + "step": 5015 + }, + { + "epoch": 1.3894736842105262, + "grad_norm": 0.6692972779273987, + "learning_rate": 4.385551910829861e-06, + "loss": 0.5957, + "step": 5016 + }, + { + "epoch": 1.3897506925207757, + "grad_norm": 0.6999967694282532, + "learning_rate": 4.3853126937499735e-06, + "loss": 0.6415, + "step": 5017 + }, + { + "epoch": 1.390027700831025, + "grad_norm": 0.6935191750526428, + "learning_rate": 4.385073436640081e-06, + "loss": 0.6374, + "step": 5018 + }, + { + "epoch": 1.3903047091412741, + "grad_norm": 0.7233996391296387, + "learning_rate": 4.384834139505267e-06, + "loss": 0.6473, + "step": 5019 + }, + { + "epoch": 1.3905817174515236, + "grad_norm": 0.7004993557929993, + "learning_rate": 4.3845948023506106e-06, + "loss": 0.6411, + "step": 5020 + }, + { + "epoch": 1.3908587257617728, + "grad_norm": 0.7594379186630249, + "learning_rate": 4.384355425181193e-06, + "loss": 0.6059, + "step": 5021 + }, + { + "epoch": 1.391135734072022, + "grad_norm": 0.7419117093086243, + "learning_rate": 4.384116008002099e-06, + "loss": 0.671, + "step": 5022 + }, + { + "epoch": 1.3914127423822715, + "grad_norm": 0.6777958869934082, + "learning_rate": 4.3838765508184095e-06, + "loss": 0.6296, + "step": 5023 + }, + { + "epoch": 1.3916897506925208, + "grad_norm": 0.6925370693206787, + "learning_rate": 4.38363705363521e-06, + "loss": 0.6727, + "step": 5024 + }, + { + "epoch": 1.39196675900277, + "grad_norm": 0.6831212639808655, + "learning_rate": 4.383397516457586e-06, + "loss": 0.648, + "step": 5025 + }, + { + "epoch": 1.3922437673130195, + "grad_norm": 0.6936487555503845, + "learning_rate": 4.383157939290624e-06, + "loss": 0.6504, + "step": 5026 + }, + { + "epoch": 1.3925207756232687, + "grad_norm": 0.6868559718132019, + "learning_rate": 4.382918322139407e-06, + "loss": 0.5858, + "step": 5027 + }, + { + "epoch": 1.392797783933518, + "grad_norm": 0.7001911401748657, + "learning_rate": 4.382678665009028e-06, + "loss": 0.6252, + "step": 5028 + }, + { + "epoch": 1.3930747922437674, + "grad_norm": 0.6877859830856323, + "learning_rate": 4.382438967904572e-06, + "loss": 0.6169, + "step": 5029 + }, + { + "epoch": 1.3933518005540166, + "grad_norm": 0.7069692015647888, + "learning_rate": 4.38219923083113e-06, + "loss": 0.6361, + "step": 5030 + }, + { + "epoch": 1.3936288088642659, + "grad_norm": 0.7657400369644165, + "learning_rate": 4.381959453793792e-06, + "loss": 0.6686, + "step": 5031 + }, + { + "epoch": 1.3939058171745153, + "grad_norm": 0.7232145667076111, + "learning_rate": 4.381719636797648e-06, + "loss": 0.6654, + "step": 5032 + }, + { + "epoch": 1.3941828254847646, + "grad_norm": 0.676956057548523, + "learning_rate": 4.381479779847791e-06, + "loss": 0.6518, + "step": 5033 + }, + { + "epoch": 1.3944598337950138, + "grad_norm": 0.6840213537216187, + "learning_rate": 4.381239882949314e-06, + "loss": 0.596, + "step": 5034 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.6917917728424072, + "learning_rate": 4.380999946107309e-06, + "loss": 0.6598, + "step": 5035 + }, + { + "epoch": 1.3950138504155125, + "grad_norm": 0.6727468967437744, + "learning_rate": 4.380759969326873e-06, + "loss": 0.666, + "step": 5036 + }, + { + "epoch": 1.3952908587257618, + "grad_norm": 0.6889804601669312, + "learning_rate": 4.3805199526131e-06, + "loss": 0.687, + "step": 5037 + }, + { + "epoch": 1.3955678670360112, + "grad_norm": 0.6997852325439453, + "learning_rate": 4.380279895971085e-06, + "loss": 0.6766, + "step": 5038 + }, + { + "epoch": 1.3958448753462604, + "grad_norm": 0.760193407535553, + "learning_rate": 4.380039799405926e-06, + "loss": 0.6834, + "step": 5039 + }, + { + "epoch": 1.3961218836565097, + "grad_norm": 0.7114586234092712, + "learning_rate": 4.379799662922721e-06, + "loss": 0.6307, + "step": 5040 + }, + { + "epoch": 1.396398891966759, + "grad_norm": 0.6752461194992065, + "learning_rate": 4.379559486526569e-06, + "loss": 0.6107, + "step": 5041 + }, + { + "epoch": 1.3966759002770084, + "grad_norm": 0.7001312375068665, + "learning_rate": 4.379319270222569e-06, + "loss": 0.6305, + "step": 5042 + }, + { + "epoch": 1.3969529085872576, + "grad_norm": 0.7060052156448364, + "learning_rate": 4.379079014015822e-06, + "loss": 0.626, + "step": 5043 + }, + { + "epoch": 1.3972299168975069, + "grad_norm": 0.684003472328186, + "learning_rate": 4.378838717911429e-06, + "loss": 0.6246, + "step": 5044 + }, + { + "epoch": 1.397506925207756, + "grad_norm": 0.6880972385406494, + "learning_rate": 4.378598381914491e-06, + "loss": 0.628, + "step": 5045 + }, + { + "epoch": 1.3977839335180056, + "grad_norm": 0.7145376801490784, + "learning_rate": 4.3783580060301135e-06, + "loss": 0.6342, + "step": 5046 + }, + { + "epoch": 1.3980609418282548, + "grad_norm": 0.7057557106018066, + "learning_rate": 4.378117590263398e-06, + "loss": 0.6696, + "step": 5047 + }, + { + "epoch": 1.398337950138504, + "grad_norm": 0.6906470656394958, + "learning_rate": 4.37787713461945e-06, + "loss": 0.5806, + "step": 5048 + }, + { + "epoch": 1.3986149584487535, + "grad_norm": 0.669457733631134, + "learning_rate": 4.377636639103375e-06, + "loss": 0.5847, + "step": 5049 + }, + { + "epoch": 1.3988919667590027, + "grad_norm": 0.6445726752281189, + "learning_rate": 4.3773961037202784e-06, + "loss": 0.5876, + "step": 5050 + }, + { + "epoch": 1.399168975069252, + "grad_norm": 0.7106205224990845, + "learning_rate": 4.377155528475268e-06, + "loss": 0.6559, + "step": 5051 + }, + { + "epoch": 1.3994459833795014, + "grad_norm": 0.6805086135864258, + "learning_rate": 4.376914913373453e-06, + "loss": 0.6066, + "step": 5052 + }, + { + "epoch": 1.3997229916897507, + "grad_norm": 0.8467181921005249, + "learning_rate": 4.3766742584199415e-06, + "loss": 0.623, + "step": 5053 + }, + { + "epoch": 1.4, + "grad_norm": 0.687349796295166, + "learning_rate": 4.376433563619843e-06, + "loss": 0.6407, + "step": 5054 + }, + { + "epoch": 1.4002770083102494, + "grad_norm": 0.7355000376701355, + "learning_rate": 4.376192828978266e-06, + "loss": 0.6078, + "step": 5055 + }, + { + "epoch": 1.4005540166204986, + "grad_norm": 0.6767886877059937, + "learning_rate": 4.375952054500326e-06, + "loss": 0.6645, + "step": 5056 + }, + { + "epoch": 1.4008310249307478, + "grad_norm": 0.7190877199172974, + "learning_rate": 4.3757112401911326e-06, + "loss": 0.6333, + "step": 5057 + }, + { + "epoch": 1.4011080332409973, + "grad_norm": 0.702785849571228, + "learning_rate": 4.3754703860558e-06, + "loss": 0.6458, + "step": 5058 + }, + { + "epoch": 1.4013850415512465, + "grad_norm": 0.6814035773277283, + "learning_rate": 4.375229492099441e-06, + "loss": 0.6532, + "step": 5059 + }, + { + "epoch": 1.4016620498614958, + "grad_norm": 0.6846272349357605, + "learning_rate": 4.374988558327171e-06, + "loss": 0.6325, + "step": 5060 + }, + { + "epoch": 1.4019390581717452, + "grad_norm": 0.6913299560546875, + "learning_rate": 4.374747584744106e-06, + "loss": 0.613, + "step": 5061 + }, + { + "epoch": 1.4022160664819945, + "grad_norm": 0.6918284893035889, + "learning_rate": 4.374506571355362e-06, + "loss": 0.6599, + "step": 5062 + }, + { + "epoch": 1.4024930747922437, + "grad_norm": 0.7418447136878967, + "learning_rate": 4.374265518166057e-06, + "loss": 0.6344, + "step": 5063 + }, + { + "epoch": 1.4027700831024932, + "grad_norm": 0.6935515403747559, + "learning_rate": 4.374024425181308e-06, + "loss": 0.6421, + "step": 5064 + }, + { + "epoch": 1.4030470914127424, + "grad_norm": 0.6994535326957703, + "learning_rate": 4.3737832924062365e-06, + "loss": 0.655, + "step": 5065 + }, + { + "epoch": 1.4033240997229917, + "grad_norm": 0.6631689667701721, + "learning_rate": 4.373542119845959e-06, + "loss": 0.5511, + "step": 5066 + }, + { + "epoch": 1.4036011080332411, + "grad_norm": 0.6856090426445007, + "learning_rate": 4.3733009075055975e-06, + "loss": 0.6661, + "step": 5067 + }, + { + "epoch": 1.4038781163434904, + "grad_norm": 0.693882405757904, + "learning_rate": 4.373059655390274e-06, + "loss": 0.625, + "step": 5068 + }, + { + "epoch": 1.4041551246537396, + "grad_norm": 0.6695311069488525, + "learning_rate": 4.372818363505112e-06, + "loss": 0.5803, + "step": 5069 + }, + { + "epoch": 1.404432132963989, + "grad_norm": 0.6360461115837097, + "learning_rate": 4.372577031855232e-06, + "loss": 0.6051, + "step": 5070 + }, + { + "epoch": 1.4047091412742383, + "grad_norm": 0.6905994415283203, + "learning_rate": 4.37233566044576e-06, + "loss": 0.6168, + "step": 5071 + }, + { + "epoch": 1.4049861495844875, + "grad_norm": 0.7065186500549316, + "learning_rate": 4.37209424928182e-06, + "loss": 0.6804, + "step": 5072 + }, + { + "epoch": 1.4052631578947368, + "grad_norm": 0.7095279097557068, + "learning_rate": 4.37185279836854e-06, + "loss": 0.6599, + "step": 5073 + }, + { + "epoch": 1.4055401662049862, + "grad_norm": 0.6844799518585205, + "learning_rate": 4.371611307711044e-06, + "loss": 0.642, + "step": 5074 + }, + { + "epoch": 1.4058171745152355, + "grad_norm": 0.6771243810653687, + "learning_rate": 4.37136977731446e-06, + "loss": 0.6468, + "step": 5075 + }, + { + "epoch": 1.4060941828254847, + "grad_norm": 0.6787841320037842, + "learning_rate": 4.371128207183917e-06, + "loss": 0.6116, + "step": 5076 + }, + { + "epoch": 1.406371191135734, + "grad_norm": 0.6891928315162659, + "learning_rate": 4.3708865973245436e-06, + "loss": 0.6625, + "step": 5077 + }, + { + "epoch": 1.4066481994459834, + "grad_norm": 0.6616782546043396, + "learning_rate": 4.3706449477414704e-06, + "loss": 0.5704, + "step": 5078 + }, + { + "epoch": 1.4069252077562326, + "grad_norm": 0.686374306678772, + "learning_rate": 4.370403258439827e-06, + "loss": 0.6485, + "step": 5079 + }, + { + "epoch": 1.4072022160664819, + "grad_norm": 0.7003771662712097, + "learning_rate": 4.370161529424747e-06, + "loss": 0.6582, + "step": 5080 + }, + { + "epoch": 1.4074792243767313, + "grad_norm": 0.6979756355285645, + "learning_rate": 4.369919760701361e-06, + "loss": 0.5902, + "step": 5081 + }, + { + "epoch": 1.4077562326869806, + "grad_norm": 0.7365483641624451, + "learning_rate": 4.369677952274803e-06, + "loss": 0.637, + "step": 5082 + }, + { + "epoch": 1.4080332409972298, + "grad_norm": 0.650162935256958, + "learning_rate": 4.369436104150208e-06, + "loss": 0.6277, + "step": 5083 + }, + { + "epoch": 1.4083102493074793, + "grad_norm": 0.7429336905479431, + "learning_rate": 4.369194216332712e-06, + "loss": 0.6687, + "step": 5084 + }, + { + "epoch": 1.4085872576177285, + "grad_norm": 0.7430762052536011, + "learning_rate": 4.368952288827447e-06, + "loss": 0.6399, + "step": 5085 + }, + { + "epoch": 1.4088642659279778, + "grad_norm": 0.680882453918457, + "learning_rate": 4.368710321639553e-06, + "loss": 0.6187, + "step": 5086 + }, + { + "epoch": 1.4091412742382272, + "grad_norm": 0.714248538017273, + "learning_rate": 4.368468314774167e-06, + "loss": 0.6387, + "step": 5087 + }, + { + "epoch": 1.4094182825484765, + "grad_norm": 0.6917892694473267, + "learning_rate": 4.368226268236427e-06, + "loss": 0.6168, + "step": 5088 + }, + { + "epoch": 1.4096952908587257, + "grad_norm": 0.6685717105865479, + "learning_rate": 4.367984182031472e-06, + "loss": 0.6311, + "step": 5089 + }, + { + "epoch": 1.4099722991689752, + "grad_norm": 0.7154122591018677, + "learning_rate": 4.367742056164443e-06, + "loss": 0.6407, + "step": 5090 + }, + { + "epoch": 1.4102493074792244, + "grad_norm": 0.73028165102005, + "learning_rate": 4.36749989064048e-06, + "loss": 0.6499, + "step": 5091 + }, + { + "epoch": 1.4105263157894736, + "grad_norm": 0.7013839483261108, + "learning_rate": 4.367257685464726e-06, + "loss": 0.658, + "step": 5092 + }, + { + "epoch": 1.410803324099723, + "grad_norm": 0.6611378788948059, + "learning_rate": 4.367015440642321e-06, + "loss": 0.6839, + "step": 5093 + }, + { + "epoch": 1.4110803324099723, + "grad_norm": 0.6703953146934509, + "learning_rate": 4.366773156178413e-06, + "loss": 0.5726, + "step": 5094 + }, + { + "epoch": 1.4113573407202216, + "grad_norm": 0.7304432392120361, + "learning_rate": 4.366530832078142e-06, + "loss": 0.6775, + "step": 5095 + }, + { + "epoch": 1.411634349030471, + "grad_norm": 0.6808119416236877, + "learning_rate": 4.366288468346655e-06, + "loss": 0.6244, + "step": 5096 + }, + { + "epoch": 1.4119113573407203, + "grad_norm": 0.7225993871688843, + "learning_rate": 4.366046064989099e-06, + "loss": 0.6391, + "step": 5097 + }, + { + "epoch": 1.4121883656509695, + "grad_norm": 0.7567205429077148, + "learning_rate": 4.365803622010618e-06, + "loss": 0.602, + "step": 5098 + }, + { + "epoch": 1.412465373961219, + "grad_norm": 0.7190272212028503, + "learning_rate": 4.365561139416362e-06, + "loss": 0.6829, + "step": 5099 + }, + { + "epoch": 1.4127423822714682, + "grad_norm": 0.7132216691970825, + "learning_rate": 4.365318617211479e-06, + "loss": 0.6793, + "step": 5100 + }, + { + "epoch": 1.4130193905817174, + "grad_norm": 0.7090128660202026, + "learning_rate": 4.365076055401118e-06, + "loss": 0.6117, + "step": 5101 + }, + { + "epoch": 1.4132963988919667, + "grad_norm": 0.658957302570343, + "learning_rate": 4.36483345399043e-06, + "loss": 0.6093, + "step": 5102 + }, + { + "epoch": 1.4135734072022161, + "grad_norm": 0.7175829410552979, + "learning_rate": 4.364590812984565e-06, + "loss": 0.6222, + "step": 5103 + }, + { + "epoch": 1.4138504155124654, + "grad_norm": 0.7089779376983643, + "learning_rate": 4.364348132388676e-06, + "loss": 0.6569, + "step": 5104 + }, + { + "epoch": 1.4141274238227146, + "grad_norm": 0.6942974925041199, + "learning_rate": 4.364105412207914e-06, + "loss": 0.6322, + "step": 5105 + }, + { + "epoch": 1.4144044321329639, + "grad_norm": 0.6663142442703247, + "learning_rate": 4.3638626524474345e-06, + "loss": 0.6599, + "step": 5106 + }, + { + "epoch": 1.4146814404432133, + "grad_norm": 0.6893264651298523, + "learning_rate": 4.363619853112391e-06, + "loss": 0.5942, + "step": 5107 + }, + { + "epoch": 1.4149584487534625, + "grad_norm": 0.7206471562385559, + "learning_rate": 4.363377014207939e-06, + "loss": 0.6543, + "step": 5108 + }, + { + "epoch": 1.4152354570637118, + "grad_norm": 0.7506195306777954, + "learning_rate": 4.363134135739234e-06, + "loss": 0.6474, + "step": 5109 + }, + { + "epoch": 1.4155124653739612, + "grad_norm": 0.6773307919502258, + "learning_rate": 4.362891217711434e-06, + "loss": 0.586, + "step": 5110 + }, + { + "epoch": 1.4157894736842105, + "grad_norm": 0.6862797141075134, + "learning_rate": 4.362648260129696e-06, + "loss": 0.633, + "step": 5111 + }, + { + "epoch": 1.4160664819944597, + "grad_norm": 0.6824764609336853, + "learning_rate": 4.3624052629991785e-06, + "loss": 0.6649, + "step": 5112 + }, + { + "epoch": 1.4163434903047092, + "grad_norm": 0.6997144222259521, + "learning_rate": 4.362162226325042e-06, + "loss": 0.6156, + "step": 5113 + }, + { + "epoch": 1.4166204986149584, + "grad_norm": 0.7159362435340881, + "learning_rate": 4.361919150112444e-06, + "loss": 0.6039, + "step": 5114 + }, + { + "epoch": 1.4168975069252077, + "grad_norm": 0.6508421897888184, + "learning_rate": 4.3616760343665495e-06, + "loss": 0.5546, + "step": 5115 + }, + { + "epoch": 1.4171745152354571, + "grad_norm": 0.7000342011451721, + "learning_rate": 4.361432879092518e-06, + "loss": 0.6295, + "step": 5116 + }, + { + "epoch": 1.4174515235457064, + "grad_norm": 0.6941149830818176, + "learning_rate": 4.361189684295513e-06, + "loss": 0.6089, + "step": 5117 + }, + { + "epoch": 1.4177285318559556, + "grad_norm": 0.7070619463920593, + "learning_rate": 4.360946449980698e-06, + "loss": 0.6032, + "step": 5118 + }, + { + "epoch": 1.418005540166205, + "grad_norm": 0.6684051752090454, + "learning_rate": 4.360703176153237e-06, + "loss": 0.6237, + "step": 5119 + }, + { + "epoch": 1.4182825484764543, + "grad_norm": 0.6901852488517761, + "learning_rate": 4.360459862818296e-06, + "loss": 0.6612, + "step": 5120 + }, + { + "epoch": 1.4185595567867035, + "grad_norm": 0.7107248306274414, + "learning_rate": 4.360216509981042e-06, + "loss": 0.6356, + "step": 5121 + }, + { + "epoch": 1.418836565096953, + "grad_norm": 0.7306237816810608, + "learning_rate": 4.35997311764664e-06, + "loss": 0.6512, + "step": 5122 + }, + { + "epoch": 1.4191135734072022, + "grad_norm": 0.7205201387405396, + "learning_rate": 4.359729685820259e-06, + "loss": 0.6506, + "step": 5123 + }, + { + "epoch": 1.4193905817174515, + "grad_norm": 0.7062286138534546, + "learning_rate": 4.359486214507068e-06, + "loss": 0.6119, + "step": 5124 + }, + { + "epoch": 1.419667590027701, + "grad_norm": 0.6981781125068665, + "learning_rate": 4.359242703712235e-06, + "loss": 0.5616, + "step": 5125 + }, + { + "epoch": 1.4199445983379502, + "grad_norm": 0.7032713890075684, + "learning_rate": 4.358999153440932e-06, + "loss": 0.5854, + "step": 5126 + }, + { + "epoch": 1.4202216066481994, + "grad_norm": 0.6792232990264893, + "learning_rate": 4.358755563698331e-06, + "loss": 0.6368, + "step": 5127 + }, + { + "epoch": 1.4204986149584489, + "grad_norm": 0.6883552670478821, + "learning_rate": 4.358511934489601e-06, + "loss": 0.5862, + "step": 5128 + }, + { + "epoch": 1.420775623268698, + "grad_norm": 0.7066919803619385, + "learning_rate": 4.3582682658199175e-06, + "loss": 0.6181, + "step": 5129 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.7226448059082031, + "learning_rate": 4.358024557694452e-06, + "loss": 0.6098, + "step": 5130 + }, + { + "epoch": 1.4213296398891968, + "grad_norm": 0.7035112380981445, + "learning_rate": 4.357780810118381e-06, + "loss": 0.6122, + "step": 5131 + }, + { + "epoch": 1.421606648199446, + "grad_norm": 0.6860722899436951, + "learning_rate": 4.35753702309688e-06, + "loss": 0.6283, + "step": 5132 + }, + { + "epoch": 1.4218836565096953, + "grad_norm": 0.6797297596931458, + "learning_rate": 4.357293196635122e-06, + "loss": 0.5826, + "step": 5133 + }, + { + "epoch": 1.4221606648199445, + "grad_norm": 0.6257482767105103, + "learning_rate": 4.357049330738288e-06, + "loss": 0.6165, + "step": 5134 + }, + { + "epoch": 1.422437673130194, + "grad_norm": 0.6722486615180969, + "learning_rate": 4.356805425411555e-06, + "loss": 0.6323, + "step": 5135 + }, + { + "epoch": 1.4227146814404432, + "grad_norm": 0.6958386301994324, + "learning_rate": 4.3565614806600995e-06, + "loss": 0.6151, + "step": 5136 + }, + { + "epoch": 1.4229916897506925, + "grad_norm": 0.7121996879577637, + "learning_rate": 4.356317496489104e-06, + "loss": 0.6806, + "step": 5137 + }, + { + "epoch": 1.4232686980609417, + "grad_norm": 0.7728790640830994, + "learning_rate": 4.356073472903747e-06, + "loss": 0.6119, + "step": 5138 + }, + { + "epoch": 1.4235457063711912, + "grad_norm": 0.6925826072692871, + "learning_rate": 4.3558294099092095e-06, + "loss": 0.6252, + "step": 5139 + }, + { + "epoch": 1.4238227146814404, + "grad_norm": 0.717306911945343, + "learning_rate": 4.355585307510675e-06, + "loss": 0.6625, + "step": 5140 + }, + { + "epoch": 1.4240997229916896, + "grad_norm": 0.6816144585609436, + "learning_rate": 4.355341165713326e-06, + "loss": 0.6055, + "step": 5141 + }, + { + "epoch": 1.424376731301939, + "grad_norm": 0.6522184610366821, + "learning_rate": 4.355096984522346e-06, + "loss": 0.5748, + "step": 5142 + }, + { + "epoch": 1.4246537396121883, + "grad_norm": 0.7107172012329102, + "learning_rate": 4.354852763942919e-06, + "loss": 0.6665, + "step": 5143 + }, + { + "epoch": 1.4249307479224376, + "grad_norm": 0.7126984596252441, + "learning_rate": 4.354608503980232e-06, + "loss": 0.6294, + "step": 5144 + }, + { + "epoch": 1.425207756232687, + "grad_norm": 0.7015909552574158, + "learning_rate": 4.3543642046394696e-06, + "loss": 0.6172, + "step": 5145 + }, + { + "epoch": 1.4254847645429363, + "grad_norm": 0.6696215271949768, + "learning_rate": 4.35411986592582e-06, + "loss": 0.635, + "step": 5146 + }, + { + "epoch": 1.4257617728531855, + "grad_norm": 0.6961736679077148, + "learning_rate": 4.35387548784447e-06, + "loss": 0.6564, + "step": 5147 + }, + { + "epoch": 1.426038781163435, + "grad_norm": 0.6732077598571777, + "learning_rate": 4.3536310704006106e-06, + "loss": 0.5947, + "step": 5148 + }, + { + "epoch": 1.4263157894736842, + "grad_norm": 0.6792954802513123, + "learning_rate": 4.3533866135994284e-06, + "loss": 0.594, + "step": 5149 + }, + { + "epoch": 1.4265927977839334, + "grad_norm": 0.747936487197876, + "learning_rate": 4.353142117446115e-06, + "loss": 0.6103, + "step": 5150 + }, + { + "epoch": 1.426869806094183, + "grad_norm": 0.6711264252662659, + "learning_rate": 4.352897581945864e-06, + "loss": 0.6538, + "step": 5151 + }, + { + "epoch": 1.4271468144044321, + "grad_norm": 0.721182107925415, + "learning_rate": 4.352653007103864e-06, + "loss": 0.6649, + "step": 5152 + }, + { + "epoch": 1.4274238227146814, + "grad_norm": 0.6888142824172974, + "learning_rate": 4.352408392925311e-06, + "loss": 0.6392, + "step": 5153 + }, + { + "epoch": 1.4277008310249308, + "grad_norm": 0.7058656215667725, + "learning_rate": 4.352163739415396e-06, + "loss": 0.6148, + "step": 5154 + }, + { + "epoch": 1.42797783933518, + "grad_norm": 0.7143036127090454, + "learning_rate": 4.351919046579315e-06, + "loss": 0.6343, + "step": 5155 + }, + { + "epoch": 1.4282548476454293, + "grad_norm": 0.7155359983444214, + "learning_rate": 4.351674314422264e-06, + "loss": 0.6475, + "step": 5156 + }, + { + "epoch": 1.4285318559556788, + "grad_norm": 0.6871824860572815, + "learning_rate": 4.351429542949438e-06, + "loss": 0.6427, + "step": 5157 + }, + { + "epoch": 1.428808864265928, + "grad_norm": 0.6846596598625183, + "learning_rate": 4.351184732166035e-06, + "loss": 0.5837, + "step": 5158 + }, + { + "epoch": 1.4290858725761773, + "grad_norm": 0.7276217937469482, + "learning_rate": 4.350939882077252e-06, + "loss": 0.5906, + "step": 5159 + }, + { + "epoch": 1.4293628808864267, + "grad_norm": 0.7454106211662292, + "learning_rate": 4.350694992688289e-06, + "loss": 0.6098, + "step": 5160 + }, + { + "epoch": 1.429639889196676, + "grad_norm": 0.6848341226577759, + "learning_rate": 4.3504500640043455e-06, + "loss": 0.6442, + "step": 5161 + }, + { + "epoch": 1.4299168975069252, + "grad_norm": 0.6798859238624573, + "learning_rate": 4.350205096030622e-06, + "loss": 0.619, + "step": 5162 + }, + { + "epoch": 1.4301939058171746, + "grad_norm": 0.6953790187835693, + "learning_rate": 4.349960088772318e-06, + "loss": 0.5988, + "step": 5163 + }, + { + "epoch": 1.4304709141274239, + "grad_norm": 0.7027685642242432, + "learning_rate": 4.349715042234638e-06, + "loss": 0.6127, + "step": 5164 + }, + { + "epoch": 1.4307479224376731, + "grad_norm": 0.7262356877326965, + "learning_rate": 4.349469956422784e-06, + "loss": 0.678, + "step": 5165 + }, + { + "epoch": 1.4310249307479224, + "grad_norm": 0.7216992974281311, + "learning_rate": 4.34922483134196e-06, + "loss": 0.6285, + "step": 5166 + }, + { + "epoch": 1.4313019390581718, + "grad_norm": 0.6989957690238953, + "learning_rate": 4.34897966699737e-06, + "loss": 0.6434, + "step": 5167 + }, + { + "epoch": 1.431578947368421, + "grad_norm": 0.726123571395874, + "learning_rate": 4.34873446339422e-06, + "loss": 0.6514, + "step": 5168 + }, + { + "epoch": 1.4318559556786703, + "grad_norm": 0.7209747433662415, + "learning_rate": 4.348489220537716e-06, + "loss": 0.635, + "step": 5169 + }, + { + "epoch": 1.4321329639889195, + "grad_norm": 0.6753904223442078, + "learning_rate": 4.348243938433065e-06, + "loss": 0.6142, + "step": 5170 + }, + { + "epoch": 1.432409972299169, + "grad_norm": 0.740227222442627, + "learning_rate": 4.347998617085477e-06, + "loss": 0.631, + "step": 5171 + }, + { + "epoch": 1.4326869806094182, + "grad_norm": 0.6714673638343811, + "learning_rate": 4.347753256500158e-06, + "loss": 0.602, + "step": 5172 + }, + { + "epoch": 1.4329639889196675, + "grad_norm": 0.6836201548576355, + "learning_rate": 4.347507856682318e-06, + "loss": 0.63, + "step": 5173 + }, + { + "epoch": 1.433240997229917, + "grad_norm": 0.7172759175300598, + "learning_rate": 4.347262417637169e-06, + "loss": 0.6962, + "step": 5174 + }, + { + "epoch": 1.4335180055401662, + "grad_norm": 0.6770889163017273, + "learning_rate": 4.347016939369921e-06, + "loss": 0.6794, + "step": 5175 + }, + { + "epoch": 1.4337950138504154, + "grad_norm": 0.738587498664856, + "learning_rate": 4.346771421885787e-06, + "loss": 0.6369, + "step": 5176 + }, + { + "epoch": 1.4340720221606649, + "grad_norm": 0.7032896876335144, + "learning_rate": 4.346525865189979e-06, + "loss": 0.6236, + "step": 5177 + }, + { + "epoch": 1.434349030470914, + "grad_norm": 0.7206043004989624, + "learning_rate": 4.346280269287712e-06, + "loss": 0.6393, + "step": 5178 + }, + { + "epoch": 1.4346260387811633, + "grad_norm": 0.7211987376213074, + "learning_rate": 4.3460346341842e-06, + "loss": 0.6348, + "step": 5179 + }, + { + "epoch": 1.4349030470914128, + "grad_norm": 0.6673139333724976, + "learning_rate": 4.345788959884658e-06, + "loss": 0.6243, + "step": 5180 + }, + { + "epoch": 1.435180055401662, + "grad_norm": 0.7322635054588318, + "learning_rate": 4.345543246394303e-06, + "loss": 0.6455, + "step": 5181 + }, + { + "epoch": 1.4354570637119113, + "grad_norm": 0.6816807985305786, + "learning_rate": 4.345297493718352e-06, + "loss": 0.6102, + "step": 5182 + }, + { + "epoch": 1.4357340720221607, + "grad_norm": 0.6793221831321716, + "learning_rate": 4.345051701862023e-06, + "loss": 0.6377, + "step": 5183 + }, + { + "epoch": 1.43601108033241, + "grad_norm": 0.6505659222602844, + "learning_rate": 4.344805870830534e-06, + "loss": 0.5451, + "step": 5184 + }, + { + "epoch": 1.4362880886426592, + "grad_norm": 0.7179732322692871, + "learning_rate": 4.344560000629105e-06, + "loss": 0.6237, + "step": 5185 + }, + { + "epoch": 1.4365650969529087, + "grad_norm": 0.7613140940666199, + "learning_rate": 4.344314091262958e-06, + "loss": 0.6479, + "step": 5186 + }, + { + "epoch": 1.436842105263158, + "grad_norm": 0.7074609398841858, + "learning_rate": 4.344068142737312e-06, + "loss": 0.6614, + "step": 5187 + }, + { + "epoch": 1.4371191135734072, + "grad_norm": 0.7174100279808044, + "learning_rate": 4.34382215505739e-06, + "loss": 0.622, + "step": 5188 + }, + { + "epoch": 1.4373961218836566, + "grad_norm": 0.6789400577545166, + "learning_rate": 4.343576128228416e-06, + "loss": 0.6135, + "step": 5189 + }, + { + "epoch": 1.4376731301939059, + "grad_norm": 0.6755404472351074, + "learning_rate": 4.343330062255612e-06, + "loss": 0.5856, + "step": 5190 + }, + { + "epoch": 1.437950138504155, + "grad_norm": 0.6998751163482666, + "learning_rate": 4.343083957144204e-06, + "loss": 0.6333, + "step": 5191 + }, + { + "epoch": 1.4382271468144046, + "grad_norm": 0.70278000831604, + "learning_rate": 4.3428378128994165e-06, + "loss": 0.5697, + "step": 5192 + }, + { + "epoch": 1.4385041551246538, + "grad_norm": 0.6691498160362244, + "learning_rate": 4.342591629526476e-06, + "loss": 0.6252, + "step": 5193 + }, + { + "epoch": 1.438781163434903, + "grad_norm": 0.6869835257530212, + "learning_rate": 4.34234540703061e-06, + "loss": 0.6272, + "step": 5194 + }, + { + "epoch": 1.4390581717451525, + "grad_norm": 0.663788914680481, + "learning_rate": 4.342099145417046e-06, + "loss": 0.6791, + "step": 5195 + }, + { + "epoch": 1.4393351800554017, + "grad_norm": 0.6984208822250366, + "learning_rate": 4.341852844691012e-06, + "loss": 0.6582, + "step": 5196 + }, + { + "epoch": 1.439612188365651, + "grad_norm": 0.6600984930992126, + "learning_rate": 4.34160650485774e-06, + "loss": 0.6389, + "step": 5197 + }, + { + "epoch": 1.4398891966759002, + "grad_norm": 0.7514785528182983, + "learning_rate": 4.341360125922458e-06, + "loss": 0.6174, + "step": 5198 + }, + { + "epoch": 1.4401662049861497, + "grad_norm": 0.7093493342399597, + "learning_rate": 4.3411137078904e-06, + "loss": 0.617, + "step": 5199 + }, + { + "epoch": 1.440443213296399, + "grad_norm": 0.639533281326294, + "learning_rate": 4.340867250766794e-06, + "loss": 0.5836, + "step": 5200 + }, + { + "epoch": 1.4407202216066481, + "grad_norm": 0.6882728934288025, + "learning_rate": 4.340620754556876e-06, + "loss": 0.6223, + "step": 5201 + }, + { + "epoch": 1.4409972299168974, + "grad_norm": 0.7091029286384583, + "learning_rate": 4.340374219265879e-06, + "loss": 0.6513, + "step": 5202 + }, + { + "epoch": 1.4412742382271468, + "grad_norm": 0.6923207640647888, + "learning_rate": 4.340127644899038e-06, + "loss": 0.6318, + "step": 5203 + }, + { + "epoch": 1.441551246537396, + "grad_norm": 0.7474663257598877, + "learning_rate": 4.339881031461588e-06, + "loss": 0.6651, + "step": 5204 + }, + { + "epoch": 1.4418282548476453, + "grad_norm": 0.7120457291603088, + "learning_rate": 4.339634378958765e-06, + "loss": 0.6202, + "step": 5205 + }, + { + "epoch": 1.4421052631578948, + "grad_norm": 0.7063796520233154, + "learning_rate": 4.339387687395805e-06, + "loss": 0.6105, + "step": 5206 + }, + { + "epoch": 1.442382271468144, + "grad_norm": 0.6743805408477783, + "learning_rate": 4.3391409567779485e-06, + "loss": 0.5991, + "step": 5207 + }, + { + "epoch": 1.4426592797783933, + "grad_norm": 0.6794716715812683, + "learning_rate": 4.338894187110432e-06, + "loss": 0.5837, + "step": 5208 + }, + { + "epoch": 1.4429362880886427, + "grad_norm": 0.709031879901886, + "learning_rate": 4.338647378398496e-06, + "loss": 0.5933, + "step": 5209 + }, + { + "epoch": 1.443213296398892, + "grad_norm": 0.6801929473876953, + "learning_rate": 4.338400530647382e-06, + "loss": 0.6406, + "step": 5210 + }, + { + "epoch": 1.4434903047091412, + "grad_norm": 0.7303329706192017, + "learning_rate": 4.3381536438623286e-06, + "loss": 0.6533, + "step": 5211 + }, + { + "epoch": 1.4437673130193907, + "grad_norm": 0.7102288603782654, + "learning_rate": 4.3379067180485795e-06, + "loss": 0.6582, + "step": 5212 + }, + { + "epoch": 1.44404432132964, + "grad_norm": 0.7001471519470215, + "learning_rate": 4.337659753211377e-06, + "loss": 0.6493, + "step": 5213 + }, + { + "epoch": 1.4443213296398891, + "grad_norm": 0.7027942538261414, + "learning_rate": 4.337412749355966e-06, + "loss": 0.6005, + "step": 5214 + }, + { + "epoch": 1.4445983379501386, + "grad_norm": 0.6876331567764282, + "learning_rate": 4.337165706487589e-06, + "loss": 0.669, + "step": 5215 + }, + { + "epoch": 1.4448753462603878, + "grad_norm": 0.707457959651947, + "learning_rate": 4.336918624611494e-06, + "loss": 0.6003, + "step": 5216 + }, + { + "epoch": 1.445152354570637, + "grad_norm": 0.7237023115158081, + "learning_rate": 4.336671503732924e-06, + "loss": 0.6595, + "step": 5217 + }, + { + "epoch": 1.4454293628808865, + "grad_norm": 0.7044596672058105, + "learning_rate": 4.336424343857128e-06, + "loss": 0.6118, + "step": 5218 + }, + { + "epoch": 1.4457063711911358, + "grad_norm": 0.719316303730011, + "learning_rate": 4.336177144989354e-06, + "loss": 0.6243, + "step": 5219 + }, + { + "epoch": 1.445983379501385, + "grad_norm": 0.6969180107116699, + "learning_rate": 4.335929907134849e-06, + "loss": 0.656, + "step": 5220 + }, + { + "epoch": 1.4462603878116345, + "grad_norm": 0.7368977665901184, + "learning_rate": 4.335682630298865e-06, + "loss": 0.6949, + "step": 5221 + }, + { + "epoch": 1.4465373961218837, + "grad_norm": 0.68007892370224, + "learning_rate": 4.33543531448665e-06, + "loss": 0.6299, + "step": 5222 + }, + { + "epoch": 1.446814404432133, + "grad_norm": 0.6848158836364746, + "learning_rate": 4.335187959703457e-06, + "loss": 0.6451, + "step": 5223 + }, + { + "epoch": 1.4470914127423824, + "grad_norm": 0.7511315941810608, + "learning_rate": 4.334940565954536e-06, + "loss": 0.651, + "step": 5224 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.6904433965682983, + "learning_rate": 4.334693133245141e-06, + "loss": 0.6535, + "step": 5225 + }, + { + "epoch": 1.4476454293628809, + "grad_norm": 0.7157757878303528, + "learning_rate": 4.334445661580527e-06, + "loss": 0.6092, + "step": 5226 + }, + { + "epoch": 1.4479224376731301, + "grad_norm": 0.7234309315681458, + "learning_rate": 4.334198150965946e-06, + "loss": 0.6465, + "step": 5227 + }, + { + "epoch": 1.4481994459833796, + "grad_norm": 0.7146196961402893, + "learning_rate": 4.333950601406653e-06, + "loss": 0.6399, + "step": 5228 + }, + { + "epoch": 1.4484764542936288, + "grad_norm": 0.7037093043327332, + "learning_rate": 4.333703012907907e-06, + "loss": 0.6356, + "step": 5229 + }, + { + "epoch": 1.448753462603878, + "grad_norm": 0.6986050605773926, + "learning_rate": 4.333455385474962e-06, + "loss": 0.6332, + "step": 5230 + }, + { + "epoch": 1.4490304709141273, + "grad_norm": 0.6712711453437805, + "learning_rate": 4.333207719113078e-06, + "loss": 0.6631, + "step": 5231 + }, + { + "epoch": 1.4493074792243767, + "grad_norm": 0.7003492116928101, + "learning_rate": 4.332960013827512e-06, + "loss": 0.6268, + "step": 5232 + }, + { + "epoch": 1.449584487534626, + "grad_norm": 0.705556333065033, + "learning_rate": 4.332712269623525e-06, + "loss": 0.6109, + "step": 5233 + }, + { + "epoch": 1.4498614958448752, + "grad_norm": 0.69657301902771, + "learning_rate": 4.332464486506375e-06, + "loss": 0.5833, + "step": 5234 + }, + { + "epoch": 1.4501385041551247, + "grad_norm": 0.682807445526123, + "learning_rate": 4.332216664481325e-06, + "loss": 0.5722, + "step": 5235 + }, + { + "epoch": 1.450415512465374, + "grad_norm": 0.6681585907936096, + "learning_rate": 4.331968803553636e-06, + "loss": 0.6012, + "step": 5236 + }, + { + "epoch": 1.4506925207756232, + "grad_norm": 0.6505764722824097, + "learning_rate": 4.331720903728571e-06, + "loss": 0.6116, + "step": 5237 + }, + { + "epoch": 1.4509695290858726, + "grad_norm": 0.7191979885101318, + "learning_rate": 4.331472965011394e-06, + "loss": 0.6577, + "step": 5238 + }, + { + "epoch": 1.4512465373961219, + "grad_norm": 0.6958252191543579, + "learning_rate": 4.331224987407369e-06, + "loss": 0.6515, + "step": 5239 + }, + { + "epoch": 1.451523545706371, + "grad_norm": 0.6717762351036072, + "learning_rate": 4.3309769709217605e-06, + "loss": 0.5901, + "step": 5240 + }, + { + "epoch": 1.4518005540166206, + "grad_norm": 0.716303825378418, + "learning_rate": 4.330728915559834e-06, + "loss": 0.652, + "step": 5241 + }, + { + "epoch": 1.4520775623268698, + "grad_norm": 0.6768020391464233, + "learning_rate": 4.33048082132686e-06, + "loss": 0.5722, + "step": 5242 + }, + { + "epoch": 1.452354570637119, + "grad_norm": 0.7079585790634155, + "learning_rate": 4.330232688228102e-06, + "loss": 0.6415, + "step": 5243 + }, + { + "epoch": 1.4526315789473685, + "grad_norm": 0.7371026277542114, + "learning_rate": 4.329984516268831e-06, + "loss": 0.6219, + "step": 5244 + }, + { + "epoch": 1.4529085872576177, + "grad_norm": 0.6973332166671753, + "learning_rate": 4.329736305454314e-06, + "loss": 0.6243, + "step": 5245 + }, + { + "epoch": 1.453185595567867, + "grad_norm": 0.6986327767372131, + "learning_rate": 4.329488055789824e-06, + "loss": 0.6318, + "step": 5246 + }, + { + "epoch": 1.4534626038781164, + "grad_norm": 0.6920615434646606, + "learning_rate": 4.3292397672806296e-06, + "loss": 0.6202, + "step": 5247 + }, + { + "epoch": 1.4537396121883657, + "grad_norm": 0.7052721977233887, + "learning_rate": 4.328991439932003e-06, + "loss": 0.6409, + "step": 5248 + }, + { + "epoch": 1.454016620498615, + "grad_norm": 0.6558910608291626, + "learning_rate": 4.328743073749219e-06, + "loss": 0.6174, + "step": 5249 + }, + { + "epoch": 1.4542936288088644, + "grad_norm": 0.696160078048706, + "learning_rate": 4.328494668737549e-06, + "loss": 0.6321, + "step": 5250 + }, + { + "epoch": 1.4545706371191136, + "grad_norm": 0.7242604494094849, + "learning_rate": 4.328246224902267e-06, + "loss": 0.6464, + "step": 5251 + }, + { + "epoch": 1.4548476454293628, + "grad_norm": 0.6753219366073608, + "learning_rate": 4.3279977422486495e-06, + "loss": 0.6298, + "step": 5252 + }, + { + "epoch": 1.4551246537396123, + "grad_norm": 0.7261792421340942, + "learning_rate": 4.327749220781972e-06, + "loss": 0.6281, + "step": 5253 + }, + { + "epoch": 1.4554016620498615, + "grad_norm": 0.7096898555755615, + "learning_rate": 4.327500660507511e-06, + "loss": 0.6053, + "step": 5254 + }, + { + "epoch": 1.4556786703601108, + "grad_norm": 0.6910092234611511, + "learning_rate": 4.327252061430543e-06, + "loss": 0.6212, + "step": 5255 + }, + { + "epoch": 1.4559556786703602, + "grad_norm": 0.6957384943962097, + "learning_rate": 4.327003423556349e-06, + "loss": 0.6263, + "step": 5256 + }, + { + "epoch": 1.4562326869806095, + "grad_norm": 0.6883211731910706, + "learning_rate": 4.326754746890205e-06, + "loss": 0.644, + "step": 5257 + }, + { + "epoch": 1.4565096952908587, + "grad_norm": 0.7107007503509521, + "learning_rate": 4.326506031437393e-06, + "loss": 0.6443, + "step": 5258 + }, + { + "epoch": 1.456786703601108, + "grad_norm": 0.708169162273407, + "learning_rate": 4.326257277203194e-06, + "loss": 0.6279, + "step": 5259 + }, + { + "epoch": 1.4570637119113574, + "grad_norm": 0.7273499369621277, + "learning_rate": 4.326008484192889e-06, + "loss": 0.684, + "step": 5260 + }, + { + "epoch": 1.4573407202216067, + "grad_norm": 0.7274107336997986, + "learning_rate": 4.325759652411762e-06, + "loss": 0.5453, + "step": 5261 + }, + { + "epoch": 1.457617728531856, + "grad_norm": 0.6847637891769409, + "learning_rate": 4.325510781865094e-06, + "loss": 0.6211, + "step": 5262 + }, + { + "epoch": 1.4578947368421051, + "grad_norm": 0.6615282893180847, + "learning_rate": 4.32526187255817e-06, + "loss": 0.5822, + "step": 5263 + }, + { + "epoch": 1.4581717451523546, + "grad_norm": 0.6708592176437378, + "learning_rate": 4.325012924496276e-06, + "loss": 0.637, + "step": 5264 + }, + { + "epoch": 1.4584487534626038, + "grad_norm": 0.7095727324485779, + "learning_rate": 4.324763937684696e-06, + "loss": 0.6257, + "step": 5265 + }, + { + "epoch": 1.458725761772853, + "grad_norm": 0.6905611753463745, + "learning_rate": 4.324514912128719e-06, + "loss": 0.6167, + "step": 5266 + }, + { + "epoch": 1.4590027700831025, + "grad_norm": 0.7230263352394104, + "learning_rate": 4.32426584783363e-06, + "loss": 0.5622, + "step": 5267 + }, + { + "epoch": 1.4592797783933518, + "grad_norm": 0.6828804612159729, + "learning_rate": 4.324016744804719e-06, + "loss": 0.6402, + "step": 5268 + }, + { + "epoch": 1.459556786703601, + "grad_norm": 0.7129217982292175, + "learning_rate": 4.323767603047274e-06, + "loss": 0.6777, + "step": 5269 + }, + { + "epoch": 1.4598337950138505, + "grad_norm": 0.6748174428939819, + "learning_rate": 4.323518422566586e-06, + "loss": 0.609, + "step": 5270 + }, + { + "epoch": 1.4601108033240997, + "grad_norm": 0.6736699342727661, + "learning_rate": 4.323269203367944e-06, + "loss": 0.6251, + "step": 5271 + }, + { + "epoch": 1.460387811634349, + "grad_norm": 0.706195592880249, + "learning_rate": 4.323019945456642e-06, + "loss": 0.5806, + "step": 5272 + }, + { + "epoch": 1.4606648199445984, + "grad_norm": 0.7386101484298706, + "learning_rate": 4.32277064883797e-06, + "loss": 0.6047, + "step": 5273 + }, + { + "epoch": 1.4609418282548476, + "grad_norm": 0.7328194975852966, + "learning_rate": 4.322521313517223e-06, + "loss": 0.6254, + "step": 5274 + }, + { + "epoch": 1.4612188365650969, + "grad_norm": 0.6765896081924438, + "learning_rate": 4.3222719394996944e-06, + "loss": 0.6273, + "step": 5275 + }, + { + "epoch": 1.4614958448753463, + "grad_norm": 0.6815961003303528, + "learning_rate": 4.322022526790679e-06, + "loss": 0.6604, + "step": 5276 + }, + { + "epoch": 1.4617728531855956, + "grad_norm": 0.7059053778648376, + "learning_rate": 4.321773075395473e-06, + "loss": 0.6445, + "step": 5277 + }, + { + "epoch": 1.4620498614958448, + "grad_norm": 0.6844702363014221, + "learning_rate": 4.321523585319371e-06, + "loss": 0.6199, + "step": 5278 + }, + { + "epoch": 1.4623268698060943, + "grad_norm": 0.6714056134223938, + "learning_rate": 4.321274056567672e-06, + "loss": 0.5777, + "step": 5279 + }, + { + "epoch": 1.4626038781163435, + "grad_norm": 0.7048715353012085, + "learning_rate": 4.321024489145674e-06, + "loss": 0.658, + "step": 5280 + }, + { + "epoch": 1.4628808864265928, + "grad_norm": 0.6695634126663208, + "learning_rate": 4.320774883058675e-06, + "loss": 0.6579, + "step": 5281 + }, + { + "epoch": 1.4631578947368422, + "grad_norm": 0.735461413860321, + "learning_rate": 4.320525238311976e-06, + "loss": 0.6551, + "step": 5282 + }, + { + "epoch": 1.4634349030470915, + "grad_norm": 0.6574238538742065, + "learning_rate": 4.320275554910876e-06, + "loss": 0.5934, + "step": 5283 + }, + { + "epoch": 1.4637119113573407, + "grad_norm": 0.6849135160446167, + "learning_rate": 4.320025832860679e-06, + "loss": 0.5843, + "step": 5284 + }, + { + "epoch": 1.4639889196675901, + "grad_norm": 0.7034268975257874, + "learning_rate": 4.319776072166686e-06, + "loss": 0.6099, + "step": 5285 + }, + { + "epoch": 1.4642659279778394, + "grad_norm": 0.7210450768470764, + "learning_rate": 4.319526272834198e-06, + "loss": 0.6202, + "step": 5286 + }, + { + "epoch": 1.4645429362880886, + "grad_norm": 0.7004812359809875, + "learning_rate": 4.319276434868522e-06, + "loss": 0.6692, + "step": 5287 + }, + { + "epoch": 1.464819944598338, + "grad_norm": 0.7177503705024719, + "learning_rate": 4.319026558274961e-06, + "loss": 0.6493, + "step": 5288 + }, + { + "epoch": 1.4650969529085873, + "grad_norm": 0.6847087740898132, + "learning_rate": 4.3187766430588215e-06, + "loss": 0.6499, + "step": 5289 + }, + { + "epoch": 1.4653739612188366, + "grad_norm": 0.7065343856811523, + "learning_rate": 4.318526689225408e-06, + "loss": 0.6308, + "step": 5290 + }, + { + "epoch": 1.4656509695290858, + "grad_norm": 0.687520444393158, + "learning_rate": 4.31827669678003e-06, + "loss": 0.66, + "step": 5291 + }, + { + "epoch": 1.4659279778393353, + "grad_norm": 0.6558403968811035, + "learning_rate": 4.318026665727993e-06, + "loss": 0.6164, + "step": 5292 + }, + { + "epoch": 1.4662049861495845, + "grad_norm": 0.7158898711204529, + "learning_rate": 4.317776596074609e-06, + "loss": 0.6015, + "step": 5293 + }, + { + "epoch": 1.4664819944598337, + "grad_norm": 0.6923995614051819, + "learning_rate": 4.317526487825185e-06, + "loss": 0.617, + "step": 5294 + }, + { + "epoch": 1.466759002770083, + "grad_norm": 0.6958532333374023, + "learning_rate": 4.317276340985032e-06, + "loss": 0.6142, + "step": 5295 + }, + { + "epoch": 1.4670360110803324, + "grad_norm": 0.6740797758102417, + "learning_rate": 4.3170261555594615e-06, + "loss": 0.6077, + "step": 5296 + }, + { + "epoch": 1.4673130193905817, + "grad_norm": 0.7106777429580688, + "learning_rate": 4.316775931553785e-06, + "loss": 0.6472, + "step": 5297 + }, + { + "epoch": 1.467590027700831, + "grad_norm": 0.6848693490028381, + "learning_rate": 4.316525668973317e-06, + "loss": 0.6531, + "step": 5298 + }, + { + "epoch": 1.4678670360110804, + "grad_norm": 0.6626380681991577, + "learning_rate": 4.316275367823369e-06, + "loss": 0.6212, + "step": 5299 + }, + { + "epoch": 1.4681440443213296, + "grad_norm": 0.7039237022399902, + "learning_rate": 4.3160250281092575e-06, + "loss": 0.653, + "step": 5300 + }, + { + "epoch": 1.4684210526315788, + "grad_norm": 0.6798372268676758, + "learning_rate": 4.315774649836297e-06, + "loss": 0.5864, + "step": 5301 + }, + { + "epoch": 1.4686980609418283, + "grad_norm": 0.6488889455795288, + "learning_rate": 4.315524233009803e-06, + "loss": 0.6323, + "step": 5302 + }, + { + "epoch": 1.4689750692520775, + "grad_norm": 0.6437256932258606, + "learning_rate": 4.3152737776350935e-06, + "loss": 0.5617, + "step": 5303 + }, + { + "epoch": 1.4692520775623268, + "grad_norm": 0.6839866638183594, + "learning_rate": 4.315023283717487e-06, + "loss": 0.6437, + "step": 5304 + }, + { + "epoch": 1.4695290858725762, + "grad_norm": 0.6503082513809204, + "learning_rate": 4.3147727512623e-06, + "loss": 0.6379, + "step": 5305 + }, + { + "epoch": 1.4698060941828255, + "grad_norm": 0.7124677300453186, + "learning_rate": 4.314522180274854e-06, + "loss": 0.6483, + "step": 5306 + }, + { + "epoch": 1.4700831024930747, + "grad_norm": 0.6634410619735718, + "learning_rate": 4.314271570760467e-06, + "loss": 0.6248, + "step": 5307 + }, + { + "epoch": 1.4703601108033242, + "grad_norm": 0.671969473361969, + "learning_rate": 4.314020922724462e-06, + "loss": 0.6142, + "step": 5308 + }, + { + "epoch": 1.4706371191135734, + "grad_norm": 0.7019748091697693, + "learning_rate": 4.313770236172161e-06, + "loss": 0.6396, + "step": 5309 + }, + { + "epoch": 1.4709141274238227, + "grad_norm": 0.7120486497879028, + "learning_rate": 4.313519511108886e-06, + "loss": 0.6307, + "step": 5310 + }, + { + "epoch": 1.4711911357340721, + "grad_norm": 0.7347435355186462, + "learning_rate": 4.313268747539959e-06, + "loss": 0.6312, + "step": 5311 + }, + { + "epoch": 1.4714681440443214, + "grad_norm": 0.6481282114982605, + "learning_rate": 4.3130179454707075e-06, + "loss": 0.6131, + "step": 5312 + }, + { + "epoch": 1.4717451523545706, + "grad_norm": 0.6802014708518982, + "learning_rate": 4.3127671049064546e-06, + "loss": 0.6703, + "step": 5313 + }, + { + "epoch": 1.47202216066482, + "grad_norm": 0.6929247975349426, + "learning_rate": 4.3125162258525265e-06, + "loss": 0.6416, + "step": 5314 + }, + { + "epoch": 1.4722991689750693, + "grad_norm": 0.7569963932037354, + "learning_rate": 4.312265308314251e-06, + "loss": 0.6125, + "step": 5315 + }, + { + "epoch": 1.4725761772853185, + "grad_norm": 0.6945342421531677, + "learning_rate": 4.312014352296954e-06, + "loss": 0.6376, + "step": 5316 + }, + { + "epoch": 1.472853185595568, + "grad_norm": 0.6991958618164062, + "learning_rate": 4.3117633578059645e-06, + "loss": 0.5975, + "step": 5317 + }, + { + "epoch": 1.4731301939058172, + "grad_norm": 0.709173321723938, + "learning_rate": 4.311512324846614e-06, + "loss": 0.648, + "step": 5318 + }, + { + "epoch": 1.4734072022160665, + "grad_norm": 0.7404986619949341, + "learning_rate": 4.311261253424229e-06, + "loss": 0.6158, + "step": 5319 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.7493718266487122, + "learning_rate": 4.311010143544143e-06, + "loss": 0.6611, + "step": 5320 + }, + { + "epoch": 1.4739612188365652, + "grad_norm": 0.7251834273338318, + "learning_rate": 4.310758995211687e-06, + "loss": 0.545, + "step": 5321 + }, + { + "epoch": 1.4742382271468144, + "grad_norm": 0.7508950233459473, + "learning_rate": 4.310507808432193e-06, + "loss": 0.645, + "step": 5322 + }, + { + "epoch": 1.4745152354570636, + "grad_norm": 0.7097291350364685, + "learning_rate": 4.310256583210995e-06, + "loss": 0.6233, + "step": 5323 + }, + { + "epoch": 1.4747922437673129, + "grad_norm": 0.6722292900085449, + "learning_rate": 4.3100053195534265e-06, + "loss": 0.6236, + "step": 5324 + }, + { + "epoch": 1.4750692520775623, + "grad_norm": 0.6865547895431519, + "learning_rate": 4.309754017464823e-06, + "loss": 0.6743, + "step": 5325 + }, + { + "epoch": 1.4753462603878116, + "grad_norm": 0.6935882568359375, + "learning_rate": 4.30950267695052e-06, + "loss": 0.634, + "step": 5326 + }, + { + "epoch": 1.4756232686980608, + "grad_norm": 0.7458108067512512, + "learning_rate": 4.3092512980158545e-06, + "loss": 0.6486, + "step": 5327 + }, + { + "epoch": 1.4759002770083103, + "grad_norm": 0.693493664264679, + "learning_rate": 4.308999880666163e-06, + "loss": 0.6331, + "step": 5328 + }, + { + "epoch": 1.4761772853185595, + "grad_norm": 0.6644585728645325, + "learning_rate": 4.308748424906785e-06, + "loss": 0.6216, + "step": 5329 + }, + { + "epoch": 1.4764542936288088, + "grad_norm": 0.7142424583435059, + "learning_rate": 4.308496930743058e-06, + "loss": 0.6425, + "step": 5330 + }, + { + "epoch": 1.4767313019390582, + "grad_norm": 0.7267554402351379, + "learning_rate": 4.308245398180323e-06, + "loss": 0.6313, + "step": 5331 + }, + { + "epoch": 1.4770083102493075, + "grad_norm": 0.666397213935852, + "learning_rate": 4.307993827223922e-06, + "loss": 0.6631, + "step": 5332 + }, + { + "epoch": 1.4772853185595567, + "grad_norm": 0.7025532722473145, + "learning_rate": 4.307742217879193e-06, + "loss": 0.6206, + "step": 5333 + }, + { + "epoch": 1.4775623268698062, + "grad_norm": 0.7297946810722351, + "learning_rate": 4.307490570151481e-06, + "loss": 0.5774, + "step": 5334 + }, + { + "epoch": 1.4778393351800554, + "grad_norm": 0.672098696231842, + "learning_rate": 4.307238884046129e-06, + "loss": 0.612, + "step": 5335 + }, + { + "epoch": 1.4781163434903046, + "grad_norm": 0.6860193610191345, + "learning_rate": 4.3069871595684795e-06, + "loss": 0.6446, + "step": 5336 + }, + { + "epoch": 1.478393351800554, + "grad_norm": 0.6797826886177063, + "learning_rate": 4.306735396723878e-06, + "loss": 0.6536, + "step": 5337 + }, + { + "epoch": 1.4786703601108033, + "grad_norm": 0.6978946924209595, + "learning_rate": 4.30648359551767e-06, + "loss": 0.6221, + "step": 5338 + }, + { + "epoch": 1.4789473684210526, + "grad_norm": 0.694804847240448, + "learning_rate": 4.306231755955202e-06, + "loss": 0.6012, + "step": 5339 + }, + { + "epoch": 1.479224376731302, + "grad_norm": 0.7225304841995239, + "learning_rate": 4.305979878041823e-06, + "loss": 0.6955, + "step": 5340 + }, + { + "epoch": 1.4795013850415513, + "grad_norm": 0.7552931904792786, + "learning_rate": 4.305727961782877e-06, + "loss": 0.5893, + "step": 5341 + }, + { + "epoch": 1.4797783933518005, + "grad_norm": 0.7309013605117798, + "learning_rate": 4.305476007183715e-06, + "loss": 0.6621, + "step": 5342 + }, + { + "epoch": 1.48005540166205, + "grad_norm": 0.6812788844108582, + "learning_rate": 4.305224014249688e-06, + "loss": 0.6633, + "step": 5343 + }, + { + "epoch": 1.4803324099722992, + "grad_norm": 0.7306602597236633, + "learning_rate": 4.304971982986145e-06, + "loss": 0.6242, + "step": 5344 + }, + { + "epoch": 1.4806094182825484, + "grad_norm": 0.6713859438896179, + "learning_rate": 4.304719913398438e-06, + "loss": 0.5877, + "step": 5345 + }, + { + "epoch": 1.480886426592798, + "grad_norm": 0.6959987878799438, + "learning_rate": 4.304467805491916e-06, + "loss": 0.648, + "step": 5346 + }, + { + "epoch": 1.4811634349030471, + "grad_norm": 0.6816545128822327, + "learning_rate": 4.304215659271936e-06, + "loss": 0.6096, + "step": 5347 + }, + { + "epoch": 1.4814404432132964, + "grad_norm": 0.7139344811439514, + "learning_rate": 4.303963474743851e-06, + "loss": 0.6545, + "step": 5348 + }, + { + "epoch": 1.4817174515235458, + "grad_norm": 0.7413231730461121, + "learning_rate": 4.303711251913013e-06, + "loss": 0.6533, + "step": 5349 + }, + { + "epoch": 1.481994459833795, + "grad_norm": 0.6867762804031372, + "learning_rate": 4.30345899078478e-06, + "loss": 0.6578, + "step": 5350 + }, + { + "epoch": 1.4822714681440443, + "grad_norm": 0.7088814377784729, + "learning_rate": 4.303206691364507e-06, + "loss": 0.6596, + "step": 5351 + }, + { + "epoch": 1.4825484764542936, + "grad_norm": 0.7046895027160645, + "learning_rate": 4.30295435365755e-06, + "loss": 0.6589, + "step": 5352 + }, + { + "epoch": 1.482825484764543, + "grad_norm": 0.6934412121772766, + "learning_rate": 4.302701977669269e-06, + "loss": 0.5886, + "step": 5353 + }, + { + "epoch": 1.4831024930747922, + "grad_norm": 0.7157440185546875, + "learning_rate": 4.302449563405021e-06, + "loss": 0.6188, + "step": 5354 + }, + { + "epoch": 1.4833795013850415, + "grad_norm": 0.6679389476776123, + "learning_rate": 4.302197110870165e-06, + "loss": 0.6092, + "step": 5355 + }, + { + "epoch": 1.4836565096952907, + "grad_norm": 0.6910407543182373, + "learning_rate": 4.301944620070063e-06, + "loss": 0.5916, + "step": 5356 + }, + { + "epoch": 1.4839335180055402, + "grad_norm": 0.7141585946083069, + "learning_rate": 4.301692091010075e-06, + "loss": 0.6066, + "step": 5357 + }, + { + "epoch": 1.4842105263157894, + "grad_norm": 0.7129756808280945, + "learning_rate": 4.3014395236955635e-06, + "loss": 0.6816, + "step": 5358 + }, + { + "epoch": 1.4844875346260387, + "grad_norm": 0.7230321168899536, + "learning_rate": 4.301186918131889e-06, + "loss": 0.5535, + "step": 5359 + }, + { + "epoch": 1.4847645429362881, + "grad_norm": 0.6658442616462708, + "learning_rate": 4.300934274324417e-06, + "loss": 0.6308, + "step": 5360 + }, + { + "epoch": 1.4850415512465374, + "grad_norm": 0.7296638488769531, + "learning_rate": 4.3006815922785115e-06, + "loss": 0.6358, + "step": 5361 + }, + { + "epoch": 1.4853185595567866, + "grad_norm": 0.7094091176986694, + "learning_rate": 4.300428871999538e-06, + "loss": 0.6209, + "step": 5362 + }, + { + "epoch": 1.485595567867036, + "grad_norm": 0.669418215751648, + "learning_rate": 4.300176113492861e-06, + "loss": 0.6032, + "step": 5363 + }, + { + "epoch": 1.4858725761772853, + "grad_norm": 0.7072867751121521, + "learning_rate": 4.299923316763848e-06, + "loss": 0.654, + "step": 5364 + }, + { + "epoch": 1.4861495844875345, + "grad_norm": 0.6897730827331543, + "learning_rate": 4.299670481817867e-06, + "loss": 0.6145, + "step": 5365 + }, + { + "epoch": 1.486426592797784, + "grad_norm": 0.6724587678909302, + "learning_rate": 4.299417608660285e-06, + "loss": 0.6323, + "step": 5366 + }, + { + "epoch": 1.4867036011080332, + "grad_norm": 0.6695003509521484, + "learning_rate": 4.299164697296473e-06, + "loss": 0.6383, + "step": 5367 + }, + { + "epoch": 1.4869806094182825, + "grad_norm": 0.6462554931640625, + "learning_rate": 4.298911747731798e-06, + "loss": 0.5728, + "step": 5368 + }, + { + "epoch": 1.487257617728532, + "grad_norm": 0.676162838935852, + "learning_rate": 4.298658759971635e-06, + "loss": 0.6123, + "step": 5369 + }, + { + "epoch": 1.4875346260387812, + "grad_norm": 0.7211152911186218, + "learning_rate": 4.298405734021352e-06, + "loss": 0.6803, + "step": 5370 + }, + { + "epoch": 1.4878116343490304, + "grad_norm": 0.7380316853523254, + "learning_rate": 4.298152669886323e-06, + "loss": 0.6448, + "step": 5371 + }, + { + "epoch": 1.4880886426592799, + "grad_norm": 0.7026437520980835, + "learning_rate": 4.297899567571921e-06, + "loss": 0.6442, + "step": 5372 + }, + { + "epoch": 1.488365650969529, + "grad_norm": 0.6982065439224243, + "learning_rate": 4.297646427083519e-06, + "loss": 0.6463, + "step": 5373 + }, + { + "epoch": 1.4886426592797783, + "grad_norm": 0.6661611199378967, + "learning_rate": 4.297393248426494e-06, + "loss": 0.6259, + "step": 5374 + }, + { + "epoch": 1.4889196675900278, + "grad_norm": 0.7075362801551819, + "learning_rate": 4.297140031606218e-06, + "loss": 0.6677, + "step": 5375 + }, + { + "epoch": 1.489196675900277, + "grad_norm": 0.7266721129417419, + "learning_rate": 4.296886776628072e-06, + "loss": 0.6782, + "step": 5376 + }, + { + "epoch": 1.4894736842105263, + "grad_norm": 0.7158677577972412, + "learning_rate": 4.2966334834974295e-06, + "loss": 0.6519, + "step": 5377 + }, + { + "epoch": 1.4897506925207757, + "grad_norm": 0.6620814204216003, + "learning_rate": 4.296380152219671e-06, + "loss": 0.6416, + "step": 5378 + }, + { + "epoch": 1.490027700831025, + "grad_norm": 0.74250727891922, + "learning_rate": 4.2961267828001744e-06, + "loss": 0.6586, + "step": 5379 + }, + { + "epoch": 1.4903047091412742, + "grad_norm": 0.6609101295471191, + "learning_rate": 4.295873375244319e-06, + "loss": 0.6372, + "step": 5380 + }, + { + "epoch": 1.4905817174515237, + "grad_norm": 0.7089360952377319, + "learning_rate": 4.295619929557486e-06, + "loss": 0.622, + "step": 5381 + }, + { + "epoch": 1.490858725761773, + "grad_norm": 0.6710687875747681, + "learning_rate": 4.295366445745056e-06, + "loss": 0.6346, + "step": 5382 + }, + { + "epoch": 1.4911357340720222, + "grad_norm": 0.7197329998016357, + "learning_rate": 4.295112923812412e-06, + "loss": 0.6322, + "step": 5383 + }, + { + "epoch": 1.4914127423822714, + "grad_norm": 0.7263287901878357, + "learning_rate": 4.294859363764936e-06, + "loss": 0.6461, + "step": 5384 + }, + { + "epoch": 1.4916897506925209, + "grad_norm": 0.6800792217254639, + "learning_rate": 4.294605765608013e-06, + "loss": 0.5554, + "step": 5385 + }, + { + "epoch": 1.49196675900277, + "grad_norm": 0.6973571181297302, + "learning_rate": 4.294352129347026e-06, + "loss": 0.6044, + "step": 5386 + }, + { + "epoch": 1.4922437673130193, + "grad_norm": 0.6802106499671936, + "learning_rate": 4.29409845498736e-06, + "loss": 0.6012, + "step": 5387 + }, + { + "epoch": 1.4925207756232686, + "grad_norm": 0.7250824570655823, + "learning_rate": 4.2938447425344034e-06, + "loss": 0.664, + "step": 5388 + }, + { + "epoch": 1.492797783933518, + "grad_norm": 0.6981179118156433, + "learning_rate": 4.293590991993542e-06, + "loss": 0.6245, + "step": 5389 + }, + { + "epoch": 1.4930747922437673, + "grad_norm": 0.7301192879676819, + "learning_rate": 4.293337203370162e-06, + "loss": 0.686, + "step": 5390 + }, + { + "epoch": 1.4933518005540165, + "grad_norm": 0.716961681842804, + "learning_rate": 4.293083376669655e-06, + "loss": 0.6187, + "step": 5391 + }, + { + "epoch": 1.493628808864266, + "grad_norm": 0.7076943516731262, + "learning_rate": 4.292829511897409e-06, + "loss": 0.6584, + "step": 5392 + }, + { + "epoch": 1.4939058171745152, + "grad_norm": 0.676984965801239, + "learning_rate": 4.292575609058814e-06, + "loss": 0.6488, + "step": 5393 + }, + { + "epoch": 1.4941828254847644, + "grad_norm": 0.6705701947212219, + "learning_rate": 4.292321668159261e-06, + "loss": 0.5845, + "step": 5394 + }, + { + "epoch": 1.494459833795014, + "grad_norm": 0.7139601111412048, + "learning_rate": 4.292067689204142e-06, + "loss": 0.6151, + "step": 5395 + }, + { + "epoch": 1.4947368421052631, + "grad_norm": 0.6521778702735901, + "learning_rate": 4.2918136721988475e-06, + "loss": 0.5775, + "step": 5396 + }, + { + "epoch": 1.4950138504155124, + "grad_norm": 0.7110154032707214, + "learning_rate": 4.291559617148775e-06, + "loss": 0.6636, + "step": 5397 + }, + { + "epoch": 1.4952908587257618, + "grad_norm": 0.672015368938446, + "learning_rate": 4.291305524059317e-06, + "loss": 0.6439, + "step": 5398 + }, + { + "epoch": 1.495567867036011, + "grad_norm": 0.6617342233657837, + "learning_rate": 4.2910513929358665e-06, + "loss": 0.6578, + "step": 5399 + }, + { + "epoch": 1.4958448753462603, + "grad_norm": 0.6488606929779053, + "learning_rate": 4.2907972237838225e-06, + "loss": 0.6104, + "step": 5400 + }, + { + "epoch": 1.4961218836565098, + "grad_norm": 0.7130759358406067, + "learning_rate": 4.29054301660858e-06, + "loss": 0.6166, + "step": 5401 + }, + { + "epoch": 1.496398891966759, + "grad_norm": 0.7276590466499329, + "learning_rate": 4.290288771415536e-06, + "loss": 0.6528, + "step": 5402 + }, + { + "epoch": 1.4966759002770083, + "grad_norm": 0.7055269479751587, + "learning_rate": 4.29003448821009e-06, + "loss": 0.6317, + "step": 5403 + }, + { + "epoch": 1.4969529085872577, + "grad_norm": 0.6901901364326477, + "learning_rate": 4.28978016699764e-06, + "loss": 0.642, + "step": 5404 + }, + { + "epoch": 1.497229916897507, + "grad_norm": 0.7070296406745911, + "learning_rate": 4.289525807783588e-06, + "loss": 0.6675, + "step": 5405 + }, + { + "epoch": 1.4975069252077562, + "grad_norm": 0.6661621332168579, + "learning_rate": 4.289271410573332e-06, + "loss": 0.6203, + "step": 5406 + }, + { + "epoch": 1.4977839335180057, + "grad_norm": 0.6991596221923828, + "learning_rate": 4.2890169753722746e-06, + "loss": 0.5962, + "step": 5407 + }, + { + "epoch": 1.4980609418282549, + "grad_norm": 0.7044360041618347, + "learning_rate": 4.288762502185818e-06, + "loss": 0.6288, + "step": 5408 + }, + { + "epoch": 1.4983379501385041, + "grad_norm": 0.6955025792121887, + "learning_rate": 4.288507991019366e-06, + "loss": 0.6694, + "step": 5409 + }, + { + "epoch": 1.4986149584487536, + "grad_norm": 0.7140067219734192, + "learning_rate": 4.2882534418783216e-06, + "loss": 0.6491, + "step": 5410 + }, + { + "epoch": 1.4988919667590028, + "grad_norm": 0.7064623236656189, + "learning_rate": 4.287998854768091e-06, + "loss": 0.6259, + "step": 5411 + }, + { + "epoch": 1.499168975069252, + "grad_norm": 0.6820512413978577, + "learning_rate": 4.287744229694077e-06, + "loss": 0.6292, + "step": 5412 + }, + { + "epoch": 1.4994459833795015, + "grad_norm": 0.7026885747909546, + "learning_rate": 4.287489566661689e-06, + "loss": 0.6626, + "step": 5413 + }, + { + "epoch": 1.4997229916897508, + "grad_norm": 0.6979086995124817, + "learning_rate": 4.287234865676332e-06, + "loss": 0.6331, + "step": 5414 + }, + { + "epoch": 1.5, + "grad_norm": 0.6782578825950623, + "learning_rate": 4.2869801267434145e-06, + "loss": 0.6163, + "step": 5415 + }, + { + "epoch": 1.5002770083102495, + "grad_norm": 0.7158074378967285, + "learning_rate": 4.2867253498683455e-06, + "loss": 0.6391, + "step": 5416 + }, + { + "epoch": 1.5005540166204985, + "grad_norm": 0.703403115272522, + "learning_rate": 4.286470535056536e-06, + "loss": 0.6348, + "step": 5417 + }, + { + "epoch": 1.500831024930748, + "grad_norm": 0.6722911596298218, + "learning_rate": 4.286215682313393e-06, + "loss": 0.6328, + "step": 5418 + }, + { + "epoch": 1.5011080332409972, + "grad_norm": 0.7120633125305176, + "learning_rate": 4.28596079164433e-06, + "loss": 0.6293, + "step": 5419 + }, + { + "epoch": 1.5013850415512464, + "grad_norm": 0.692536473274231, + "learning_rate": 4.285705863054759e-06, + "loss": 0.6305, + "step": 5420 + }, + { + "epoch": 1.5016620498614959, + "grad_norm": 0.685097873210907, + "learning_rate": 4.285450896550093e-06, + "loss": 0.6588, + "step": 5421 + }, + { + "epoch": 1.5019390581717451, + "grad_norm": 0.6913133263587952, + "learning_rate": 4.285195892135744e-06, + "loss": 0.6222, + "step": 5422 + }, + { + "epoch": 1.5022160664819943, + "grad_norm": 0.7574515342712402, + "learning_rate": 4.284940849817127e-06, + "loss": 0.6058, + "step": 5423 + }, + { + "epoch": 1.5024930747922438, + "grad_norm": 0.7188212871551514, + "learning_rate": 4.284685769599658e-06, + "loss": 0.6449, + "step": 5424 + }, + { + "epoch": 1.502770083102493, + "grad_norm": 0.7023669481277466, + "learning_rate": 4.284430651488753e-06, + "loss": 0.6061, + "step": 5425 + }, + { + "epoch": 1.5030470914127423, + "grad_norm": 0.6833453178405762, + "learning_rate": 4.2841754954898274e-06, + "loss": 0.6079, + "step": 5426 + }, + { + "epoch": 1.5033240997229917, + "grad_norm": 0.7508666515350342, + "learning_rate": 4.2839203016083e-06, + "loss": 0.6261, + "step": 5427 + }, + { + "epoch": 1.503601108033241, + "grad_norm": 0.6877349615097046, + "learning_rate": 4.283665069849589e-06, + "loss": 0.6221, + "step": 5428 + }, + { + "epoch": 1.5038781163434902, + "grad_norm": 0.6670334339141846, + "learning_rate": 4.283409800219114e-06, + "loss": 0.5992, + "step": 5429 + }, + { + "epoch": 1.5041551246537397, + "grad_norm": 0.7320896983146667, + "learning_rate": 4.283154492722294e-06, + "loss": 0.6989, + "step": 5430 + }, + { + "epoch": 1.504432132963989, + "grad_norm": 0.6807078719139099, + "learning_rate": 4.282899147364551e-06, + "loss": 0.6386, + "step": 5431 + }, + { + "epoch": 1.5047091412742382, + "grad_norm": 0.6722118854522705, + "learning_rate": 4.282643764151306e-06, + "loss": 0.5941, + "step": 5432 + }, + { + "epoch": 1.5049861495844876, + "grad_norm": 0.6991921663284302, + "learning_rate": 4.282388343087981e-06, + "loss": 0.6243, + "step": 5433 + }, + { + "epoch": 1.5052631578947369, + "grad_norm": 0.7761150002479553, + "learning_rate": 4.28213288418e-06, + "loss": 0.6775, + "step": 5434 + }, + { + "epoch": 1.505540166204986, + "grad_norm": 0.7118150591850281, + "learning_rate": 4.281877387432787e-06, + "loss": 0.6609, + "step": 5435 + }, + { + "epoch": 1.5058171745152356, + "grad_norm": 0.6934765577316284, + "learning_rate": 4.281621852851766e-06, + "loss": 0.6241, + "step": 5436 + }, + { + "epoch": 1.5060941828254848, + "grad_norm": 0.7331833839416504, + "learning_rate": 4.2813662804423645e-06, + "loss": 0.6768, + "step": 5437 + }, + { + "epoch": 1.506371191135734, + "grad_norm": 0.7009062767028809, + "learning_rate": 4.281110670210007e-06, + "loss": 0.6504, + "step": 5438 + }, + { + "epoch": 1.5066481994459835, + "grad_norm": 0.668209433555603, + "learning_rate": 4.280855022160121e-06, + "loss": 0.5986, + "step": 5439 + }, + { + "epoch": 1.5069252077562327, + "grad_norm": 0.6664574146270752, + "learning_rate": 4.2805993362981355e-06, + "loss": 0.5815, + "step": 5440 + }, + { + "epoch": 1.507202216066482, + "grad_norm": 0.6569491028785706, + "learning_rate": 4.28034361262948e-06, + "loss": 0.6364, + "step": 5441 + }, + { + "epoch": 1.5074792243767314, + "grad_norm": 0.6996865272521973, + "learning_rate": 4.280087851159582e-06, + "loss": 0.6371, + "step": 5442 + }, + { + "epoch": 1.5077562326869804, + "grad_norm": 0.7376276254653931, + "learning_rate": 4.279832051893873e-06, + "loss": 0.6692, + "step": 5443 + }, + { + "epoch": 1.50803324099723, + "grad_norm": 0.7286374568939209, + "learning_rate": 4.279576214837784e-06, + "loss": 0.6352, + "step": 5444 + }, + { + "epoch": 1.5083102493074794, + "grad_norm": 0.6960650682449341, + "learning_rate": 4.279320339996749e-06, + "loss": 0.5829, + "step": 5445 + }, + { + "epoch": 1.5085872576177284, + "grad_norm": 0.6842361688613892, + "learning_rate": 4.279064427376199e-06, + "loss": 0.6139, + "step": 5446 + }, + { + "epoch": 1.5088642659279778, + "grad_norm": 0.6859242916107178, + "learning_rate": 4.278808476981567e-06, + "loss": 0.6111, + "step": 5447 + }, + { + "epoch": 1.5091412742382273, + "grad_norm": 0.7144955992698669, + "learning_rate": 4.2785524888182885e-06, + "loss": 0.6538, + "step": 5448 + }, + { + "epoch": 1.5094182825484763, + "grad_norm": 0.7113151550292969, + "learning_rate": 4.278296462891798e-06, + "loss": 0.6355, + "step": 5449 + }, + { + "epoch": 1.5096952908587258, + "grad_norm": 0.7035449147224426, + "learning_rate": 4.278040399207533e-06, + "loss": 0.6406, + "step": 5450 + }, + { + "epoch": 1.509972299168975, + "grad_norm": 0.6973490118980408, + "learning_rate": 4.2777842977709305e-06, + "loss": 0.6132, + "step": 5451 + }, + { + "epoch": 1.5102493074792243, + "grad_norm": 0.7446228861808777, + "learning_rate": 4.2775281585874275e-06, + "loss": 0.6287, + "step": 5452 + }, + { + "epoch": 1.5105263157894737, + "grad_norm": 0.7096726894378662, + "learning_rate": 4.2772719816624615e-06, + "loss": 0.6358, + "step": 5453 + }, + { + "epoch": 1.510803324099723, + "grad_norm": 0.6858077049255371, + "learning_rate": 4.277015767001473e-06, + "loss": 0.6146, + "step": 5454 + }, + { + "epoch": 1.5110803324099722, + "grad_norm": 0.6863676905632019, + "learning_rate": 4.276759514609902e-06, + "loss": 0.6236, + "step": 5455 + }, + { + "epoch": 1.5113573407202217, + "grad_norm": 0.6914564967155457, + "learning_rate": 4.276503224493188e-06, + "loss": 0.6244, + "step": 5456 + }, + { + "epoch": 1.511634349030471, + "grad_norm": 0.6845644116401672, + "learning_rate": 4.276246896656775e-06, + "loss": 0.642, + "step": 5457 + }, + { + "epoch": 1.5119113573407201, + "grad_norm": 0.6796553134918213, + "learning_rate": 4.275990531106104e-06, + "loss": 0.648, + "step": 5458 + }, + { + "epoch": 1.5121883656509696, + "grad_norm": 0.6412548422813416, + "learning_rate": 4.275734127846619e-06, + "loss": 0.5683, + "step": 5459 + }, + { + "epoch": 1.5124653739612188, + "grad_norm": 0.725028932094574, + "learning_rate": 4.275477686883763e-06, + "loss": 0.6299, + "step": 5460 + }, + { + "epoch": 1.512742382271468, + "grad_norm": 0.7068825960159302, + "learning_rate": 4.275221208222982e-06, + "loss": 0.6453, + "step": 5461 + }, + { + "epoch": 1.5130193905817175, + "grad_norm": 0.7068927884101868, + "learning_rate": 4.274964691869722e-06, + "loss": 0.6541, + "step": 5462 + }, + { + "epoch": 1.5132963988919668, + "grad_norm": 0.7255887985229492, + "learning_rate": 4.2747081378294275e-06, + "loss": 0.6562, + "step": 5463 + }, + { + "epoch": 1.513573407202216, + "grad_norm": 0.7126734852790833, + "learning_rate": 4.274451546107548e-06, + "loss": 0.6605, + "step": 5464 + }, + { + "epoch": 1.5138504155124655, + "grad_norm": 0.6980812549591064, + "learning_rate": 4.274194916709531e-06, + "loss": 0.623, + "step": 5465 + }, + { + "epoch": 1.5141274238227147, + "grad_norm": 0.6746025681495667, + "learning_rate": 4.2739382496408245e-06, + "loss": 0.5671, + "step": 5466 + }, + { + "epoch": 1.514404432132964, + "grad_norm": 0.7552827000617981, + "learning_rate": 4.2736815449068794e-06, + "loss": 0.6663, + "step": 5467 + }, + { + "epoch": 1.5146814404432134, + "grad_norm": 0.704200267791748, + "learning_rate": 4.273424802513145e-06, + "loss": 0.6326, + "step": 5468 + }, + { + "epoch": 1.5149584487534626, + "grad_norm": 0.7429570555686951, + "learning_rate": 4.273168022465073e-06, + "loss": 0.6494, + "step": 5469 + }, + { + "epoch": 1.5152354570637119, + "grad_norm": 0.7143659591674805, + "learning_rate": 4.272911204768116e-06, + "loss": 0.6555, + "step": 5470 + }, + { + "epoch": 1.5155124653739613, + "grad_norm": 0.6867467164993286, + "learning_rate": 4.272654349427726e-06, + "loss": 0.6375, + "step": 5471 + }, + { + "epoch": 1.5157894736842106, + "grad_norm": 0.7136903405189514, + "learning_rate": 4.272397456449359e-06, + "loss": 0.6331, + "step": 5472 + }, + { + "epoch": 1.5160664819944598, + "grad_norm": 0.7227846384048462, + "learning_rate": 4.2721405258384666e-06, + "loss": 0.6297, + "step": 5473 + }, + { + "epoch": 1.5163434903047093, + "grad_norm": 0.717075526714325, + "learning_rate": 4.271883557600506e-06, + "loss": 0.6408, + "step": 5474 + }, + { + "epoch": 1.5166204986149583, + "grad_norm": 0.6881066560745239, + "learning_rate": 4.271626551740932e-06, + "loss": 0.6404, + "step": 5475 + }, + { + "epoch": 1.5168975069252078, + "grad_norm": 0.6838085651397705, + "learning_rate": 4.271369508265202e-06, + "loss": 0.6611, + "step": 5476 + }, + { + "epoch": 1.5171745152354572, + "grad_norm": 0.6700277924537659, + "learning_rate": 4.271112427178774e-06, + "loss": 0.6043, + "step": 5477 + }, + { + "epoch": 1.5174515235457062, + "grad_norm": 0.6892854571342468, + "learning_rate": 4.270855308487106e-06, + "loss": 0.6272, + "step": 5478 + }, + { + "epoch": 1.5177285318559557, + "grad_norm": 0.6893463730812073, + "learning_rate": 4.270598152195657e-06, + "loss": 0.6337, + "step": 5479 + }, + { + "epoch": 1.5180055401662051, + "grad_norm": 0.696556568145752, + "learning_rate": 4.270340958309889e-06, + "loss": 0.6119, + "step": 5480 + }, + { + "epoch": 1.5182825484764542, + "grad_norm": 0.6970028877258301, + "learning_rate": 4.2700837268352605e-06, + "loss": 0.6339, + "step": 5481 + }, + { + "epoch": 1.5185595567867036, + "grad_norm": 0.6837161779403687, + "learning_rate": 4.269826457777234e-06, + "loss": 0.6501, + "step": 5482 + }, + { + "epoch": 1.5188365650969529, + "grad_norm": 0.7038562297821045, + "learning_rate": 4.2695691511412726e-06, + "loss": 0.6385, + "step": 5483 + }, + { + "epoch": 1.519113573407202, + "grad_norm": 0.7245030403137207, + "learning_rate": 4.26931180693284e-06, + "loss": 0.6326, + "step": 5484 + }, + { + "epoch": 1.5193905817174516, + "grad_norm": 0.7005324959754944, + "learning_rate": 4.269054425157398e-06, + "loss": 0.6136, + "step": 5485 + }, + { + "epoch": 1.5196675900277008, + "grad_norm": 0.7006979584693909, + "learning_rate": 4.268797005820413e-06, + "loss": 0.5971, + "step": 5486 + }, + { + "epoch": 1.51994459833795, + "grad_norm": 0.7217789888381958, + "learning_rate": 4.268539548927351e-06, + "loss": 0.666, + "step": 5487 + }, + { + "epoch": 1.5202216066481995, + "grad_norm": 0.6616018414497375, + "learning_rate": 4.2682820544836785e-06, + "loss": 0.5997, + "step": 5488 + }, + { + "epoch": 1.5204986149584487, + "grad_norm": 0.6509618759155273, + "learning_rate": 4.2680245224948615e-06, + "loss": 0.6043, + "step": 5489 + }, + { + "epoch": 1.520775623268698, + "grad_norm": 0.7281323075294495, + "learning_rate": 4.267766952966369e-06, + "loss": 0.6311, + "step": 5490 + }, + { + "epoch": 1.5210526315789474, + "grad_norm": 0.6721056699752808, + "learning_rate": 4.2675093459036695e-06, + "loss": 0.6389, + "step": 5491 + }, + { + "epoch": 1.5213296398891967, + "grad_norm": 0.7493563890457153, + "learning_rate": 4.267251701312234e-06, + "loss": 0.5979, + "step": 5492 + }, + { + "epoch": 1.521606648199446, + "grad_norm": 0.6652003526687622, + "learning_rate": 4.266994019197531e-06, + "loss": 0.6315, + "step": 5493 + }, + { + "epoch": 1.5218836565096954, + "grad_norm": 0.713382363319397, + "learning_rate": 4.266736299565032e-06, + "loss": 0.6358, + "step": 5494 + }, + { + "epoch": 1.5221606648199446, + "grad_norm": 0.6911031007766724, + "learning_rate": 4.2664785424202105e-06, + "loss": 0.682, + "step": 5495 + }, + { + "epoch": 1.5224376731301938, + "grad_norm": 0.6979799270629883, + "learning_rate": 4.266220747768538e-06, + "loss": 0.6399, + "step": 5496 + }, + { + "epoch": 1.5227146814404433, + "grad_norm": 0.6786274313926697, + "learning_rate": 4.2659629156154894e-06, + "loss": 0.5986, + "step": 5497 + }, + { + "epoch": 1.5229916897506925, + "grad_norm": 0.6834037899971008, + "learning_rate": 4.2657050459665375e-06, + "loss": 0.6497, + "step": 5498 + }, + { + "epoch": 1.5232686980609418, + "grad_norm": 0.6988540887832642, + "learning_rate": 4.265447138827158e-06, + "loss": 0.6382, + "step": 5499 + }, + { + "epoch": 1.5235457063711912, + "grad_norm": 0.6756704449653625, + "learning_rate": 4.265189194202828e-06, + "loss": 0.62, + "step": 5500 + }, + { + "epoch": 1.5238227146814405, + "grad_norm": 0.6644415855407715, + "learning_rate": 4.264931212099023e-06, + "loss": 0.5966, + "step": 5501 + }, + { + "epoch": 1.5240997229916897, + "grad_norm": 0.704603910446167, + "learning_rate": 4.264673192521221e-06, + "loss": 0.6487, + "step": 5502 + }, + { + "epoch": 1.5243767313019392, + "grad_norm": 0.6660934090614319, + "learning_rate": 4.264415135474902e-06, + "loss": 0.6117, + "step": 5503 + }, + { + "epoch": 1.5246537396121884, + "grad_norm": 0.658829391002655, + "learning_rate": 4.264157040965543e-06, + "loss": 0.6567, + "step": 5504 + }, + { + "epoch": 1.5249307479224377, + "grad_norm": 0.738782525062561, + "learning_rate": 4.2638989089986245e-06, + "loss": 0.6261, + "step": 5505 + }, + { + "epoch": 1.5252077562326871, + "grad_norm": 0.7513110041618347, + "learning_rate": 4.263640739579629e-06, + "loss": 0.6354, + "step": 5506 + }, + { + "epoch": 1.5254847645429361, + "grad_norm": 0.6777371168136597, + "learning_rate": 4.2633825327140355e-06, + "loss": 0.6373, + "step": 5507 + }, + { + "epoch": 1.5257617728531856, + "grad_norm": 0.6883520483970642, + "learning_rate": 4.263124288407328e-06, + "loss": 0.628, + "step": 5508 + }, + { + "epoch": 1.526038781163435, + "grad_norm": 0.7177495956420898, + "learning_rate": 4.262866006664989e-06, + "loss": 0.6231, + "step": 5509 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.7112988233566284, + "learning_rate": 4.262607687492503e-06, + "loss": 0.6436, + "step": 5510 + }, + { + "epoch": 1.5265927977839335, + "grad_norm": 0.6998488306999207, + "learning_rate": 4.262349330895354e-06, + "loss": 0.5978, + "step": 5511 + }, + { + "epoch": 1.526869806094183, + "grad_norm": 0.7161896824836731, + "learning_rate": 4.26209093687903e-06, + "loss": 0.6391, + "step": 5512 + }, + { + "epoch": 1.527146814404432, + "grad_norm": 0.6859282851219177, + "learning_rate": 4.261832505449014e-06, + "loss": 0.6053, + "step": 5513 + }, + { + "epoch": 1.5274238227146815, + "grad_norm": 0.6807940006256104, + "learning_rate": 4.261574036610795e-06, + "loss": 0.6584, + "step": 5514 + }, + { + "epoch": 1.5277008310249307, + "grad_norm": 0.710959255695343, + "learning_rate": 4.261315530369861e-06, + "loss": 0.6322, + "step": 5515 + }, + { + "epoch": 1.52797783933518, + "grad_norm": 0.7400763034820557, + "learning_rate": 4.2610569867316994e-06, + "loss": 0.6589, + "step": 5516 + }, + { + "epoch": 1.5282548476454294, + "grad_norm": 0.7326130867004395, + "learning_rate": 4.260798405701802e-06, + "loss": 0.6851, + "step": 5517 + }, + { + "epoch": 1.5285318559556786, + "grad_norm": 0.6840008497238159, + "learning_rate": 4.2605397872856565e-06, + "loss": 0.6138, + "step": 5518 + }, + { + "epoch": 1.5288088642659279, + "grad_norm": 0.6635662317276001, + "learning_rate": 4.260281131488757e-06, + "loss": 0.6023, + "step": 5519 + }, + { + "epoch": 1.5290858725761773, + "grad_norm": 0.706579864025116, + "learning_rate": 4.260022438316593e-06, + "loss": 0.6307, + "step": 5520 + }, + { + "epoch": 1.5293628808864266, + "grad_norm": 0.7006165981292725, + "learning_rate": 4.259763707774658e-06, + "loss": 0.6226, + "step": 5521 + }, + { + "epoch": 1.5296398891966758, + "grad_norm": 0.6810389757156372, + "learning_rate": 4.259504939868445e-06, + "loss": 0.6825, + "step": 5522 + }, + { + "epoch": 1.5299168975069253, + "grad_norm": 0.6524600982666016, + "learning_rate": 4.25924613460345e-06, + "loss": 0.598, + "step": 5523 + }, + { + "epoch": 1.5301939058171745, + "grad_norm": 0.714477002620697, + "learning_rate": 4.258987291985166e-06, + "loss": 0.6285, + "step": 5524 + }, + { + "epoch": 1.5304709141274238, + "grad_norm": 0.6839551329612732, + "learning_rate": 4.258728412019089e-06, + "loss": 0.5905, + "step": 5525 + }, + { + "epoch": 1.5307479224376732, + "grad_norm": 0.695607602596283, + "learning_rate": 4.258469494710719e-06, + "loss": 0.5707, + "step": 5526 + }, + { + "epoch": 1.5310249307479225, + "grad_norm": 0.7326964139938354, + "learning_rate": 4.258210540065549e-06, + "loss": 0.6337, + "step": 5527 + }, + { + "epoch": 1.5313019390581717, + "grad_norm": 0.7270690202713013, + "learning_rate": 4.2579515480890805e-06, + "loss": 0.6443, + "step": 5528 + }, + { + "epoch": 1.5315789473684212, + "grad_norm": 0.7199828028678894, + "learning_rate": 4.25769251878681e-06, + "loss": 0.6347, + "step": 5529 + }, + { + "epoch": 1.5318559556786704, + "grad_norm": 0.7613186240196228, + "learning_rate": 4.2574334521642404e-06, + "loss": 0.6838, + "step": 5530 + }, + { + "epoch": 1.5321329639889196, + "grad_norm": 0.7245945930480957, + "learning_rate": 4.257174348226869e-06, + "loss": 0.644, + "step": 5531 + }, + { + "epoch": 1.532409972299169, + "grad_norm": 0.7069911360740662, + "learning_rate": 4.256915206980199e-06, + "loss": 0.6928, + "step": 5532 + }, + { + "epoch": 1.5326869806094183, + "grad_norm": 0.7057062387466431, + "learning_rate": 4.256656028429733e-06, + "loss": 0.628, + "step": 5533 + }, + { + "epoch": 1.5329639889196676, + "grad_norm": 0.7141075134277344, + "learning_rate": 4.2563968125809734e-06, + "loss": 0.6596, + "step": 5534 + }, + { + "epoch": 1.533240997229917, + "grad_norm": 0.6988198757171631, + "learning_rate": 4.256137559439424e-06, + "loss": 0.6343, + "step": 5535 + }, + { + "epoch": 1.533518005540166, + "grad_norm": 0.6692717671394348, + "learning_rate": 4.25587826901059e-06, + "loss": 0.6166, + "step": 5536 + }, + { + "epoch": 1.5337950138504155, + "grad_norm": 0.6966196894645691, + "learning_rate": 4.255618941299976e-06, + "loss": 0.607, + "step": 5537 + }, + { + "epoch": 1.534072022160665, + "grad_norm": 0.6670767664909363, + "learning_rate": 4.255359576313089e-06, + "loss": 0.5969, + "step": 5538 + }, + { + "epoch": 1.534349030470914, + "grad_norm": 0.6590955257415771, + "learning_rate": 4.255100174055434e-06, + "loss": 0.6277, + "step": 5539 + }, + { + "epoch": 1.5346260387811634, + "grad_norm": 0.6713624596595764, + "learning_rate": 4.254840734532521e-06, + "loss": 0.6466, + "step": 5540 + }, + { + "epoch": 1.534903047091413, + "grad_norm": 0.8212193250656128, + "learning_rate": 4.254581257749859e-06, + "loss": 0.6556, + "step": 5541 + }, + { + "epoch": 1.535180055401662, + "grad_norm": 0.7253426313400269, + "learning_rate": 4.254321743712955e-06, + "loss": 0.6175, + "step": 5542 + }, + { + "epoch": 1.5354570637119114, + "grad_norm": 0.6754900813102722, + "learning_rate": 4.254062192427321e-06, + "loss": 0.6497, + "step": 5543 + }, + { + "epoch": 1.5357340720221606, + "grad_norm": 0.6963987350463867, + "learning_rate": 4.2538026038984665e-06, + "loss": 0.6163, + "step": 5544 + }, + { + "epoch": 1.5360110803324099, + "grad_norm": 0.7260979413986206, + "learning_rate": 4.253542978131904e-06, + "loss": 0.6287, + "step": 5545 + }, + { + "epoch": 1.5362880886426593, + "grad_norm": 0.7109147906303406, + "learning_rate": 4.253283315133146e-06, + "loss": 0.658, + "step": 5546 + }, + { + "epoch": 1.5365650969529085, + "grad_norm": 0.7043456435203552, + "learning_rate": 4.253023614907707e-06, + "loss": 0.6204, + "step": 5547 + }, + { + "epoch": 1.5368421052631578, + "grad_norm": 0.6898895502090454, + "learning_rate": 4.252763877461099e-06, + "loss": 0.6093, + "step": 5548 + }, + { + "epoch": 1.5371191135734072, + "grad_norm": 0.7038882374763489, + "learning_rate": 4.252504102798838e-06, + "loss": 0.6297, + "step": 5549 + }, + { + "epoch": 1.5373961218836565, + "grad_norm": 0.7062937021255493, + "learning_rate": 4.25224429092644e-06, + "loss": 0.6274, + "step": 5550 + }, + { + "epoch": 1.5376731301939057, + "grad_norm": 0.7120965719223022, + "learning_rate": 4.25198444184942e-06, + "loss": 0.6055, + "step": 5551 + }, + { + "epoch": 1.5379501385041552, + "grad_norm": 0.7174949645996094, + "learning_rate": 4.251724555573297e-06, + "loss": 0.6823, + "step": 5552 + }, + { + "epoch": 1.5382271468144044, + "grad_norm": 0.7251701354980469, + "learning_rate": 4.251464632103587e-06, + "loss": 0.6277, + "step": 5553 + }, + { + "epoch": 1.5385041551246537, + "grad_norm": 0.7451249361038208, + "learning_rate": 4.251204671445811e-06, + "loss": 0.6663, + "step": 5554 + }, + { + "epoch": 1.5387811634349031, + "grad_norm": 0.72591632604599, + "learning_rate": 4.250944673605488e-06, + "loss": 0.6495, + "step": 5555 + }, + { + "epoch": 1.5390581717451524, + "grad_norm": 0.6595091223716736, + "learning_rate": 4.2506846385881375e-06, + "loss": 0.6072, + "step": 5556 + }, + { + "epoch": 1.5393351800554016, + "grad_norm": 0.705683171749115, + "learning_rate": 4.250424566399282e-06, + "loss": 0.6637, + "step": 5557 + }, + { + "epoch": 1.539612188365651, + "grad_norm": 0.6721247434616089, + "learning_rate": 4.250164457044443e-06, + "loss": 0.6715, + "step": 5558 + }, + { + "epoch": 1.5398891966759003, + "grad_norm": 0.6472792029380798, + "learning_rate": 4.249904310529143e-06, + "loss": 0.6056, + "step": 5559 + }, + { + "epoch": 1.5401662049861495, + "grad_norm": 0.7078953385353088, + "learning_rate": 4.249644126858905e-06, + "loss": 0.6375, + "step": 5560 + }, + { + "epoch": 1.540443213296399, + "grad_norm": 0.6639420986175537, + "learning_rate": 4.249383906039254e-06, + "loss": 0.6428, + "step": 5561 + }, + { + "epoch": 1.5407202216066482, + "grad_norm": 0.7171564698219299, + "learning_rate": 4.249123648075716e-06, + "loss": 0.5995, + "step": 5562 + }, + { + "epoch": 1.5409972299168975, + "grad_norm": 0.7072045803070068, + "learning_rate": 4.248863352973817e-06, + "loss": 0.634, + "step": 5563 + }, + { + "epoch": 1.541274238227147, + "grad_norm": 0.6976851224899292, + "learning_rate": 4.248603020739081e-06, + "loss": 0.6055, + "step": 5564 + }, + { + "epoch": 1.5415512465373962, + "grad_norm": 0.6449586153030396, + "learning_rate": 4.248342651377038e-06, + "loss": 0.6068, + "step": 5565 + }, + { + "epoch": 1.5418282548476454, + "grad_norm": 0.7084159851074219, + "learning_rate": 4.248082244893216e-06, + "loss": 0.6633, + "step": 5566 + }, + { + "epoch": 1.5421052631578949, + "grad_norm": 0.6994836926460266, + "learning_rate": 4.247821801293144e-06, + "loss": 0.613, + "step": 5567 + }, + { + "epoch": 1.5423822714681439, + "grad_norm": 0.7218542695045471, + "learning_rate": 4.247561320582352e-06, + "loss": 0.6203, + "step": 5568 + }, + { + "epoch": 1.5426592797783933, + "grad_norm": 0.6681088805198669, + "learning_rate": 4.24730080276637e-06, + "loss": 0.6178, + "step": 5569 + }, + { + "epoch": 1.5429362880886428, + "grad_norm": 0.6878266334533691, + "learning_rate": 4.247040247850729e-06, + "loss": 0.615, + "step": 5570 + }, + { + "epoch": 1.5432132963988918, + "grad_norm": 0.6583474278450012, + "learning_rate": 4.246779655840963e-06, + "loss": 0.5656, + "step": 5571 + }, + { + "epoch": 1.5434903047091413, + "grad_norm": 0.6586458086967468, + "learning_rate": 4.246519026742604e-06, + "loss": 0.5824, + "step": 5572 + }, + { + "epoch": 1.5437673130193907, + "grad_norm": 0.7186761498451233, + "learning_rate": 4.2462583605611865e-06, + "loss": 0.6897, + "step": 5573 + }, + { + "epoch": 1.5440443213296398, + "grad_norm": 0.6883975267410278, + "learning_rate": 4.245997657302243e-06, + "loss": 0.5985, + "step": 5574 + }, + { + "epoch": 1.5443213296398892, + "grad_norm": 0.7059137225151062, + "learning_rate": 4.245736916971312e-06, + "loss": 0.6583, + "step": 5575 + }, + { + "epoch": 1.5445983379501385, + "grad_norm": 0.7427789568901062, + "learning_rate": 4.245476139573928e-06, + "loss": 0.6418, + "step": 5576 + }, + { + "epoch": 1.5448753462603877, + "grad_norm": 0.708476185798645, + "learning_rate": 4.245215325115627e-06, + "loss": 0.5888, + "step": 5577 + }, + { + "epoch": 1.5451523545706372, + "grad_norm": 0.6729523539543152, + "learning_rate": 4.2449544736019486e-06, + "loss": 0.649, + "step": 5578 + }, + { + "epoch": 1.5454293628808864, + "grad_norm": 0.6773388385772705, + "learning_rate": 4.24469358503843e-06, + "loss": 0.6112, + "step": 5579 + }, + { + "epoch": 1.5457063711911356, + "grad_norm": 0.7275780439376831, + "learning_rate": 4.244432659430612e-06, + "loss": 0.6699, + "step": 5580 + }, + { + "epoch": 1.545983379501385, + "grad_norm": 0.7121980786323547, + "learning_rate": 4.244171696784033e-06, + "loss": 0.6851, + "step": 5581 + }, + { + "epoch": 1.5462603878116343, + "grad_norm": 0.6924628019332886, + "learning_rate": 4.243910697104235e-06, + "loss": 0.6616, + "step": 5582 + }, + { + "epoch": 1.5465373961218836, + "grad_norm": 0.6828587651252747, + "learning_rate": 4.24364966039676e-06, + "loss": 0.6252, + "step": 5583 + }, + { + "epoch": 1.546814404432133, + "grad_norm": 0.688407301902771, + "learning_rate": 4.24338858666715e-06, + "loss": 0.6092, + "step": 5584 + }, + { + "epoch": 1.5470914127423823, + "grad_norm": 0.722294807434082, + "learning_rate": 4.2431274759209475e-06, + "loss": 0.599, + "step": 5585 + }, + { + "epoch": 1.5473684210526315, + "grad_norm": 0.6793596744537354, + "learning_rate": 4.242866328163697e-06, + "loss": 0.5869, + "step": 5586 + }, + { + "epoch": 1.547645429362881, + "grad_norm": 0.6670454740524292, + "learning_rate": 4.242605143400944e-06, + "loss": 0.6207, + "step": 5587 + }, + { + "epoch": 1.5479224376731302, + "grad_norm": 0.7123885154724121, + "learning_rate": 4.242343921638235e-06, + "loss": 0.6038, + "step": 5588 + }, + { + "epoch": 1.5481994459833794, + "grad_norm": 0.7021961212158203, + "learning_rate": 4.242082662881113e-06, + "loss": 0.6067, + "step": 5589 + }, + { + "epoch": 1.548476454293629, + "grad_norm": 0.6837357878684998, + "learning_rate": 4.241821367135128e-06, + "loss": 0.6044, + "step": 5590 + }, + { + "epoch": 1.5487534626038781, + "grad_norm": 0.6798022389411926, + "learning_rate": 4.241560034405827e-06, + "loss": 0.6291, + "step": 5591 + }, + { + "epoch": 1.5490304709141274, + "grad_norm": 0.7640137076377869, + "learning_rate": 4.24129866469876e-06, + "loss": 0.607, + "step": 5592 + }, + { + "epoch": 1.5493074792243768, + "grad_norm": 0.6815189123153687, + "learning_rate": 4.241037258019474e-06, + "loss": 0.6259, + "step": 5593 + }, + { + "epoch": 1.549584487534626, + "grad_norm": 0.6690675616264343, + "learning_rate": 4.240775814373522e-06, + "loss": 0.6311, + "step": 5594 + }, + { + "epoch": 1.5498614958448753, + "grad_norm": 0.6691811680793762, + "learning_rate": 4.2405143337664535e-06, + "loss": 0.5653, + "step": 5595 + }, + { + "epoch": 1.5501385041551248, + "grad_norm": 0.6947237849235535, + "learning_rate": 4.240252816203821e-06, + "loss": 0.6133, + "step": 5596 + }, + { + "epoch": 1.550415512465374, + "grad_norm": 0.7483521699905396, + "learning_rate": 4.239991261691178e-06, + "loss": 0.5988, + "step": 5597 + }, + { + "epoch": 1.5506925207756233, + "grad_norm": 0.6979661583900452, + "learning_rate": 4.2397296702340755e-06, + "loss": 0.5872, + "step": 5598 + }, + { + "epoch": 1.5509695290858727, + "grad_norm": 0.7207865118980408, + "learning_rate": 4.239468041838071e-06, + "loss": 0.5924, + "step": 5599 + }, + { + "epoch": 1.5512465373961217, + "grad_norm": 0.7086422443389893, + "learning_rate": 4.239206376508716e-06, + "loss": 0.6513, + "step": 5600 + }, + { + "epoch": 1.5515235457063712, + "grad_norm": 0.688262403011322, + "learning_rate": 4.238944674251569e-06, + "loss": 0.6381, + "step": 5601 + }, + { + "epoch": 1.5518005540166206, + "grad_norm": 0.6768400073051453, + "learning_rate": 4.238682935072186e-06, + "loss": 0.6527, + "step": 5602 + }, + { + "epoch": 1.5520775623268697, + "grad_norm": 0.7429431676864624, + "learning_rate": 4.238421158976124e-06, + "loss": 0.6481, + "step": 5603 + }, + { + "epoch": 1.5523545706371191, + "grad_norm": 0.6837465167045593, + "learning_rate": 4.238159345968942e-06, + "loss": 0.6411, + "step": 5604 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.6943199634552002, + "learning_rate": 4.237897496056198e-06, + "loss": 0.6303, + "step": 5605 + }, + { + "epoch": 1.5529085872576176, + "grad_norm": 0.7053850889205933, + "learning_rate": 4.2376356092434514e-06, + "loss": 0.6542, + "step": 5606 + }, + { + "epoch": 1.553185595567867, + "grad_norm": 0.6722522377967834, + "learning_rate": 4.237373685536264e-06, + "loss": 0.6155, + "step": 5607 + }, + { + "epoch": 1.5534626038781163, + "grad_norm": 0.6928883790969849, + "learning_rate": 4.237111724940196e-06, + "loss": 0.6355, + "step": 5608 + }, + { + "epoch": 1.5537396121883655, + "grad_norm": 0.7059097290039062, + "learning_rate": 4.23684972746081e-06, + "loss": 0.6008, + "step": 5609 + }, + { + "epoch": 1.554016620498615, + "grad_norm": 0.6975715160369873, + "learning_rate": 4.23658769310367e-06, + "loss": 0.616, + "step": 5610 + }, + { + "epoch": 1.5542936288088642, + "grad_norm": 0.6692479252815247, + "learning_rate": 4.236325621874337e-06, + "loss": 0.626, + "step": 5611 + }, + { + "epoch": 1.5545706371191135, + "grad_norm": 0.6283546686172485, + "learning_rate": 4.236063513778378e-06, + "loss": 0.6119, + "step": 5612 + }, + { + "epoch": 1.554847645429363, + "grad_norm": 0.678083062171936, + "learning_rate": 4.235801368821356e-06, + "loss": 0.6154, + "step": 5613 + }, + { + "epoch": 1.5551246537396122, + "grad_norm": 0.7457267045974731, + "learning_rate": 4.23553918700884e-06, + "loss": 0.6322, + "step": 5614 + }, + { + "epoch": 1.5554016620498614, + "grad_norm": 0.721976637840271, + "learning_rate": 4.2352769683463935e-06, + "loss": 0.6533, + "step": 5615 + }, + { + "epoch": 1.5556786703601109, + "grad_norm": 0.655988872051239, + "learning_rate": 4.235014712839586e-06, + "loss": 0.6062, + "step": 5616 + }, + { + "epoch": 1.55595567867036, + "grad_norm": 0.6747307777404785, + "learning_rate": 4.234752420493985e-06, + "loss": 0.599, + "step": 5617 + }, + { + "epoch": 1.5562326869806093, + "grad_norm": 0.7203627824783325, + "learning_rate": 4.234490091315161e-06, + "loss": 0.6121, + "step": 5618 + }, + { + "epoch": 1.5565096952908588, + "grad_norm": 0.7498337030410767, + "learning_rate": 4.234227725308682e-06, + "loss": 0.6466, + "step": 5619 + }, + { + "epoch": 1.556786703601108, + "grad_norm": 0.6924552321434021, + "learning_rate": 4.23396532248012e-06, + "loss": 0.6343, + "step": 5620 + }, + { + "epoch": 1.5570637119113573, + "grad_norm": 0.7387464046478271, + "learning_rate": 4.2337028828350465e-06, + "loss": 0.6427, + "step": 5621 + }, + { + "epoch": 1.5573407202216067, + "grad_norm": 0.6742679476737976, + "learning_rate": 4.233440406379032e-06, + "loss": 0.6186, + "step": 5622 + }, + { + "epoch": 1.557617728531856, + "grad_norm": 0.6853709816932678, + "learning_rate": 4.233177893117653e-06, + "loss": 0.6418, + "step": 5623 + }, + { + "epoch": 1.5578947368421052, + "grad_norm": 0.709339439868927, + "learning_rate": 4.23291534305648e-06, + "loss": 0.6201, + "step": 5624 + }, + { + "epoch": 1.5581717451523547, + "grad_norm": 0.7180233001708984, + "learning_rate": 4.232652756201089e-06, + "loss": 0.6691, + "step": 5625 + }, + { + "epoch": 1.558448753462604, + "grad_norm": 0.7282020449638367, + "learning_rate": 4.232390132557056e-06, + "loss": 0.6479, + "step": 5626 + }, + { + "epoch": 1.5587257617728532, + "grad_norm": 0.7290409207344055, + "learning_rate": 4.232127472129955e-06, + "loss": 0.6528, + "step": 5627 + }, + { + "epoch": 1.5590027700831026, + "grad_norm": 0.7034227252006531, + "learning_rate": 4.231864774925366e-06, + "loss": 0.6252, + "step": 5628 + }, + { + "epoch": 1.5592797783933516, + "grad_norm": 0.7223827838897705, + "learning_rate": 4.231602040948864e-06, + "loss": 0.6817, + "step": 5629 + }, + { + "epoch": 1.559556786703601, + "grad_norm": 0.6938064694404602, + "learning_rate": 4.23133927020603e-06, + "loss": 0.6437, + "step": 5630 + }, + { + "epoch": 1.5598337950138506, + "grad_norm": 0.7022026777267456, + "learning_rate": 4.231076462702441e-06, + "loss": 0.6048, + "step": 5631 + }, + { + "epoch": 1.5601108033240996, + "grad_norm": 0.6670244932174683, + "learning_rate": 4.230813618443678e-06, + "loss": 0.6502, + "step": 5632 + }, + { + "epoch": 1.560387811634349, + "grad_norm": 0.6831268668174744, + "learning_rate": 4.230550737435321e-06, + "loss": 0.6364, + "step": 5633 + }, + { + "epoch": 1.5606648199445985, + "grad_norm": 0.6734561324119568, + "learning_rate": 4.230287819682953e-06, + "loss": 0.6567, + "step": 5634 + }, + { + "epoch": 1.5609418282548475, + "grad_norm": 0.69823157787323, + "learning_rate": 4.230024865192156e-06, + "loss": 0.6737, + "step": 5635 + }, + { + "epoch": 1.561218836565097, + "grad_norm": 0.6804174184799194, + "learning_rate": 4.229761873968514e-06, + "loss": 0.6264, + "step": 5636 + }, + { + "epoch": 1.5614958448753462, + "grad_norm": 0.6687811613082886, + "learning_rate": 4.22949884601761e-06, + "loss": 0.6173, + "step": 5637 + }, + { + "epoch": 1.5617728531855954, + "grad_norm": 0.6959174871444702, + "learning_rate": 4.229235781345028e-06, + "loss": 0.6483, + "step": 5638 + }, + { + "epoch": 1.562049861495845, + "grad_norm": 0.7006558179855347, + "learning_rate": 4.228972679956355e-06, + "loss": 0.6643, + "step": 5639 + }, + { + "epoch": 1.5623268698060941, + "grad_norm": 0.6768726706504822, + "learning_rate": 4.228709541857176e-06, + "loss": 0.6393, + "step": 5640 + }, + { + "epoch": 1.5626038781163434, + "grad_norm": 0.6525865793228149, + "learning_rate": 4.2284463670530795e-06, + "loss": 0.6102, + "step": 5641 + }, + { + "epoch": 1.5628808864265928, + "grad_norm": 0.6904488801956177, + "learning_rate": 4.228183155549652e-06, + "loss": 0.6141, + "step": 5642 + }, + { + "epoch": 1.563157894736842, + "grad_norm": 0.6868029832839966, + "learning_rate": 4.2279199073524836e-06, + "loss": 0.6719, + "step": 5643 + }, + { + "epoch": 1.5634349030470913, + "grad_norm": 0.6973175406455994, + "learning_rate": 4.227656622467162e-06, + "loss": 0.6318, + "step": 5644 + }, + { + "epoch": 1.5637119113573408, + "grad_norm": 0.7145452499389648, + "learning_rate": 4.227393300899279e-06, + "loss": 0.6533, + "step": 5645 + }, + { + "epoch": 1.56398891966759, + "grad_norm": 0.6662142276763916, + "learning_rate": 4.2271299426544246e-06, + "loss": 0.6255, + "step": 5646 + }, + { + "epoch": 1.5642659279778393, + "grad_norm": 0.6924449801445007, + "learning_rate": 4.226866547738191e-06, + "loss": 0.6635, + "step": 5647 + }, + { + "epoch": 1.5645429362880887, + "grad_norm": 0.7409833073616028, + "learning_rate": 4.22660311615617e-06, + "loss": 0.6129, + "step": 5648 + }, + { + "epoch": 1.564819944598338, + "grad_norm": 0.7077249884605408, + "learning_rate": 4.226339647913957e-06, + "loss": 0.6303, + "step": 5649 + }, + { + "epoch": 1.5650969529085872, + "grad_norm": 0.7363086938858032, + "learning_rate": 4.2260761430171436e-06, + "loss": 0.6602, + "step": 5650 + }, + { + "epoch": 1.5653739612188367, + "grad_norm": 0.6939164996147156, + "learning_rate": 4.225812601471326e-06, + "loss": 0.6097, + "step": 5651 + }, + { + "epoch": 1.565650969529086, + "grad_norm": 0.6998023390769958, + "learning_rate": 4.2255490232821e-06, + "loss": 0.6487, + "step": 5652 + }, + { + "epoch": 1.5659279778393351, + "grad_norm": 0.6921630501747131, + "learning_rate": 4.225285408455062e-06, + "loss": 0.6318, + "step": 5653 + }, + { + "epoch": 1.5662049861495846, + "grad_norm": 0.6838632822036743, + "learning_rate": 4.225021756995808e-06, + "loss": 0.604, + "step": 5654 + }, + { + "epoch": 1.5664819944598338, + "grad_norm": 0.687044620513916, + "learning_rate": 4.2247580689099376e-06, + "loss": 0.6232, + "step": 5655 + }, + { + "epoch": 1.566759002770083, + "grad_norm": 0.7102434635162354, + "learning_rate": 4.224494344203048e-06, + "loss": 0.6399, + "step": 5656 + }, + { + "epoch": 1.5670360110803325, + "grad_norm": 0.7067602276802063, + "learning_rate": 4.224230582880741e-06, + "loss": 0.6927, + "step": 5657 + }, + { + "epoch": 1.5673130193905818, + "grad_norm": 0.7272063493728638, + "learning_rate": 4.223966784948614e-06, + "loss": 0.6373, + "step": 5658 + }, + { + "epoch": 1.567590027700831, + "grad_norm": 0.6936042904853821, + "learning_rate": 4.22370295041227e-06, + "loss": 0.645, + "step": 5659 + }, + { + "epoch": 1.5678670360110805, + "grad_norm": 0.6833428740501404, + "learning_rate": 4.22343907927731e-06, + "loss": 0.5977, + "step": 5660 + }, + { + "epoch": 1.5681440443213295, + "grad_norm": 0.6883816123008728, + "learning_rate": 4.2231751715493385e-06, + "loss": 0.645, + "step": 5661 + }, + { + "epoch": 1.568421052631579, + "grad_norm": 0.6898177862167358, + "learning_rate": 4.2229112272339564e-06, + "loss": 0.5879, + "step": 5662 + }, + { + "epoch": 1.5686980609418284, + "grad_norm": 0.7070897817611694, + "learning_rate": 4.22264724633677e-06, + "loss": 0.6639, + "step": 5663 + }, + { + "epoch": 1.5689750692520774, + "grad_norm": 0.7003639936447144, + "learning_rate": 4.222383228863383e-06, + "loss": 0.6656, + "step": 5664 + }, + { + "epoch": 1.5692520775623269, + "grad_norm": 0.7034210562705994, + "learning_rate": 4.222119174819402e-06, + "loss": 0.621, + "step": 5665 + }, + { + "epoch": 1.5695290858725763, + "grad_norm": 0.6807825565338135, + "learning_rate": 4.221855084210433e-06, + "loss": 0.6402, + "step": 5666 + }, + { + "epoch": 1.5698060941828254, + "grad_norm": 0.7109666466712952, + "learning_rate": 4.221590957042083e-06, + "loss": 0.651, + "step": 5667 + }, + { + "epoch": 1.5700831024930748, + "grad_norm": 0.6760213375091553, + "learning_rate": 4.221326793319961e-06, + "loss": 0.5901, + "step": 5668 + }, + { + "epoch": 1.570360110803324, + "grad_norm": 0.6880183815956116, + "learning_rate": 4.2210625930496745e-06, + "loss": 0.6271, + "step": 5669 + }, + { + "epoch": 1.5706371191135733, + "grad_norm": 0.6721237301826477, + "learning_rate": 4.2207983562368345e-06, + "loss": 0.6651, + "step": 5670 + }, + { + "epoch": 1.5709141274238227, + "grad_norm": 0.6518550515174866, + "learning_rate": 4.220534082887051e-06, + "loss": 0.6267, + "step": 5671 + }, + { + "epoch": 1.571191135734072, + "grad_norm": 0.7003772854804993, + "learning_rate": 4.220269773005936e-06, + "loss": 0.6366, + "step": 5672 + }, + { + "epoch": 1.5714681440443212, + "grad_norm": 0.6701993346214294, + "learning_rate": 4.220005426599099e-06, + "loss": 0.6311, + "step": 5673 + }, + { + "epoch": 1.5717451523545707, + "grad_norm": 0.7315934896469116, + "learning_rate": 4.219741043672156e-06, + "loss": 0.6511, + "step": 5674 + }, + { + "epoch": 1.57202216066482, + "grad_norm": 0.6972675323486328, + "learning_rate": 4.219476624230717e-06, + "loss": 0.6386, + "step": 5675 + }, + { + "epoch": 1.5722991689750692, + "grad_norm": 0.6991989612579346, + "learning_rate": 4.2192121682803995e-06, + "loss": 0.6406, + "step": 5676 + }, + { + "epoch": 1.5725761772853186, + "grad_norm": 0.7138469219207764, + "learning_rate": 4.2189476758268174e-06, + "loss": 0.6512, + "step": 5677 + }, + { + "epoch": 1.5728531855955679, + "grad_norm": 0.7503437995910645, + "learning_rate": 4.218683146875586e-06, + "loss": 0.5921, + "step": 5678 + }, + { + "epoch": 1.573130193905817, + "grad_norm": 0.6732651591300964, + "learning_rate": 4.218418581432323e-06, + "loss": 0.5849, + "step": 5679 + }, + { + "epoch": 1.5734072022160666, + "grad_norm": 0.710514485836029, + "learning_rate": 4.2181539795026435e-06, + "loss": 0.6108, + "step": 5680 + }, + { + "epoch": 1.5736842105263158, + "grad_norm": 0.6641442179679871, + "learning_rate": 4.217889341092169e-06, + "loss": 0.6162, + "step": 5681 + }, + { + "epoch": 1.573961218836565, + "grad_norm": 0.6942542791366577, + "learning_rate": 4.217624666206516e-06, + "loss": 0.6349, + "step": 5682 + }, + { + "epoch": 1.5742382271468145, + "grad_norm": 0.7054006457328796, + "learning_rate": 4.217359954851305e-06, + "loss": 0.6625, + "step": 5683 + }, + { + "epoch": 1.5745152354570637, + "grad_norm": 0.7004750967025757, + "learning_rate": 4.217095207032157e-06, + "loss": 0.6717, + "step": 5684 + }, + { + "epoch": 1.574792243767313, + "grad_norm": 0.7107009887695312, + "learning_rate": 4.216830422754693e-06, + "loss": 0.6459, + "step": 5685 + }, + { + "epoch": 1.5750692520775624, + "grad_norm": 0.6618795394897461, + "learning_rate": 4.216565602024533e-06, + "loss": 0.6437, + "step": 5686 + }, + { + "epoch": 1.5753462603878117, + "grad_norm": 0.6664261817932129, + "learning_rate": 4.216300744847304e-06, + "loss": 0.6536, + "step": 5687 + }, + { + "epoch": 1.575623268698061, + "grad_norm": 0.70806884765625, + "learning_rate": 4.2160358512286266e-06, + "loss": 0.6301, + "step": 5688 + }, + { + "epoch": 1.5759002770083104, + "grad_norm": 0.697137176990509, + "learning_rate": 4.215770921174125e-06, + "loss": 0.6338, + "step": 5689 + }, + { + "epoch": 1.5761772853185596, + "grad_norm": 0.6988281607627869, + "learning_rate": 4.2155059546894254e-06, + "loss": 0.6173, + "step": 5690 + }, + { + "epoch": 1.5764542936288088, + "grad_norm": 0.7043812274932861, + "learning_rate": 4.215240951780154e-06, + "loss": 0.61, + "step": 5691 + }, + { + "epoch": 1.5767313019390583, + "grad_norm": 0.7179585695266724, + "learning_rate": 4.214975912451936e-06, + "loss": 0.6334, + "step": 5692 + }, + { + "epoch": 1.5770083102493073, + "grad_norm": 0.672431230545044, + "learning_rate": 4.214710836710401e-06, + "loss": 0.6086, + "step": 5693 + }, + { + "epoch": 1.5772853185595568, + "grad_norm": 0.7191444039344788, + "learning_rate": 4.214445724561175e-06, + "loss": 0.6191, + "step": 5694 + }, + { + "epoch": 1.5775623268698062, + "grad_norm": 0.7192513346672058, + "learning_rate": 4.214180576009888e-06, + "loss": 0.5964, + "step": 5695 + }, + { + "epoch": 1.5778393351800553, + "grad_norm": 0.6964846849441528, + "learning_rate": 4.213915391062171e-06, + "loss": 0.6572, + "step": 5696 + }, + { + "epoch": 1.5781163434903047, + "grad_norm": 0.7242207527160645, + "learning_rate": 4.213650169723652e-06, + "loss": 0.6375, + "step": 5697 + }, + { + "epoch": 1.5783933518005542, + "grad_norm": 0.6550120115280151, + "learning_rate": 4.213384911999965e-06, + "loss": 0.5441, + "step": 5698 + }, + { + "epoch": 1.5786703601108032, + "grad_norm": 0.72943115234375, + "learning_rate": 4.21311961789674e-06, + "loss": 0.637, + "step": 5699 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.6830201745033264, + "learning_rate": 4.212854287419611e-06, + "loss": 0.5859, + "step": 5700 + }, + { + "epoch": 1.579224376731302, + "grad_norm": 0.7375404238700867, + "learning_rate": 4.212588920574211e-06, + "loss": 0.6329, + "step": 5701 + }, + { + "epoch": 1.5795013850415511, + "grad_norm": 0.6917905807495117, + "learning_rate": 4.212323517366175e-06, + "loss": 0.6007, + "step": 5702 + }, + { + "epoch": 1.5797783933518006, + "grad_norm": 0.6942349076271057, + "learning_rate": 4.2120580778011375e-06, + "loss": 0.6181, + "step": 5703 + }, + { + "epoch": 1.5800554016620498, + "grad_norm": 0.6996932029724121, + "learning_rate": 4.211792601884736e-06, + "loss": 0.6177, + "step": 5704 + }, + { + "epoch": 1.580332409972299, + "grad_norm": 0.7083521485328674, + "learning_rate": 4.2115270896226045e-06, + "loss": 0.6672, + "step": 5705 + }, + { + "epoch": 1.5806094182825485, + "grad_norm": 0.7094027996063232, + "learning_rate": 4.211261541020383e-06, + "loss": 0.626, + "step": 5706 + }, + { + "epoch": 1.5808864265927978, + "grad_norm": 0.6889899373054504, + "learning_rate": 4.210995956083709e-06, + "loss": 0.6546, + "step": 5707 + }, + { + "epoch": 1.581163434903047, + "grad_norm": 0.7199320793151855, + "learning_rate": 4.2107303348182214e-06, + "loss": 0.5984, + "step": 5708 + }, + { + "epoch": 1.5814404432132965, + "grad_norm": 0.6789028644561768, + "learning_rate": 4.210464677229561e-06, + "loss": 0.6422, + "step": 5709 + }, + { + "epoch": 1.5817174515235457, + "grad_norm": 0.6862844824790955, + "learning_rate": 4.210198983323366e-06, + "loss": 0.568, + "step": 5710 + }, + { + "epoch": 1.581994459833795, + "grad_norm": 0.6758617162704468, + "learning_rate": 4.20993325310528e-06, + "loss": 0.6054, + "step": 5711 + }, + { + "epoch": 1.5822714681440444, + "grad_norm": 0.7183728814125061, + "learning_rate": 4.209667486580943e-06, + "loss": 0.6674, + "step": 5712 + }, + { + "epoch": 1.5825484764542936, + "grad_norm": 0.6828016638755798, + "learning_rate": 4.209401683756e-06, + "loss": 0.5502, + "step": 5713 + }, + { + "epoch": 1.5828254847645429, + "grad_norm": 0.6988704204559326, + "learning_rate": 4.209135844636095e-06, + "loss": 0.6313, + "step": 5714 + }, + { + "epoch": 1.5831024930747923, + "grad_norm": 0.7439329028129578, + "learning_rate": 4.2088699692268704e-06, + "loss": 0.6421, + "step": 5715 + }, + { + "epoch": 1.5833795013850416, + "grad_norm": 0.6711263060569763, + "learning_rate": 4.2086040575339726e-06, + "loss": 0.6273, + "step": 5716 + }, + { + "epoch": 1.5836565096952908, + "grad_norm": 0.7117868065834045, + "learning_rate": 4.2083381095630474e-06, + "loss": 0.6029, + "step": 5717 + }, + { + "epoch": 1.5839335180055403, + "grad_norm": 0.6777087450027466, + "learning_rate": 4.208072125319741e-06, + "loss": 0.5973, + "step": 5718 + }, + { + "epoch": 1.5842105263157895, + "grad_norm": 0.6707855463027954, + "learning_rate": 4.2078061048097015e-06, + "loss": 0.6251, + "step": 5719 + }, + { + "epoch": 1.5844875346260388, + "grad_norm": 0.6649163961410522, + "learning_rate": 4.207540048038578e-06, + "loss": 0.5889, + "step": 5720 + }, + { + "epoch": 1.5847645429362882, + "grad_norm": 0.6663191318511963, + "learning_rate": 4.207273955012018e-06, + "loss": 0.6105, + "step": 5721 + }, + { + "epoch": 1.5850415512465375, + "grad_norm": 0.6953136324882507, + "learning_rate": 4.2070078257356714e-06, + "loss": 0.6114, + "step": 5722 + }, + { + "epoch": 1.5853185595567867, + "grad_norm": 0.6712597012519836, + "learning_rate": 4.206741660215191e-06, + "loss": 0.6083, + "step": 5723 + }, + { + "epoch": 1.5855955678670361, + "grad_norm": 0.6431275606155396, + "learning_rate": 4.2064754584562255e-06, + "loss": 0.5897, + "step": 5724 + }, + { + "epoch": 1.5858725761772852, + "grad_norm": 0.7072885632514954, + "learning_rate": 4.206209220464429e-06, + "loss": 0.6651, + "step": 5725 + }, + { + "epoch": 1.5861495844875346, + "grad_norm": 0.6720908284187317, + "learning_rate": 4.205942946245453e-06, + "loss": 0.6164, + "step": 5726 + }, + { + "epoch": 1.586426592797784, + "grad_norm": 0.6930977702140808, + "learning_rate": 4.205676635804952e-06, + "loss": 0.6553, + "step": 5727 + }, + { + "epoch": 1.586703601108033, + "grad_norm": 0.691413938999176, + "learning_rate": 4.20541028914858e-06, + "loss": 0.6186, + "step": 5728 + }, + { + "epoch": 1.5869806094182826, + "grad_norm": 0.7159149646759033, + "learning_rate": 4.205143906281992e-06, + "loss": 0.6417, + "step": 5729 + }, + { + "epoch": 1.587257617728532, + "grad_norm": 0.6715986132621765, + "learning_rate": 4.204877487210845e-06, + "loss": 0.6001, + "step": 5730 + }, + { + "epoch": 1.587534626038781, + "grad_norm": 0.6652202010154724, + "learning_rate": 4.204611031940795e-06, + "loss": 0.6552, + "step": 5731 + }, + { + "epoch": 1.5878116343490305, + "grad_norm": 0.7530796527862549, + "learning_rate": 4.204344540477499e-06, + "loss": 0.6387, + "step": 5732 + }, + { + "epoch": 1.5880886426592797, + "grad_norm": 0.6929827928543091, + "learning_rate": 4.2040780128266165e-06, + "loss": 0.6227, + "step": 5733 + }, + { + "epoch": 1.588365650969529, + "grad_norm": 0.6594801545143127, + "learning_rate": 4.203811448993806e-06, + "loss": 0.5887, + "step": 5734 + }, + { + "epoch": 1.5886426592797784, + "grad_norm": 0.6690226197242737, + "learning_rate": 4.203544848984729e-06, + "loss": 0.5977, + "step": 5735 + }, + { + "epoch": 1.5889196675900277, + "grad_norm": 0.7046278119087219, + "learning_rate": 4.203278212805042e-06, + "loss": 0.6422, + "step": 5736 + }, + { + "epoch": 1.589196675900277, + "grad_norm": 0.6667335033416748, + "learning_rate": 4.20301154046041e-06, + "loss": 0.6092, + "step": 5737 + }, + { + "epoch": 1.5894736842105264, + "grad_norm": 0.66410893201828, + "learning_rate": 4.202744831956494e-06, + "loss": 0.6406, + "step": 5738 + }, + { + "epoch": 1.5897506925207756, + "grad_norm": 0.6676257252693176, + "learning_rate": 4.202478087298957e-06, + "loss": 0.6684, + "step": 5739 + }, + { + "epoch": 1.5900277008310248, + "grad_norm": 0.6923943758010864, + "learning_rate": 4.202211306493462e-06, + "loss": 0.5827, + "step": 5740 + }, + { + "epoch": 1.5903047091412743, + "grad_norm": 0.6694300770759583, + "learning_rate": 4.201944489545674e-06, + "loss": 0.6266, + "step": 5741 + }, + { + "epoch": 1.5905817174515235, + "grad_norm": 0.6623051762580872, + "learning_rate": 4.201677636461259e-06, + "loss": 0.6549, + "step": 5742 + }, + { + "epoch": 1.5908587257617728, + "grad_norm": 0.6522954106330872, + "learning_rate": 4.201410747245881e-06, + "loss": 0.6173, + "step": 5743 + }, + { + "epoch": 1.5911357340720222, + "grad_norm": 0.7510678768157959, + "learning_rate": 4.201143821905209e-06, + "loss": 0.6355, + "step": 5744 + }, + { + "epoch": 1.5914127423822715, + "grad_norm": 0.6667211651802063, + "learning_rate": 4.200876860444909e-06, + "loss": 0.6082, + "step": 5745 + }, + { + "epoch": 1.5916897506925207, + "grad_norm": 0.6723722219467163, + "learning_rate": 4.2006098628706496e-06, + "loss": 0.6256, + "step": 5746 + }, + { + "epoch": 1.5919667590027702, + "grad_norm": 0.6964446306228638, + "learning_rate": 4.2003428291881e-06, + "loss": 0.6059, + "step": 5747 + }, + { + "epoch": 1.5922437673130194, + "grad_norm": 0.73036128282547, + "learning_rate": 4.20007575940293e-06, + "loss": 0.6385, + "step": 5748 + }, + { + "epoch": 1.5925207756232687, + "grad_norm": 0.6907002329826355, + "learning_rate": 4.19980865352081e-06, + "loss": 0.6662, + "step": 5749 + }, + { + "epoch": 1.5927977839335181, + "grad_norm": 0.7342780232429504, + "learning_rate": 4.199541511547412e-06, + "loss": 0.6752, + "step": 5750 + }, + { + "epoch": 1.5930747922437674, + "grad_norm": 0.7659139633178711, + "learning_rate": 4.1992743334884065e-06, + "loss": 0.6259, + "step": 5751 + }, + { + "epoch": 1.5933518005540166, + "grad_norm": 0.6953239440917969, + "learning_rate": 4.199007119349468e-06, + "loss": 0.672, + "step": 5752 + }, + { + "epoch": 1.593628808864266, + "grad_norm": 0.6655431985855103, + "learning_rate": 4.19873986913627e-06, + "loss": 0.6207, + "step": 5753 + }, + { + "epoch": 1.593905817174515, + "grad_norm": 0.7081992030143738, + "learning_rate": 4.1984725828544855e-06, + "loss": 0.6568, + "step": 5754 + }, + { + "epoch": 1.5941828254847645, + "grad_norm": 0.6927662491798401, + "learning_rate": 4.198205260509791e-06, + "loss": 0.6323, + "step": 5755 + }, + { + "epoch": 1.594459833795014, + "grad_norm": 0.698785662651062, + "learning_rate": 4.197937902107863e-06, + "loss": 0.6653, + "step": 5756 + }, + { + "epoch": 1.594736842105263, + "grad_norm": 0.7155250906944275, + "learning_rate": 4.197670507654376e-06, + "loss": 0.6522, + "step": 5757 + }, + { + "epoch": 1.5950138504155125, + "grad_norm": 0.6735395789146423, + "learning_rate": 4.197403077155009e-06, + "loss": 0.5891, + "step": 5758 + }, + { + "epoch": 1.595290858725762, + "grad_norm": 0.6795505285263062, + "learning_rate": 4.19713561061544e-06, + "loss": 0.6047, + "step": 5759 + }, + { + "epoch": 1.595567867036011, + "grad_norm": 0.6992025971412659, + "learning_rate": 4.196868108041348e-06, + "loss": 0.6649, + "step": 5760 + }, + { + "epoch": 1.5958448753462604, + "grad_norm": 0.7018173336982727, + "learning_rate": 4.196600569438413e-06, + "loss": 0.6429, + "step": 5761 + }, + { + "epoch": 1.5961218836565096, + "grad_norm": 0.7325546145439148, + "learning_rate": 4.196332994812316e-06, + "loss": 0.6058, + "step": 5762 + }, + { + "epoch": 1.5963988919667589, + "grad_norm": 0.7186279892921448, + "learning_rate": 4.196065384168736e-06, + "loss": 0.6224, + "step": 5763 + }, + { + "epoch": 1.5966759002770083, + "grad_norm": 0.7321295142173767, + "learning_rate": 4.195797737513357e-06, + "loss": 0.6758, + "step": 5764 + }, + { + "epoch": 1.5969529085872576, + "grad_norm": 0.7160171866416931, + "learning_rate": 4.195530054851863e-06, + "loss": 0.6367, + "step": 5765 + }, + { + "epoch": 1.5972299168975068, + "grad_norm": 0.6524419784545898, + "learning_rate": 4.1952623361899345e-06, + "loss": 0.5977, + "step": 5766 + }, + { + "epoch": 1.5975069252077563, + "grad_norm": 0.6996549367904663, + "learning_rate": 4.1949945815332574e-06, + "loss": 0.627, + "step": 5767 + }, + { + "epoch": 1.5977839335180055, + "grad_norm": 0.6982312798500061, + "learning_rate": 4.194726790887517e-06, + "loss": 0.6677, + "step": 5768 + }, + { + "epoch": 1.5980609418282548, + "grad_norm": 0.6984411478042603, + "learning_rate": 4.1944589642583985e-06, + "loss": 0.5714, + "step": 5769 + }, + { + "epoch": 1.5983379501385042, + "grad_norm": 0.662164568901062, + "learning_rate": 4.1941911016515905e-06, + "loss": 0.5776, + "step": 5770 + }, + { + "epoch": 1.5986149584487535, + "grad_norm": 0.7050111889839172, + "learning_rate": 4.193923203072778e-06, + "loss": 0.6099, + "step": 5771 + }, + { + "epoch": 1.5988919667590027, + "grad_norm": 0.69181227684021, + "learning_rate": 4.19365526852765e-06, + "loss": 0.6415, + "step": 5772 + }, + { + "epoch": 1.5991689750692522, + "grad_norm": 0.6893934011459351, + "learning_rate": 4.193387298021895e-06, + "loss": 0.6694, + "step": 5773 + }, + { + "epoch": 1.5994459833795014, + "grad_norm": 0.7136704325675964, + "learning_rate": 4.193119291561204e-06, + "loss": 0.6497, + "step": 5774 + }, + { + "epoch": 1.5997229916897506, + "grad_norm": 0.717369556427002, + "learning_rate": 4.192851249151266e-06, + "loss": 0.6128, + "step": 5775 + }, + { + "epoch": 1.6, + "grad_norm": 0.7070300579071045, + "learning_rate": 4.192583170797775e-06, + "loss": 0.6448, + "step": 5776 + }, + { + "epoch": 1.6002770083102493, + "grad_norm": 0.7221513986587524, + "learning_rate": 4.192315056506418e-06, + "loss": 0.6756, + "step": 5777 + }, + { + "epoch": 1.6005540166204986, + "grad_norm": 0.7016468048095703, + "learning_rate": 4.1920469062828924e-06, + "loss": 0.6226, + "step": 5778 + }, + { + "epoch": 1.600831024930748, + "grad_norm": 0.7056813836097717, + "learning_rate": 4.1917787201328894e-06, + "loss": 0.6313, + "step": 5779 + }, + { + "epoch": 1.6011080332409973, + "grad_norm": 0.7116861939430237, + "learning_rate": 4.191510498062104e-06, + "loss": 0.6539, + "step": 5780 + }, + { + "epoch": 1.6013850415512465, + "grad_norm": 0.6958196759223938, + "learning_rate": 4.191242240076231e-06, + "loss": 0.6612, + "step": 5781 + }, + { + "epoch": 1.601662049861496, + "grad_norm": 0.707463800907135, + "learning_rate": 4.190973946180966e-06, + "loss": 0.6052, + "step": 5782 + }, + { + "epoch": 1.6019390581717452, + "grad_norm": 0.6749431490898132, + "learning_rate": 4.190705616382006e-06, + "loss": 0.6278, + "step": 5783 + }, + { + "epoch": 1.6022160664819944, + "grad_norm": 0.727830171585083, + "learning_rate": 4.190437250685049e-06, + "loss": 0.6063, + "step": 5784 + }, + { + "epoch": 1.602493074792244, + "grad_norm": 0.7113375663757324, + "learning_rate": 4.190168849095791e-06, + "loss": 0.645, + "step": 5785 + }, + { + "epoch": 1.602770083102493, + "grad_norm": 0.6866186261177063, + "learning_rate": 4.1899004116199335e-06, + "loss": 0.6072, + "step": 5786 + }, + { + "epoch": 1.6030470914127424, + "grad_norm": 0.7033379673957825, + "learning_rate": 4.189631938263173e-06, + "loss": 0.6638, + "step": 5787 + }, + { + "epoch": 1.6033240997229918, + "grad_norm": 0.6700825691223145, + "learning_rate": 4.1893634290312135e-06, + "loss": 0.6362, + "step": 5788 + }, + { + "epoch": 1.6036011080332409, + "grad_norm": 0.7247818112373352, + "learning_rate": 4.189094883929753e-06, + "loss": 0.6437, + "step": 5789 + }, + { + "epoch": 1.6038781163434903, + "grad_norm": 0.6923117637634277, + "learning_rate": 4.188826302964495e-06, + "loss": 0.6261, + "step": 5790 + }, + { + "epoch": 1.6041551246537398, + "grad_norm": 0.6868463754653931, + "learning_rate": 4.188557686141141e-06, + "loss": 0.6412, + "step": 5791 + }, + { + "epoch": 1.6044321329639888, + "grad_norm": 0.6793109774589539, + "learning_rate": 4.188289033465396e-06, + "loss": 0.6606, + "step": 5792 + }, + { + "epoch": 1.6047091412742382, + "grad_norm": 0.7150652408599854, + "learning_rate": 4.188020344942964e-06, + "loss": 0.6581, + "step": 5793 + }, + { + "epoch": 1.6049861495844875, + "grad_norm": 0.6954755783081055, + "learning_rate": 4.187751620579549e-06, + "loss": 0.6559, + "step": 5794 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.691980242729187, + "learning_rate": 4.187482860380857e-06, + "loss": 0.6021, + "step": 5795 + }, + { + "epoch": 1.6055401662049862, + "grad_norm": 0.670291006565094, + "learning_rate": 4.187214064352594e-06, + "loss": 0.6163, + "step": 5796 + }, + { + "epoch": 1.6058171745152354, + "grad_norm": 0.6734989881515503, + "learning_rate": 4.186945232500468e-06, + "loss": 0.6721, + "step": 5797 + }, + { + "epoch": 1.6060941828254847, + "grad_norm": 0.6668962240219116, + "learning_rate": 4.186676364830187e-06, + "loss": 0.5777, + "step": 5798 + }, + { + "epoch": 1.6063711911357341, + "grad_norm": 0.722041666507721, + "learning_rate": 4.186407461347459e-06, + "loss": 0.6112, + "step": 5799 + }, + { + "epoch": 1.6066481994459834, + "grad_norm": 0.73446124792099, + "learning_rate": 4.186138522057994e-06, + "loss": 0.6116, + "step": 5800 + }, + { + "epoch": 1.6069252077562326, + "grad_norm": 0.6750873923301697, + "learning_rate": 4.185869546967502e-06, + "loss": 0.6499, + "step": 5801 + }, + { + "epoch": 1.607202216066482, + "grad_norm": 0.6893951892852783, + "learning_rate": 4.185600536081694e-06, + "loss": 0.6315, + "step": 5802 + }, + { + "epoch": 1.6074792243767313, + "grad_norm": 0.7207143902778625, + "learning_rate": 4.185331489406283e-06, + "loss": 0.6556, + "step": 5803 + }, + { + "epoch": 1.6077562326869805, + "grad_norm": 0.6945677399635315, + "learning_rate": 4.1850624069469805e-06, + "loss": 0.5956, + "step": 5804 + }, + { + "epoch": 1.60803324099723, + "grad_norm": 0.6582666039466858, + "learning_rate": 4.1847932887095e-06, + "loss": 0.6429, + "step": 5805 + }, + { + "epoch": 1.6083102493074792, + "grad_norm": 0.6848458647727966, + "learning_rate": 4.184524134699556e-06, + "loss": 0.6427, + "step": 5806 + }, + { + "epoch": 1.6085872576177285, + "grad_norm": 0.6576165556907654, + "learning_rate": 4.184254944922862e-06, + "loss": 0.6079, + "step": 5807 + }, + { + "epoch": 1.608864265927978, + "grad_norm": 0.7028154134750366, + "learning_rate": 4.1839857193851345e-06, + "loss": 0.6434, + "step": 5808 + }, + { + "epoch": 1.6091412742382272, + "grad_norm": 0.684888482093811, + "learning_rate": 4.183716458092089e-06, + "loss": 0.6032, + "step": 5809 + }, + { + "epoch": 1.6094182825484764, + "grad_norm": 0.7266806364059448, + "learning_rate": 4.183447161049445e-06, + "loss": 0.6548, + "step": 5810 + }, + { + "epoch": 1.6096952908587259, + "grad_norm": 0.6765641570091248, + "learning_rate": 4.183177828262916e-06, + "loss": 0.6468, + "step": 5811 + }, + { + "epoch": 1.609972299168975, + "grad_norm": 0.6718610525131226, + "learning_rate": 4.182908459738226e-06, + "loss": 0.5708, + "step": 5812 + }, + { + "epoch": 1.6102493074792243, + "grad_norm": 0.714972734451294, + "learning_rate": 4.1826390554810905e-06, + "loss": 0.6184, + "step": 5813 + }, + { + "epoch": 1.6105263157894738, + "grad_norm": 0.6993780136108398, + "learning_rate": 4.182369615497232e-06, + "loss": 0.665, + "step": 5814 + }, + { + "epoch": 1.610803324099723, + "grad_norm": 0.6753741502761841, + "learning_rate": 4.182100139792369e-06, + "loss": 0.6266, + "step": 5815 + }, + { + "epoch": 1.6110803324099723, + "grad_norm": 0.6745509505271912, + "learning_rate": 4.1818306283722245e-06, + "loss": 0.6409, + "step": 5816 + }, + { + "epoch": 1.6113573407202217, + "grad_norm": 0.705803394317627, + "learning_rate": 4.181561081242521e-06, + "loss": 0.6208, + "step": 5817 + }, + { + "epoch": 1.6116343490304708, + "grad_norm": 0.7307119369506836, + "learning_rate": 4.181291498408983e-06, + "loss": 0.6313, + "step": 5818 + }, + { + "epoch": 1.6119113573407202, + "grad_norm": 0.6954278945922852, + "learning_rate": 4.181021879877331e-06, + "loss": 0.6508, + "step": 5819 + }, + { + "epoch": 1.6121883656509697, + "grad_norm": 0.7063873410224915, + "learning_rate": 4.1807522256532925e-06, + "loss": 0.6352, + "step": 5820 + }, + { + "epoch": 1.6124653739612187, + "grad_norm": 0.7044040560722351, + "learning_rate": 4.180482535742591e-06, + "loss": 0.603, + "step": 5821 + }, + { + "epoch": 1.6127423822714682, + "grad_norm": 0.6697425246238708, + "learning_rate": 4.1802128101509556e-06, + "loss": 0.5898, + "step": 5822 + }, + { + "epoch": 1.6130193905817176, + "grad_norm": 0.6932482719421387, + "learning_rate": 4.17994304888411e-06, + "loss": 0.6418, + "step": 5823 + }, + { + "epoch": 1.6132963988919666, + "grad_norm": 0.702626645565033, + "learning_rate": 4.179673251947784e-06, + "loss": 0.6289, + "step": 5824 + }, + { + "epoch": 1.613573407202216, + "grad_norm": 0.6617067456245422, + "learning_rate": 4.179403419347705e-06, + "loss": 0.621, + "step": 5825 + }, + { + "epoch": 1.6138504155124653, + "grad_norm": 0.6601985096931458, + "learning_rate": 4.179133551089604e-06, + "loss": 0.6575, + "step": 5826 + }, + { + "epoch": 1.6141274238227146, + "grad_norm": 0.6812746524810791, + "learning_rate": 4.178863647179208e-06, + "loss": 0.6089, + "step": 5827 + }, + { + "epoch": 1.614404432132964, + "grad_norm": 0.7015700936317444, + "learning_rate": 4.17859370762225e-06, + "loss": 0.5839, + "step": 5828 + }, + { + "epoch": 1.6146814404432133, + "grad_norm": 0.6997047066688538, + "learning_rate": 4.1783237324244605e-06, + "loss": 0.6244, + "step": 5829 + }, + { + "epoch": 1.6149584487534625, + "grad_norm": 0.6965237259864807, + "learning_rate": 4.178053721591574e-06, + "loss": 0.5986, + "step": 5830 + }, + { + "epoch": 1.615235457063712, + "grad_norm": 0.69244784116745, + "learning_rate": 4.17778367512932e-06, + "loss": 0.5947, + "step": 5831 + }, + { + "epoch": 1.6155124653739612, + "grad_norm": 0.7107717990875244, + "learning_rate": 4.177513593043434e-06, + "loss": 0.6405, + "step": 5832 + }, + { + "epoch": 1.6157894736842104, + "grad_norm": 0.6921395659446716, + "learning_rate": 4.177243475339651e-06, + "loss": 0.6364, + "step": 5833 + }, + { + "epoch": 1.61606648199446, + "grad_norm": 0.7329249382019043, + "learning_rate": 4.1769733220237055e-06, + "loss": 0.6618, + "step": 5834 + }, + { + "epoch": 1.6163434903047091, + "grad_norm": 0.6867843270301819, + "learning_rate": 4.176703133101334e-06, + "loss": 0.5642, + "step": 5835 + }, + { + "epoch": 1.6166204986149584, + "grad_norm": 0.6759707927703857, + "learning_rate": 4.176432908578274e-06, + "loss": 0.6479, + "step": 5836 + }, + { + "epoch": 1.6168975069252078, + "grad_norm": 0.7146587371826172, + "learning_rate": 4.176162648460261e-06, + "loss": 0.6392, + "step": 5837 + }, + { + "epoch": 1.617174515235457, + "grad_norm": 0.7169117331504822, + "learning_rate": 4.175892352753035e-06, + "loss": 0.6448, + "step": 5838 + }, + { + "epoch": 1.6174515235457063, + "grad_norm": 0.700145423412323, + "learning_rate": 4.175622021462335e-06, + "loss": 0.6157, + "step": 5839 + }, + { + "epoch": 1.6177285318559558, + "grad_norm": 0.7134767174720764, + "learning_rate": 4.1753516545939e-06, + "loss": 0.6314, + "step": 5840 + }, + { + "epoch": 1.618005540166205, + "grad_norm": 0.6458624005317688, + "learning_rate": 4.17508125215347e-06, + "loss": 0.5763, + "step": 5841 + }, + { + "epoch": 1.6182825484764543, + "grad_norm": 0.6700456142425537, + "learning_rate": 4.174810814146789e-06, + "loss": 0.6501, + "step": 5842 + }, + { + "epoch": 1.6185595567867037, + "grad_norm": 0.6677443981170654, + "learning_rate": 4.174540340579596e-06, + "loss": 0.6479, + "step": 5843 + }, + { + "epoch": 1.618836565096953, + "grad_norm": 0.6755135655403137, + "learning_rate": 4.174269831457636e-06, + "loss": 0.5886, + "step": 5844 + }, + { + "epoch": 1.6191135734072022, + "grad_norm": 0.7005934715270996, + "learning_rate": 4.173999286786652e-06, + "loss": 0.6009, + "step": 5845 + }, + { + "epoch": 1.6193905817174516, + "grad_norm": 0.682978630065918, + "learning_rate": 4.173728706572388e-06, + "loss": 0.6118, + "step": 5846 + }, + { + "epoch": 1.6196675900277007, + "grad_norm": 0.6874737739562988, + "learning_rate": 4.173458090820589e-06, + "loss": 0.6296, + "step": 5847 + }, + { + "epoch": 1.6199445983379501, + "grad_norm": 0.6969865560531616, + "learning_rate": 4.173187439537001e-06, + "loss": 0.6379, + "step": 5848 + }, + { + "epoch": 1.6202216066481996, + "grad_norm": 0.7983630895614624, + "learning_rate": 4.172916752727372e-06, + "loss": 0.6125, + "step": 5849 + }, + { + "epoch": 1.6204986149584486, + "grad_norm": 0.6705129742622375, + "learning_rate": 4.172646030397447e-06, + "loss": 0.5946, + "step": 5850 + }, + { + "epoch": 1.620775623268698, + "grad_norm": 0.6784124970436096, + "learning_rate": 4.172375272552976e-06, + "loss": 0.604, + "step": 5851 + }, + { + "epoch": 1.6210526315789475, + "grad_norm": 0.7067432403564453, + "learning_rate": 4.172104479199707e-06, + "loss": 0.667, + "step": 5852 + }, + { + "epoch": 1.6213296398891965, + "grad_norm": 0.7362405061721802, + "learning_rate": 4.17183365034339e-06, + "loss": 0.6497, + "step": 5853 + }, + { + "epoch": 1.621606648199446, + "grad_norm": 0.6884350180625916, + "learning_rate": 4.171562785989774e-06, + "loss": 0.6158, + "step": 5854 + }, + { + "epoch": 1.6218836565096952, + "grad_norm": 0.699510931968689, + "learning_rate": 4.171291886144614e-06, + "loss": 0.598, + "step": 5855 + }, + { + "epoch": 1.6221606648199445, + "grad_norm": 0.6367965936660767, + "learning_rate": 4.171020950813657e-06, + "loss": 0.6063, + "step": 5856 + }, + { + "epoch": 1.622437673130194, + "grad_norm": 0.6910621523857117, + "learning_rate": 4.170749980002658e-06, + "loss": 0.6346, + "step": 5857 + }, + { + "epoch": 1.6227146814404432, + "grad_norm": 0.702388346195221, + "learning_rate": 4.170478973717371e-06, + "loss": 0.6157, + "step": 5858 + }, + { + "epoch": 1.6229916897506924, + "grad_norm": 0.7175259590148926, + "learning_rate": 4.1702079319635495e-06, + "loss": 0.6365, + "step": 5859 + }, + { + "epoch": 1.6232686980609419, + "grad_norm": 0.6877427101135254, + "learning_rate": 4.169936854746947e-06, + "loss": 0.6248, + "step": 5860 + }, + { + "epoch": 1.623545706371191, + "grad_norm": 0.7271732091903687, + "learning_rate": 4.169665742073322e-06, + "loss": 0.6386, + "step": 5861 + }, + { + "epoch": 1.6238227146814403, + "grad_norm": 0.6602852940559387, + "learning_rate": 4.169394593948429e-06, + "loss": 0.6184, + "step": 5862 + }, + { + "epoch": 1.6240997229916898, + "grad_norm": 0.7164726853370667, + "learning_rate": 4.169123410378026e-06, + "loss": 0.6213, + "step": 5863 + }, + { + "epoch": 1.624376731301939, + "grad_norm": 0.6855493187904358, + "learning_rate": 4.1688521913678706e-06, + "loss": 0.6311, + "step": 5864 + }, + { + "epoch": 1.6246537396121883, + "grad_norm": 0.661792516708374, + "learning_rate": 4.1685809369237204e-06, + "loss": 0.6065, + "step": 5865 + }, + { + "epoch": 1.6249307479224377, + "grad_norm": 0.6726290583610535, + "learning_rate": 4.1683096470513365e-06, + "loss": 0.6425, + "step": 5866 + }, + { + "epoch": 1.625207756232687, + "grad_norm": 0.7385438084602356, + "learning_rate": 4.168038321756478e-06, + "loss": 0.6404, + "step": 5867 + }, + { + "epoch": 1.6254847645429362, + "grad_norm": 0.6656984090805054, + "learning_rate": 4.167766961044906e-06, + "loss": 0.6575, + "step": 5868 + }, + { + "epoch": 1.6257617728531857, + "grad_norm": 0.6919022798538208, + "learning_rate": 4.167495564922383e-06, + "loss": 0.6485, + "step": 5869 + }, + { + "epoch": 1.626038781163435, + "grad_norm": 0.7050084471702576, + "learning_rate": 4.1672241333946715e-06, + "loss": 0.6616, + "step": 5870 + }, + { + "epoch": 1.6263157894736842, + "grad_norm": 0.7358623147010803, + "learning_rate": 4.166952666467533e-06, + "loss": 0.6555, + "step": 5871 + }, + { + "epoch": 1.6265927977839336, + "grad_norm": 0.656574010848999, + "learning_rate": 4.166681164146733e-06, + "loss": 0.633, + "step": 5872 + }, + { + "epoch": 1.6268698060941829, + "grad_norm": 0.6728115677833557, + "learning_rate": 4.166409626438035e-06, + "loss": 0.5646, + "step": 5873 + }, + { + "epoch": 1.627146814404432, + "grad_norm": 0.6639528870582581, + "learning_rate": 4.166138053347205e-06, + "loss": 0.5575, + "step": 5874 + }, + { + "epoch": 1.6274238227146816, + "grad_norm": 0.6908798217773438, + "learning_rate": 4.1658664448800105e-06, + "loss": 0.5618, + "step": 5875 + }, + { + "epoch": 1.6277008310249308, + "grad_norm": 0.7277550101280212, + "learning_rate": 4.165594801042217e-06, + "loss": 0.5629, + "step": 5876 + }, + { + "epoch": 1.62797783933518, + "grad_norm": 0.7118584513664246, + "learning_rate": 4.165323121839591e-06, + "loss": 0.6638, + "step": 5877 + }, + { + "epoch": 1.6282548476454295, + "grad_norm": 0.6751552820205688, + "learning_rate": 4.165051407277903e-06, + "loss": 0.5756, + "step": 5878 + }, + { + "epoch": 1.6285318559556785, + "grad_norm": 0.6624975204467773, + "learning_rate": 4.164779657362922e-06, + "loss": 0.6035, + "step": 5879 + }, + { + "epoch": 1.628808864265928, + "grad_norm": 0.682131826877594, + "learning_rate": 4.164507872100417e-06, + "loss": 0.5847, + "step": 5880 + }, + { + "epoch": 1.6290858725761774, + "grad_norm": 0.6885446310043335, + "learning_rate": 4.16423605149616e-06, + "loss": 0.6109, + "step": 5881 + }, + { + "epoch": 1.6293628808864264, + "grad_norm": 0.6840854287147522, + "learning_rate": 4.1639641955559205e-06, + "loss": 0.6098, + "step": 5882 + }, + { + "epoch": 1.629639889196676, + "grad_norm": 0.6697250604629517, + "learning_rate": 4.163692304285473e-06, + "loss": 0.6197, + "step": 5883 + }, + { + "epoch": 1.6299168975069254, + "grad_norm": 0.6881322264671326, + "learning_rate": 4.163420377690588e-06, + "loss": 0.6151, + "step": 5884 + }, + { + "epoch": 1.6301939058171744, + "grad_norm": 0.69287109375, + "learning_rate": 4.163148415777042e-06, + "loss": 0.6573, + "step": 5885 + }, + { + "epoch": 1.6304709141274238, + "grad_norm": 0.6807845234870911, + "learning_rate": 4.162876418550606e-06, + "loss": 0.6245, + "step": 5886 + }, + { + "epoch": 1.630747922437673, + "grad_norm": 0.6973237991333008, + "learning_rate": 4.162604386017059e-06, + "loss": 0.6596, + "step": 5887 + }, + { + "epoch": 1.6310249307479223, + "grad_norm": 0.7001870274543762, + "learning_rate": 4.1623323181821736e-06, + "loss": 0.6374, + "step": 5888 + }, + { + "epoch": 1.6313019390581718, + "grad_norm": 0.6852225065231323, + "learning_rate": 4.162060215051729e-06, + "loss": 0.6621, + "step": 5889 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.7498638033866882, + "learning_rate": 4.161788076631501e-06, + "loss": 0.6141, + "step": 5890 + }, + { + "epoch": 1.6318559556786703, + "grad_norm": 0.7050655484199524, + "learning_rate": 4.1615159029272675e-06, + "loss": 0.6357, + "step": 5891 + }, + { + "epoch": 1.6321329639889197, + "grad_norm": 0.7564597725868225, + "learning_rate": 4.161243693944809e-06, + "loss": 0.6705, + "step": 5892 + }, + { + "epoch": 1.632409972299169, + "grad_norm": 0.6575698852539062, + "learning_rate": 4.1609714496899055e-06, + "loss": 0.6403, + "step": 5893 + }, + { + "epoch": 1.6326869806094182, + "grad_norm": 0.6732252836227417, + "learning_rate": 4.160699170168337e-06, + "loss": 0.6116, + "step": 5894 + }, + { + "epoch": 1.6329639889196677, + "grad_norm": 0.698630690574646, + "learning_rate": 4.160426855385882e-06, + "loss": 0.6599, + "step": 5895 + }, + { + "epoch": 1.633240997229917, + "grad_norm": 0.6858016848564148, + "learning_rate": 4.160154505348326e-06, + "loss": 0.641, + "step": 5896 + }, + { + "epoch": 1.6335180055401661, + "grad_norm": 0.6913908123970032, + "learning_rate": 4.159882120061449e-06, + "loss": 0.6289, + "step": 5897 + }, + { + "epoch": 1.6337950138504156, + "grad_norm": 0.7428088188171387, + "learning_rate": 4.159609699531038e-06, + "loss": 0.652, + "step": 5898 + }, + { + "epoch": 1.6340720221606648, + "grad_norm": 0.7286577820777893, + "learning_rate": 4.159337243762872e-06, + "loss": 0.6971, + "step": 5899 + }, + { + "epoch": 1.634349030470914, + "grad_norm": 0.6914637684822083, + "learning_rate": 4.15906475276274e-06, + "loss": 0.6216, + "step": 5900 + }, + { + "epoch": 1.6346260387811635, + "grad_norm": 0.6664144992828369, + "learning_rate": 4.158792226536428e-06, + "loss": 0.6421, + "step": 5901 + }, + { + "epoch": 1.6349030470914128, + "grad_norm": 0.707007646560669, + "learning_rate": 4.158519665089719e-06, + "loss": 0.6282, + "step": 5902 + }, + { + "epoch": 1.635180055401662, + "grad_norm": 0.6818870902061462, + "learning_rate": 4.158247068428403e-06, + "loss": 0.6298, + "step": 5903 + }, + { + "epoch": 1.6354570637119115, + "grad_norm": 0.7262736558914185, + "learning_rate": 4.157974436558267e-06, + "loss": 0.671, + "step": 5904 + }, + { + "epoch": 1.6357340720221607, + "grad_norm": 0.6701874136924744, + "learning_rate": 4.1577017694851e-06, + "loss": 0.5893, + "step": 5905 + }, + { + "epoch": 1.63601108033241, + "grad_norm": 0.7156980037689209, + "learning_rate": 4.1574290672146905e-06, + "loss": 0.6744, + "step": 5906 + }, + { + "epoch": 1.6362880886426594, + "grad_norm": 0.703990638256073, + "learning_rate": 4.157156329752829e-06, + "loss": 0.6432, + "step": 5907 + }, + { + "epoch": 1.6365650969529086, + "grad_norm": 0.7014253735542297, + "learning_rate": 4.156883557105308e-06, + "loss": 0.6514, + "step": 5908 + }, + { + "epoch": 1.6368421052631579, + "grad_norm": 0.6927618384361267, + "learning_rate": 4.1566107492779175e-06, + "loss": 0.6527, + "step": 5909 + }, + { + "epoch": 1.6371191135734073, + "grad_norm": 0.7299452424049377, + "learning_rate": 4.156337906276449e-06, + "loss": 0.6105, + "step": 5910 + }, + { + "epoch": 1.6373961218836564, + "grad_norm": 0.7313836812973022, + "learning_rate": 4.156065028106699e-06, + "loss": 0.6661, + "step": 5911 + }, + { + "epoch": 1.6376731301939058, + "grad_norm": 0.7255539298057556, + "learning_rate": 4.1557921147744575e-06, + "loss": 0.585, + "step": 5912 + }, + { + "epoch": 1.6379501385041553, + "grad_norm": 0.7102248668670654, + "learning_rate": 4.155519166285522e-06, + "loss": 0.5964, + "step": 5913 + }, + { + "epoch": 1.6382271468144043, + "grad_norm": 0.7126936316490173, + "learning_rate": 4.155246182645686e-06, + "loss": 0.6335, + "step": 5914 + }, + { + "epoch": 1.6385041551246537, + "grad_norm": 0.6877632737159729, + "learning_rate": 4.154973163860747e-06, + "loss": 0.6224, + "step": 5915 + }, + { + "epoch": 1.6387811634349032, + "grad_norm": 0.7150420546531677, + "learning_rate": 4.154700109936501e-06, + "loss": 0.6443, + "step": 5916 + }, + { + "epoch": 1.6390581717451522, + "grad_norm": 0.7128241658210754, + "learning_rate": 4.1544270208787465e-06, + "loss": 0.6616, + "step": 5917 + }, + { + "epoch": 1.6393351800554017, + "grad_norm": 0.7850664258003235, + "learning_rate": 4.154153896693282e-06, + "loss": 0.6313, + "step": 5918 + }, + { + "epoch": 1.639612188365651, + "grad_norm": 0.7086406350135803, + "learning_rate": 4.153880737385905e-06, + "loss": 0.6439, + "step": 5919 + }, + { + "epoch": 1.6398891966759002, + "grad_norm": 0.6542023420333862, + "learning_rate": 4.153607542962417e-06, + "loss": 0.5952, + "step": 5920 + }, + { + "epoch": 1.6401662049861496, + "grad_norm": 0.7138886451721191, + "learning_rate": 4.153334313428616e-06, + "loss": 0.6411, + "step": 5921 + }, + { + "epoch": 1.6404432132963989, + "grad_norm": 0.6809943318367004, + "learning_rate": 4.153061048790307e-06, + "loss": 0.6428, + "step": 5922 + }, + { + "epoch": 1.640720221606648, + "grad_norm": 0.6998105645179749, + "learning_rate": 4.152787749053292e-06, + "loss": 0.634, + "step": 5923 + }, + { + "epoch": 1.6409972299168976, + "grad_norm": 0.6871700882911682, + "learning_rate": 4.152514414223371e-06, + "loss": 0.5989, + "step": 5924 + }, + { + "epoch": 1.6412742382271468, + "grad_norm": 0.6837252974510193, + "learning_rate": 4.152241044306347e-06, + "loss": 0.5922, + "step": 5925 + }, + { + "epoch": 1.641551246537396, + "grad_norm": 0.7211995720863342, + "learning_rate": 4.151967639308029e-06, + "loss": 0.634, + "step": 5926 + }, + { + "epoch": 1.6418282548476455, + "grad_norm": 0.6932203769683838, + "learning_rate": 4.151694199234218e-06, + "loss": 0.6271, + "step": 5927 + }, + { + "epoch": 1.6421052631578947, + "grad_norm": 0.6809141635894775, + "learning_rate": 4.151420724090722e-06, + "loss": 0.6408, + "step": 5928 + }, + { + "epoch": 1.642382271468144, + "grad_norm": 0.6621232032775879, + "learning_rate": 4.151147213883346e-06, + "loss": 0.6117, + "step": 5929 + }, + { + "epoch": 1.6426592797783934, + "grad_norm": 0.6841561794281006, + "learning_rate": 4.150873668617899e-06, + "loss": 0.6742, + "step": 5930 + }, + { + "epoch": 1.6429362880886427, + "grad_norm": 0.713799774646759, + "learning_rate": 4.150600088300188e-06, + "loss": 0.6447, + "step": 5931 + }, + { + "epoch": 1.643213296398892, + "grad_norm": 0.7068320512771606, + "learning_rate": 4.150326472936021e-06, + "loss": 0.6563, + "step": 5932 + }, + { + "epoch": 1.6434903047091414, + "grad_norm": 0.736694872379303, + "learning_rate": 4.150052822531209e-06, + "loss": 0.6337, + "step": 5933 + }, + { + "epoch": 1.6437673130193906, + "grad_norm": 0.6896104216575623, + "learning_rate": 4.149779137091562e-06, + "loss": 0.6386, + "step": 5934 + }, + { + "epoch": 1.6440443213296398, + "grad_norm": 0.709987998008728, + "learning_rate": 4.14950541662289e-06, + "loss": 0.6438, + "step": 5935 + }, + { + "epoch": 1.6443213296398893, + "grad_norm": 0.7147083282470703, + "learning_rate": 4.149231661131007e-06, + "loss": 0.6226, + "step": 5936 + }, + { + "epoch": 1.6445983379501385, + "grad_norm": 0.7098004221916199, + "learning_rate": 4.148957870621723e-06, + "loss": 0.6436, + "step": 5937 + }, + { + "epoch": 1.6448753462603878, + "grad_norm": 0.677734375, + "learning_rate": 4.148684045100853e-06, + "loss": 0.5906, + "step": 5938 + }, + { + "epoch": 1.6451523545706372, + "grad_norm": 0.740846574306488, + "learning_rate": 4.148410184574211e-06, + "loss": 0.6357, + "step": 5939 + }, + { + "epoch": 1.6454293628808865, + "grad_norm": 0.6789989471435547, + "learning_rate": 4.148136289047609e-06, + "loss": 0.6532, + "step": 5940 + }, + { + "epoch": 1.6457063711911357, + "grad_norm": 0.6899898052215576, + "learning_rate": 4.147862358526867e-06, + "loss": 0.6571, + "step": 5941 + }, + { + "epoch": 1.6459833795013852, + "grad_norm": 0.6977363228797913, + "learning_rate": 4.147588393017798e-06, + "loss": 0.6629, + "step": 5942 + }, + { + "epoch": 1.6462603878116342, + "grad_norm": 0.7238714098930359, + "learning_rate": 4.147314392526221e-06, + "loss": 0.6504, + "step": 5943 + }, + { + "epoch": 1.6465373961218837, + "grad_norm": 0.6461185216903687, + "learning_rate": 4.147040357057951e-06, + "loss": 0.5962, + "step": 5944 + }, + { + "epoch": 1.6468144044321331, + "grad_norm": 0.7050908803939819, + "learning_rate": 4.146766286618808e-06, + "loss": 0.6689, + "step": 5945 + }, + { + "epoch": 1.6470914127423821, + "grad_norm": 0.6823935508728027, + "learning_rate": 4.146492181214613e-06, + "loss": 0.6609, + "step": 5946 + }, + { + "epoch": 1.6473684210526316, + "grad_norm": 0.7076802253723145, + "learning_rate": 4.146218040851185e-06, + "loss": 0.6364, + "step": 5947 + }, + { + "epoch": 1.6476454293628808, + "grad_norm": 0.6914806365966797, + "learning_rate": 4.145943865534341e-06, + "loss": 0.6272, + "step": 5948 + }, + { + "epoch": 1.64792243767313, + "grad_norm": 0.6949312090873718, + "learning_rate": 4.145669655269908e-06, + "loss": 0.6245, + "step": 5949 + }, + { + "epoch": 1.6481994459833795, + "grad_norm": 0.6770291924476624, + "learning_rate": 4.145395410063704e-06, + "loss": 0.6596, + "step": 5950 + }, + { + "epoch": 1.6484764542936288, + "grad_norm": 0.7212213277816772, + "learning_rate": 4.145121129921554e-06, + "loss": 0.6292, + "step": 5951 + }, + { + "epoch": 1.648753462603878, + "grad_norm": 0.6756653785705566, + "learning_rate": 4.144846814849282e-06, + "loss": 0.6754, + "step": 5952 + }, + { + "epoch": 1.6490304709141275, + "grad_norm": 0.7480176091194153, + "learning_rate": 4.144572464852711e-06, + "loss": 0.689, + "step": 5953 + }, + { + "epoch": 1.6493074792243767, + "grad_norm": 0.6765139698982239, + "learning_rate": 4.144298079937668e-06, + "loss": 0.5313, + "step": 5954 + }, + { + "epoch": 1.649584487534626, + "grad_norm": 0.7239553928375244, + "learning_rate": 4.144023660109976e-06, + "loss": 0.6449, + "step": 5955 + }, + { + "epoch": 1.6498614958448754, + "grad_norm": 0.6811375617980957, + "learning_rate": 4.143749205375464e-06, + "loss": 0.5836, + "step": 5956 + }, + { + "epoch": 1.6501385041551246, + "grad_norm": 0.7435874938964844, + "learning_rate": 4.143474715739958e-06, + "loss": 0.615, + "step": 5957 + }, + { + "epoch": 1.6504155124653739, + "grad_norm": 0.7210352420806885, + "learning_rate": 4.143200191209287e-06, + "loss": 0.6824, + "step": 5958 + }, + { + "epoch": 1.6506925207756233, + "grad_norm": 0.675553023815155, + "learning_rate": 4.142925631789281e-06, + "loss": 0.6526, + "step": 5959 + }, + { + "epoch": 1.6509695290858726, + "grad_norm": 0.6919329762458801, + "learning_rate": 4.142651037485766e-06, + "loss": 0.6208, + "step": 5960 + }, + { + "epoch": 1.6512465373961218, + "grad_norm": 0.6395980715751648, + "learning_rate": 4.142376408304577e-06, + "loss": 0.5776, + "step": 5961 + }, + { + "epoch": 1.6515235457063713, + "grad_norm": 0.6764992475509644, + "learning_rate": 4.142101744251541e-06, + "loss": 0.5655, + "step": 5962 + }, + { + "epoch": 1.6518005540166205, + "grad_norm": 0.6732890605926514, + "learning_rate": 4.141827045332493e-06, + "loss": 0.6311, + "step": 5963 + }, + { + "epoch": 1.6520775623268698, + "grad_norm": 0.6796002984046936, + "learning_rate": 4.141552311553262e-06, + "loss": 0.5606, + "step": 5964 + }, + { + "epoch": 1.6523545706371192, + "grad_norm": 0.7205073833465576, + "learning_rate": 4.141277542919685e-06, + "loss": 0.6066, + "step": 5965 + }, + { + "epoch": 1.6526315789473685, + "grad_norm": 0.6774910092353821, + "learning_rate": 4.141002739437594e-06, + "loss": 0.6294, + "step": 5966 + }, + { + "epoch": 1.6529085872576177, + "grad_norm": 0.6876561641693115, + "learning_rate": 4.140727901112824e-06, + "loss": 0.6169, + "step": 5967 + }, + { + "epoch": 1.6531855955678671, + "grad_norm": 0.7015146017074585, + "learning_rate": 4.140453027951211e-06, + "loss": 0.6245, + "step": 5968 + }, + { + "epoch": 1.6534626038781164, + "grad_norm": 0.6706700325012207, + "learning_rate": 4.140178119958591e-06, + "loss": 0.623, + "step": 5969 + }, + { + "epoch": 1.6537396121883656, + "grad_norm": 0.6837378740310669, + "learning_rate": 4.1399031771408005e-06, + "loss": 0.6311, + "step": 5970 + }, + { + "epoch": 1.654016620498615, + "grad_norm": 0.7064226865768433, + "learning_rate": 4.139628199503678e-06, + "loss": 0.6196, + "step": 5971 + }, + { + "epoch": 1.654293628808864, + "grad_norm": 0.7537009119987488, + "learning_rate": 4.139353187053061e-06, + "loss": 0.6401, + "step": 5972 + }, + { + "epoch": 1.6545706371191136, + "grad_norm": 0.694089949131012, + "learning_rate": 4.13907813979479e-06, + "loss": 0.6339, + "step": 5973 + }, + { + "epoch": 1.654847645429363, + "grad_norm": 0.7288523316383362, + "learning_rate": 4.138803057734705e-06, + "loss": 0.5988, + "step": 5974 + }, + { + "epoch": 1.655124653739612, + "grad_norm": 0.7179703116416931, + "learning_rate": 4.138527940878645e-06, + "loss": 0.6612, + "step": 5975 + }, + { + "epoch": 1.6554016620498615, + "grad_norm": 0.6668789982795715, + "learning_rate": 4.138252789232453e-06, + "loss": 0.6397, + "step": 5976 + }, + { + "epoch": 1.655678670360111, + "grad_norm": 0.6644570231437683, + "learning_rate": 4.137977602801971e-06, + "loss": 0.6482, + "step": 5977 + }, + { + "epoch": 1.65595567867036, + "grad_norm": 0.6780247092247009, + "learning_rate": 4.13770238159304e-06, + "loss": 0.6181, + "step": 5978 + }, + { + "epoch": 1.6562326869806094, + "grad_norm": 0.7090055346488953, + "learning_rate": 4.1374271256115066e-06, + "loss": 0.607, + "step": 5979 + }, + { + "epoch": 1.6565096952908587, + "grad_norm": 0.7301034927368164, + "learning_rate": 4.137151834863213e-06, + "loss": 0.6075, + "step": 5980 + }, + { + "epoch": 1.656786703601108, + "grad_norm": 0.6959425806999207, + "learning_rate": 4.136876509354005e-06, + "loss": 0.6439, + "step": 5981 + }, + { + "epoch": 1.6570637119113574, + "grad_norm": 0.709457516670227, + "learning_rate": 4.13660114908973e-06, + "loss": 0.6018, + "step": 5982 + }, + { + "epoch": 1.6573407202216066, + "grad_norm": 0.7351607084274292, + "learning_rate": 4.136325754076232e-06, + "loss": 0.6567, + "step": 5983 + }, + { + "epoch": 1.6576177285318558, + "grad_norm": 0.7283446192741394, + "learning_rate": 4.136050324319359e-06, + "loss": 0.6451, + "step": 5984 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.6645892858505249, + "learning_rate": 4.135774859824962e-06, + "loss": 0.622, + "step": 5985 + }, + { + "epoch": 1.6581717451523545, + "grad_norm": 0.7328609228134155, + "learning_rate": 4.135499360598884e-06, + "loss": 0.6407, + "step": 5986 + }, + { + "epoch": 1.6584487534626038, + "grad_norm": 0.7020002603530884, + "learning_rate": 4.13522382664698e-06, + "loss": 0.6337, + "step": 5987 + }, + { + "epoch": 1.6587257617728532, + "grad_norm": 0.6328570246696472, + "learning_rate": 4.1349482579750984e-06, + "loss": 0.5864, + "step": 5988 + }, + { + "epoch": 1.6590027700831025, + "grad_norm": 0.6584644317626953, + "learning_rate": 4.13467265458909e-06, + "loss": 0.6348, + "step": 5989 + }, + { + "epoch": 1.6592797783933517, + "grad_norm": 0.7188389897346497, + "learning_rate": 4.134397016494807e-06, + "loss": 0.6192, + "step": 5990 + }, + { + "epoch": 1.6595567867036012, + "grad_norm": 0.6980937719345093, + "learning_rate": 4.1341213436981e-06, + "loss": 0.6191, + "step": 5991 + }, + { + "epoch": 1.6598337950138504, + "grad_norm": 0.6944648027420044, + "learning_rate": 4.133845636204825e-06, + "loss": 0.608, + "step": 5992 + }, + { + "epoch": 1.6601108033240997, + "grad_norm": 0.7136276364326477, + "learning_rate": 4.1335698940208344e-06, + "loss": 0.6054, + "step": 5993 + }, + { + "epoch": 1.6603878116343491, + "grad_norm": 0.6488214731216431, + "learning_rate": 4.133294117151983e-06, + "loss": 0.5935, + "step": 5994 + }, + { + "epoch": 1.6606648199445984, + "grad_norm": 0.6956791877746582, + "learning_rate": 4.1330183056041275e-06, + "loss": 0.6339, + "step": 5995 + }, + { + "epoch": 1.6609418282548476, + "grad_norm": 0.7471428513526917, + "learning_rate": 4.132742459383122e-06, + "loss": 0.5852, + "step": 5996 + }, + { + "epoch": 1.661218836565097, + "grad_norm": 0.71979159116745, + "learning_rate": 4.132466578494825e-06, + "loss": 0.6563, + "step": 5997 + }, + { + "epoch": 1.6614958448753463, + "grad_norm": 0.6717140674591064, + "learning_rate": 4.132190662945093e-06, + "loss": 0.6134, + "step": 5998 + }, + { + "epoch": 1.6617728531855955, + "grad_norm": 0.6809756755828857, + "learning_rate": 4.131914712739786e-06, + "loss": 0.6311, + "step": 5999 + }, + { + "epoch": 1.662049861495845, + "grad_norm": 0.6761506795883179, + "learning_rate": 4.131638727884762e-06, + "loss": 0.6154, + "step": 6000 + }, + { + "epoch": 1.6623268698060942, + "grad_norm": 0.6920652389526367, + "learning_rate": 4.131362708385881e-06, + "loss": 0.6319, + "step": 6001 + }, + { + "epoch": 1.6626038781163435, + "grad_norm": 0.7034053206443787, + "learning_rate": 4.131086654249003e-06, + "loss": 0.5533, + "step": 6002 + }, + { + "epoch": 1.662880886426593, + "grad_norm": 0.7007937431335449, + "learning_rate": 4.130810565479991e-06, + "loss": 0.5813, + "step": 6003 + }, + { + "epoch": 1.663157894736842, + "grad_norm": 0.6734853386878967, + "learning_rate": 4.130534442084706e-06, + "loss": 0.5991, + "step": 6004 + }, + { + "epoch": 1.6634349030470914, + "grad_norm": 0.7057620286941528, + "learning_rate": 4.130258284069009e-06, + "loss": 0.5601, + "step": 6005 + }, + { + "epoch": 1.6637119113573409, + "grad_norm": 0.6708484292030334, + "learning_rate": 4.129982091438768e-06, + "loss": 0.613, + "step": 6006 + }, + { + "epoch": 1.6639889196675899, + "grad_norm": 0.682408332824707, + "learning_rate": 4.129705864199844e-06, + "loss": 0.5747, + "step": 6007 + }, + { + "epoch": 1.6642659279778393, + "grad_norm": 0.7112895846366882, + "learning_rate": 4.129429602358101e-06, + "loss": 0.6028, + "step": 6008 + }, + { + "epoch": 1.6645429362880888, + "grad_norm": 0.721603512763977, + "learning_rate": 4.129153305919408e-06, + "loss": 0.6482, + "step": 6009 + }, + { + "epoch": 1.6648199445983378, + "grad_norm": 0.6883713603019714, + "learning_rate": 4.12887697488963e-06, + "loss": 0.6276, + "step": 6010 + }, + { + "epoch": 1.6650969529085873, + "grad_norm": 0.6822021007537842, + "learning_rate": 4.128600609274633e-06, + "loss": 0.6277, + "step": 6011 + }, + { + "epoch": 1.6653739612188365, + "grad_norm": 0.7424676418304443, + "learning_rate": 4.128324209080288e-06, + "loss": 0.6671, + "step": 6012 + }, + { + "epoch": 1.6656509695290858, + "grad_norm": 0.6871791481971741, + "learning_rate": 4.12804777431246e-06, + "loss": 0.6383, + "step": 6013 + }, + { + "epoch": 1.6659279778393352, + "grad_norm": 0.7980043292045593, + "learning_rate": 4.127771304977021e-06, + "loss": 0.6583, + "step": 6014 + }, + { + "epoch": 1.6662049861495845, + "grad_norm": 0.6801428198814392, + "learning_rate": 4.12749480107984e-06, + "loss": 0.6019, + "step": 6015 + }, + { + "epoch": 1.6664819944598337, + "grad_norm": 0.6864743828773499, + "learning_rate": 4.127218262626788e-06, + "loss": 0.6221, + "step": 6016 + }, + { + "epoch": 1.6667590027700832, + "grad_norm": 0.7071241140365601, + "learning_rate": 4.126941689623737e-06, + "loss": 0.6464, + "step": 6017 + }, + { + "epoch": 1.6670360110803324, + "grad_norm": 0.6872143149375916, + "learning_rate": 4.126665082076559e-06, + "loss": 0.5996, + "step": 6018 + }, + { + "epoch": 1.6673130193905816, + "grad_norm": 0.6813023686408997, + "learning_rate": 4.126388439991128e-06, + "loss": 0.6098, + "step": 6019 + }, + { + "epoch": 1.667590027700831, + "grad_norm": 0.7234155535697937, + "learning_rate": 4.126111763373316e-06, + "loss": 0.5886, + "step": 6020 + }, + { + "epoch": 1.6678670360110803, + "grad_norm": 0.7097522020339966, + "learning_rate": 4.125835052228999e-06, + "loss": 0.6022, + "step": 6021 + }, + { + "epoch": 1.6681440443213296, + "grad_norm": 0.7106451392173767, + "learning_rate": 4.125558306564053e-06, + "loss": 0.6189, + "step": 6022 + }, + { + "epoch": 1.668421052631579, + "grad_norm": 0.6844403743743896, + "learning_rate": 4.125281526384352e-06, + "loss": 0.6278, + "step": 6023 + }, + { + "epoch": 1.6686980609418283, + "grad_norm": 0.6791717410087585, + "learning_rate": 4.125004711695773e-06, + "loss": 0.6461, + "step": 6024 + }, + { + "epoch": 1.6689750692520775, + "grad_norm": 0.6500537991523743, + "learning_rate": 4.124727862504196e-06, + "loss": 0.5991, + "step": 6025 + }, + { + "epoch": 1.669252077562327, + "grad_norm": 0.7625722289085388, + "learning_rate": 4.124450978815495e-06, + "loss": 0.6168, + "step": 6026 + }, + { + "epoch": 1.6695290858725762, + "grad_norm": 0.6701847910881042, + "learning_rate": 4.124174060635553e-06, + "loss": 0.6208, + "step": 6027 + }, + { + "epoch": 1.6698060941828254, + "grad_norm": 0.7106515765190125, + "learning_rate": 4.123897107970248e-06, + "loss": 0.6424, + "step": 6028 + }, + { + "epoch": 1.670083102493075, + "grad_norm": 0.6984321475028992, + "learning_rate": 4.123620120825459e-06, + "loss": 0.6732, + "step": 6029 + }, + { + "epoch": 1.6703601108033241, + "grad_norm": 0.7095187306404114, + "learning_rate": 4.12334309920707e-06, + "loss": 0.6538, + "step": 6030 + }, + { + "epoch": 1.6706371191135734, + "grad_norm": 0.7166954874992371, + "learning_rate": 4.1230660431209615e-06, + "loss": 0.6093, + "step": 6031 + }, + { + "epoch": 1.6709141274238228, + "grad_norm": 0.7276129722595215, + "learning_rate": 4.1227889525730156e-06, + "loss": 0.6483, + "step": 6032 + }, + { + "epoch": 1.671191135734072, + "grad_norm": 0.6965504884719849, + "learning_rate": 4.122511827569115e-06, + "loss": 0.5949, + "step": 6033 + }, + { + "epoch": 1.6714681440443213, + "grad_norm": 0.6962923407554626, + "learning_rate": 4.122234668115147e-06, + "loss": 0.6356, + "step": 6034 + }, + { + "epoch": 1.6717451523545708, + "grad_norm": 0.7430986762046814, + "learning_rate": 4.121957474216992e-06, + "loss": 0.6417, + "step": 6035 + }, + { + "epoch": 1.6720221606648198, + "grad_norm": 0.7319906949996948, + "learning_rate": 4.121680245880539e-06, + "loss": 0.6445, + "step": 6036 + }, + { + "epoch": 1.6722991689750693, + "grad_norm": 0.7240617275238037, + "learning_rate": 4.121402983111673e-06, + "loss": 0.6573, + "step": 6037 + }, + { + "epoch": 1.6725761772853187, + "grad_norm": 0.7075072526931763, + "learning_rate": 4.121125685916281e-06, + "loss": 0.6018, + "step": 6038 + }, + { + "epoch": 1.6728531855955677, + "grad_norm": 0.6763313412666321, + "learning_rate": 4.1208483543002514e-06, + "loss": 0.6343, + "step": 6039 + }, + { + "epoch": 1.6731301939058172, + "grad_norm": 0.7229200601577759, + "learning_rate": 4.120570988269472e-06, + "loss": 0.5932, + "step": 6040 + }, + { + "epoch": 1.6734072022160666, + "grad_norm": 0.7736415266990662, + "learning_rate": 4.120293587829831e-06, + "loss": 0.6361, + "step": 6041 + }, + { + "epoch": 1.6736842105263157, + "grad_norm": 0.6882472038269043, + "learning_rate": 4.12001615298722e-06, + "loss": 0.6095, + "step": 6042 + }, + { + "epoch": 1.6739612188365651, + "grad_norm": 0.7033710479736328, + "learning_rate": 4.119738683747528e-06, + "loss": 0.6688, + "step": 6043 + }, + { + "epoch": 1.6742382271468144, + "grad_norm": 0.6977055072784424, + "learning_rate": 4.119461180116649e-06, + "loss": 0.6318, + "step": 6044 + }, + { + "epoch": 1.6745152354570636, + "grad_norm": 0.7398744225502014, + "learning_rate": 4.1191836421004726e-06, + "loss": 0.6096, + "step": 6045 + }, + { + "epoch": 1.674792243767313, + "grad_norm": 0.6894487738609314, + "learning_rate": 4.118906069704892e-06, + "loss": 0.6415, + "step": 6046 + }, + { + "epoch": 1.6750692520775623, + "grad_norm": 0.6845730543136597, + "learning_rate": 4.118628462935803e-06, + "loss": 0.6172, + "step": 6047 + }, + { + "epoch": 1.6753462603878115, + "grad_norm": 0.6749402284622192, + "learning_rate": 4.1183508217990964e-06, + "loss": 0.6417, + "step": 6048 + }, + { + "epoch": 1.675623268698061, + "grad_norm": 0.6568241715431213, + "learning_rate": 4.118073146300671e-06, + "loss": 0.6279, + "step": 6049 + }, + { + "epoch": 1.6759002770083102, + "grad_norm": 0.7095878720283508, + "learning_rate": 4.117795436446418e-06, + "loss": 0.6294, + "step": 6050 + }, + { + "epoch": 1.6761772853185595, + "grad_norm": 0.7073789238929749, + "learning_rate": 4.117517692242239e-06, + "loss": 0.637, + "step": 6051 + }, + { + "epoch": 1.676454293628809, + "grad_norm": 0.6689110398292542, + "learning_rate": 4.117239913694027e-06, + "loss": 0.6159, + "step": 6052 + }, + { + "epoch": 1.6767313019390582, + "grad_norm": 0.7140373587608337, + "learning_rate": 4.116962100807682e-06, + "loss": 0.594, + "step": 6053 + }, + { + "epoch": 1.6770083102493074, + "grad_norm": 0.7038289904594421, + "learning_rate": 4.116684253589102e-06, + "loss": 0.5982, + "step": 6054 + }, + { + "epoch": 1.6772853185595569, + "grad_norm": 0.6871324777603149, + "learning_rate": 4.116406372044188e-06, + "loss": 0.6365, + "step": 6055 + }, + { + "epoch": 1.677562326869806, + "grad_norm": 0.6705648899078369, + "learning_rate": 4.1161284561788385e-06, + "loss": 0.6244, + "step": 6056 + }, + { + "epoch": 1.6778393351800553, + "grad_norm": 0.6981586813926697, + "learning_rate": 4.115850505998954e-06, + "loss": 0.6711, + "step": 6057 + }, + { + "epoch": 1.6781163434903048, + "grad_norm": 0.6764851212501526, + "learning_rate": 4.115572521510438e-06, + "loss": 0.6196, + "step": 6058 + }, + { + "epoch": 1.678393351800554, + "grad_norm": 0.6880233883857727, + "learning_rate": 4.11529450271919e-06, + "loss": 0.6334, + "step": 6059 + }, + { + "epoch": 1.6786703601108033, + "grad_norm": 0.6908855438232422, + "learning_rate": 4.115016449631116e-06, + "loss": 0.6166, + "step": 6060 + }, + { + "epoch": 1.6789473684210527, + "grad_norm": 0.6814182996749878, + "learning_rate": 4.1147383622521174e-06, + "loss": 0.594, + "step": 6061 + }, + { + "epoch": 1.679224376731302, + "grad_norm": 0.6829125285148621, + "learning_rate": 4.114460240588101e-06, + "loss": 0.5736, + "step": 6062 + }, + { + "epoch": 1.6795013850415512, + "grad_norm": 0.7400720119476318, + "learning_rate": 4.11418208464497e-06, + "loss": 0.6244, + "step": 6063 + }, + { + "epoch": 1.6797783933518007, + "grad_norm": 0.7241595983505249, + "learning_rate": 4.113903894428632e-06, + "loss": 0.6691, + "step": 6064 + }, + { + "epoch": 1.6800554016620497, + "grad_norm": 0.7304342985153198, + "learning_rate": 4.113625669944992e-06, + "loss": 0.6692, + "step": 6065 + }, + { + "epoch": 1.6803324099722992, + "grad_norm": 0.7147141098976135, + "learning_rate": 4.1133474111999584e-06, + "loss": 0.6329, + "step": 6066 + }, + { + "epoch": 1.6806094182825486, + "grad_norm": 0.7014601826667786, + "learning_rate": 4.1130691181994395e-06, + "loss": 0.6164, + "step": 6067 + }, + { + "epoch": 1.6808864265927976, + "grad_norm": 0.7008762359619141, + "learning_rate": 4.112790790949344e-06, + "loss": 0.6395, + "step": 6068 + }, + { + "epoch": 1.681163434903047, + "grad_norm": 0.726631760597229, + "learning_rate": 4.112512429455581e-06, + "loss": 0.6034, + "step": 6069 + }, + { + "epoch": 1.6814404432132966, + "grad_norm": 0.6704750061035156, + "learning_rate": 4.112234033724061e-06, + "loss": 0.6053, + "step": 6070 + }, + { + "epoch": 1.6817174515235456, + "grad_norm": 0.6997814178466797, + "learning_rate": 4.111955603760696e-06, + "loss": 0.6357, + "step": 6071 + }, + { + "epoch": 1.681994459833795, + "grad_norm": 0.682888388633728, + "learning_rate": 4.111677139571396e-06, + "loss": 0.5929, + "step": 6072 + }, + { + "epoch": 1.6822714681440443, + "grad_norm": 0.6795300245285034, + "learning_rate": 4.111398641162076e-06, + "loss": 0.6298, + "step": 6073 + }, + { + "epoch": 1.6825484764542935, + "grad_norm": 0.6853460669517517, + "learning_rate": 4.111120108538647e-06, + "loss": 0.627, + "step": 6074 + }, + { + "epoch": 1.682825484764543, + "grad_norm": 0.6448818445205688, + "learning_rate": 4.110841541707024e-06, + "loss": 0.5968, + "step": 6075 + }, + { + "epoch": 1.6831024930747922, + "grad_norm": 0.7064438462257385, + "learning_rate": 4.110562940673121e-06, + "loss": 0.6412, + "step": 6076 + }, + { + "epoch": 1.6833795013850414, + "grad_norm": 0.6554502248764038, + "learning_rate": 4.110284305442855e-06, + "loss": 0.5882, + "step": 6077 + }, + { + "epoch": 1.683656509695291, + "grad_norm": 0.7326308488845825, + "learning_rate": 4.110005636022138e-06, + "loss": 0.6333, + "step": 6078 + }, + { + "epoch": 1.6839335180055401, + "grad_norm": 0.6282971501350403, + "learning_rate": 4.109726932416893e-06, + "loss": 0.5892, + "step": 6079 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.6800413727760315, + "learning_rate": 4.109448194633033e-06, + "loss": 0.557, + "step": 6080 + }, + { + "epoch": 1.6844875346260388, + "grad_norm": 0.6659765243530273, + "learning_rate": 4.1091694226764784e-06, + "loss": 0.5852, + "step": 6081 + }, + { + "epoch": 1.684764542936288, + "grad_norm": 0.6778327226638794, + "learning_rate": 4.108890616553147e-06, + "loss": 0.6176, + "step": 6082 + }, + { + "epoch": 1.6850415512465373, + "grad_norm": 0.707545280456543, + "learning_rate": 4.1086117762689595e-06, + "loss": 0.6196, + "step": 6083 + }, + { + "epoch": 1.6853185595567868, + "grad_norm": 0.7330470085144043, + "learning_rate": 4.1083329018298356e-06, + "loss": 0.6483, + "step": 6084 + }, + { + "epoch": 1.685595567867036, + "grad_norm": 0.7315000891685486, + "learning_rate": 4.1080539932416974e-06, + "loss": 0.6321, + "step": 6085 + }, + { + "epoch": 1.6858725761772853, + "grad_norm": 0.6998733878135681, + "learning_rate": 4.107775050510467e-06, + "loss": 0.6541, + "step": 6086 + }, + { + "epoch": 1.6861495844875347, + "grad_norm": 0.6499519348144531, + "learning_rate": 4.107496073642064e-06, + "loss": 0.5905, + "step": 6087 + }, + { + "epoch": 1.686426592797784, + "grad_norm": 0.7056503295898438, + "learning_rate": 4.1072170626424166e-06, + "loss": 0.6332, + "step": 6088 + }, + { + "epoch": 1.6867036011080332, + "grad_norm": 0.6869191527366638, + "learning_rate": 4.106938017517446e-06, + "loss": 0.6453, + "step": 6089 + }, + { + "epoch": 1.6869806094182827, + "grad_norm": 0.7189652919769287, + "learning_rate": 4.106658938273078e-06, + "loss": 0.5499, + "step": 6090 + }, + { + "epoch": 1.6872576177285319, + "grad_norm": 0.6633101105690002, + "learning_rate": 4.106379824915237e-06, + "loss": 0.6392, + "step": 6091 + }, + { + "epoch": 1.6875346260387811, + "grad_norm": 0.7263966798782349, + "learning_rate": 4.106100677449851e-06, + "loss": 0.6561, + "step": 6092 + }, + { + "epoch": 1.6878116343490306, + "grad_norm": 0.7036128640174866, + "learning_rate": 4.105821495882845e-06, + "loss": 0.6357, + "step": 6093 + }, + { + "epoch": 1.6880886426592798, + "grad_norm": 0.6799331307411194, + "learning_rate": 4.105542280220148e-06, + "loss": 0.6067, + "step": 6094 + }, + { + "epoch": 1.688365650969529, + "grad_norm": 0.6819090843200684, + "learning_rate": 4.105263030467689e-06, + "loss": 0.6249, + "step": 6095 + }, + { + "epoch": 1.6886426592797785, + "grad_norm": 0.6934465169906616, + "learning_rate": 4.104983746631396e-06, + "loss": 0.6438, + "step": 6096 + }, + { + "epoch": 1.6889196675900275, + "grad_norm": 0.6890839338302612, + "learning_rate": 4.1047044287172e-06, + "loss": 0.6263, + "step": 6097 + }, + { + "epoch": 1.689196675900277, + "grad_norm": 0.7147068977355957, + "learning_rate": 4.10442507673103e-06, + "loss": 0.6331, + "step": 6098 + }, + { + "epoch": 1.6894736842105265, + "grad_norm": 0.6808082461357117, + "learning_rate": 4.104145690678819e-06, + "loss": 0.6405, + "step": 6099 + }, + { + "epoch": 1.6897506925207755, + "grad_norm": 0.6937206387519836, + "learning_rate": 4.103866270566498e-06, + "loss": 0.6308, + "step": 6100 + }, + { + "epoch": 1.690027700831025, + "grad_norm": 0.715852677822113, + "learning_rate": 4.103586816400001e-06, + "loss": 0.614, + "step": 6101 + }, + { + "epoch": 1.6903047091412744, + "grad_norm": 0.7389687299728394, + "learning_rate": 4.10330732818526e-06, + "loss": 0.6356, + "step": 6102 + }, + { + "epoch": 1.6905817174515234, + "grad_norm": 0.7088740468025208, + "learning_rate": 4.10302780592821e-06, + "loss": 0.6506, + "step": 6103 + }, + { + "epoch": 1.6908587257617729, + "grad_norm": 0.6934943199157715, + "learning_rate": 4.102748249634787e-06, + "loss": 0.6239, + "step": 6104 + }, + { + "epoch": 1.6911357340720221, + "grad_norm": 0.6720358729362488, + "learning_rate": 4.102468659310925e-06, + "loss": 0.6261, + "step": 6105 + }, + { + "epoch": 1.6914127423822714, + "grad_norm": 0.7021436095237732, + "learning_rate": 4.102189034962561e-06, + "loss": 0.6418, + "step": 6106 + }, + { + "epoch": 1.6916897506925208, + "grad_norm": 0.6990009546279907, + "learning_rate": 4.101909376595632e-06, + "loss": 0.6781, + "step": 6107 + }, + { + "epoch": 1.69196675900277, + "grad_norm": 0.6986948251724243, + "learning_rate": 4.101629684216076e-06, + "loss": 0.6628, + "step": 6108 + }, + { + "epoch": 1.6922437673130193, + "grad_norm": 0.6557834148406982, + "learning_rate": 4.101349957829833e-06, + "loss": 0.6234, + "step": 6109 + }, + { + "epoch": 1.6925207756232687, + "grad_norm": 0.7131497263908386, + "learning_rate": 4.1010701974428405e-06, + "loss": 0.6557, + "step": 6110 + }, + { + "epoch": 1.692797783933518, + "grad_norm": 0.7220354676246643, + "learning_rate": 4.100790403061038e-06, + "loss": 0.6288, + "step": 6111 + }, + { + "epoch": 1.6930747922437672, + "grad_norm": 0.7219642996788025, + "learning_rate": 4.100510574690368e-06, + "loss": 0.6509, + "step": 6112 + }, + { + "epoch": 1.6933518005540167, + "grad_norm": 0.770431399345398, + "learning_rate": 4.1002307123367716e-06, + "loss": 0.6359, + "step": 6113 + }, + { + "epoch": 1.693628808864266, + "grad_norm": 0.682773232460022, + "learning_rate": 4.099950816006189e-06, + "loss": 0.6319, + "step": 6114 + }, + { + "epoch": 1.6939058171745152, + "grad_norm": 0.6756408214569092, + "learning_rate": 4.099670885704566e-06, + "loss": 0.6226, + "step": 6115 + }, + { + "epoch": 1.6941828254847646, + "grad_norm": 0.7353326082229614, + "learning_rate": 4.099390921437845e-06, + "loss": 0.6501, + "step": 6116 + }, + { + "epoch": 1.6944598337950139, + "grad_norm": 0.7328805327415466, + "learning_rate": 4.099110923211971e-06, + "loss": 0.6325, + "step": 6117 + }, + { + "epoch": 1.694736842105263, + "grad_norm": 0.6881809234619141, + "learning_rate": 4.098830891032888e-06, + "loss": 0.6609, + "step": 6118 + }, + { + "epoch": 1.6950138504155126, + "grad_norm": 0.8122984766960144, + "learning_rate": 4.098550824906542e-06, + "loss": 0.6201, + "step": 6119 + }, + { + "epoch": 1.6952908587257618, + "grad_norm": 0.6945459842681885, + "learning_rate": 4.098270724838879e-06, + "loss": 0.5877, + "step": 6120 + }, + { + "epoch": 1.695567867036011, + "grad_norm": 0.6746237874031067, + "learning_rate": 4.097990590835847e-06, + "loss": 0.6191, + "step": 6121 + }, + { + "epoch": 1.6958448753462605, + "grad_norm": 0.7451200485229492, + "learning_rate": 4.097710422903395e-06, + "loss": 0.6232, + "step": 6122 + }, + { + "epoch": 1.6961218836565097, + "grad_norm": 0.7273604273796082, + "learning_rate": 4.0974302210474705e-06, + "loss": 0.6481, + "step": 6123 + }, + { + "epoch": 1.696398891966759, + "grad_norm": 0.6815860271453857, + "learning_rate": 4.0971499852740224e-06, + "loss": 0.5715, + "step": 6124 + }, + { + "epoch": 1.6966759002770084, + "grad_norm": 0.6721625328063965, + "learning_rate": 4.096869715589002e-06, + "loss": 0.6237, + "step": 6125 + }, + { + "epoch": 1.6969529085872577, + "grad_norm": 0.7191078066825867, + "learning_rate": 4.096589411998359e-06, + "loss": 0.6062, + "step": 6126 + }, + { + "epoch": 1.697229916897507, + "grad_norm": 0.7018681168556213, + "learning_rate": 4.096309074508046e-06, + "loss": 0.6266, + "step": 6127 + }, + { + "epoch": 1.6975069252077564, + "grad_norm": 0.6845425963401794, + "learning_rate": 4.096028703124014e-06, + "loss": 0.6761, + "step": 6128 + }, + { + "epoch": 1.6977839335180054, + "grad_norm": 0.7229567170143127, + "learning_rate": 4.095748297852218e-06, + "loss": 0.5982, + "step": 6129 + }, + { + "epoch": 1.6980609418282548, + "grad_norm": 0.6780923008918762, + "learning_rate": 4.09546785869861e-06, + "loss": 0.6243, + "step": 6130 + }, + { + "epoch": 1.6983379501385043, + "grad_norm": 0.7234211564064026, + "learning_rate": 4.095187385669145e-06, + "loss": 0.6738, + "step": 6131 + }, + { + "epoch": 1.6986149584487533, + "grad_norm": 0.6908296346664429, + "learning_rate": 4.094906878769779e-06, + "loss": 0.6236, + "step": 6132 + }, + { + "epoch": 1.6988919667590028, + "grad_norm": 0.6617946624755859, + "learning_rate": 4.094626338006466e-06, + "loss": 0.6388, + "step": 6133 + }, + { + "epoch": 1.6991689750692522, + "grad_norm": 0.7265128493309021, + "learning_rate": 4.094345763385163e-06, + "loss": 0.6631, + "step": 6134 + }, + { + "epoch": 1.6994459833795013, + "grad_norm": 0.6898639798164368, + "learning_rate": 4.094065154911829e-06, + "loss": 0.6443, + "step": 6135 + }, + { + "epoch": 1.6997229916897507, + "grad_norm": 0.6893454194068909, + "learning_rate": 4.09378451259242e-06, + "loss": 0.6131, + "step": 6136 + }, + { + "epoch": 1.7, + "grad_norm": 0.71174556016922, + "learning_rate": 4.093503836432897e-06, + "loss": 0.5904, + "step": 6137 + }, + { + "epoch": 1.7002770083102492, + "grad_norm": 0.7047994136810303, + "learning_rate": 4.093223126439217e-06, + "loss": 0.5866, + "step": 6138 + }, + { + "epoch": 1.7005540166204987, + "grad_norm": 0.7247219681739807, + "learning_rate": 4.092942382617342e-06, + "loss": 0.6405, + "step": 6139 + }, + { + "epoch": 1.700831024930748, + "grad_norm": 0.7214411497116089, + "learning_rate": 4.092661604973232e-06, + "loss": 0.6256, + "step": 6140 + }, + { + "epoch": 1.7011080332409971, + "grad_norm": 0.6483976244926453, + "learning_rate": 4.09238079351285e-06, + "loss": 0.6162, + "step": 6141 + }, + { + "epoch": 1.7013850415512466, + "grad_norm": 0.6869223713874817, + "learning_rate": 4.092099948242156e-06, + "loss": 0.6235, + "step": 6142 + }, + { + "epoch": 1.7016620498614958, + "grad_norm": 0.7167564034461975, + "learning_rate": 4.0918190691671145e-06, + "loss": 0.6512, + "step": 6143 + }, + { + "epoch": 1.701939058171745, + "grad_norm": 0.6702075600624084, + "learning_rate": 4.09153815629369e-06, + "loss": 0.6293, + "step": 6144 + }, + { + "epoch": 1.7022160664819945, + "grad_norm": 0.6951959729194641, + "learning_rate": 4.091257209627845e-06, + "loss": 0.5995, + "step": 6145 + }, + { + "epoch": 1.7024930747922438, + "grad_norm": 0.7316385507583618, + "learning_rate": 4.090976229175546e-06, + "loss": 0.6548, + "step": 6146 + }, + { + "epoch": 1.702770083102493, + "grad_norm": 0.7311107516288757, + "learning_rate": 4.090695214942759e-06, + "loss": 0.6865, + "step": 6147 + }, + { + "epoch": 1.7030470914127425, + "grad_norm": 0.663872241973877, + "learning_rate": 4.09041416693545e-06, + "loss": 0.6137, + "step": 6148 + }, + { + "epoch": 1.7033240997229917, + "grad_norm": 0.6457836627960205, + "learning_rate": 4.0901330851595874e-06, + "loss": 0.6149, + "step": 6149 + }, + { + "epoch": 1.703601108033241, + "grad_norm": 0.6875283122062683, + "learning_rate": 4.089851969621138e-06, + "loss": 0.6416, + "step": 6150 + }, + { + "epoch": 1.7038781163434904, + "grad_norm": 0.6898400783538818, + "learning_rate": 4.089570820326072e-06, + "loss": 0.5711, + "step": 6151 + }, + { + "epoch": 1.7041551246537396, + "grad_norm": 0.7291202545166016, + "learning_rate": 4.089289637280358e-06, + "loss": 0.6284, + "step": 6152 + }, + { + "epoch": 1.7044321329639889, + "grad_norm": 0.7050850987434387, + "learning_rate": 4.089008420489966e-06, + "loss": 0.6316, + "step": 6153 + }, + { + "epoch": 1.7047091412742383, + "grad_norm": 0.6834461092948914, + "learning_rate": 4.0887271699608675e-06, + "loss": 0.6267, + "step": 6154 + }, + { + "epoch": 1.7049861495844876, + "grad_norm": 0.7263997197151184, + "learning_rate": 4.088445885699034e-06, + "loss": 0.648, + "step": 6155 + }, + { + "epoch": 1.7052631578947368, + "grad_norm": 0.7006638646125793, + "learning_rate": 4.088164567710438e-06, + "loss": 0.6418, + "step": 6156 + }, + { + "epoch": 1.7055401662049863, + "grad_norm": 0.6719088554382324, + "learning_rate": 4.087883216001052e-06, + "loss": 0.6188, + "step": 6157 + }, + { + "epoch": 1.7058171745152355, + "grad_norm": 0.6723303198814392, + "learning_rate": 4.08760183057685e-06, + "loss": 0.5958, + "step": 6158 + }, + { + "epoch": 1.7060941828254848, + "grad_norm": 0.7296972274780273, + "learning_rate": 4.087320411443808e-06, + "loss": 0.6476, + "step": 6159 + }, + { + "epoch": 1.7063711911357342, + "grad_norm": 0.7308874130249023, + "learning_rate": 4.0870389586079e-06, + "loss": 0.6427, + "step": 6160 + }, + { + "epoch": 1.7066481994459832, + "grad_norm": 0.6853946447372437, + "learning_rate": 4.086757472075102e-06, + "loss": 0.6455, + "step": 6161 + }, + { + "epoch": 1.7069252077562327, + "grad_norm": 0.6351714730262756, + "learning_rate": 4.08647595185139e-06, + "loss": 0.6193, + "step": 6162 + }, + { + "epoch": 1.7072022160664821, + "grad_norm": 0.7018414735794067, + "learning_rate": 4.0861943979427424e-06, + "loss": 0.6611, + "step": 6163 + }, + { + "epoch": 1.7074792243767312, + "grad_norm": 0.742825448513031, + "learning_rate": 4.085912810355137e-06, + "loss": 0.6531, + "step": 6164 + }, + { + "epoch": 1.7077562326869806, + "grad_norm": 0.685265302658081, + "learning_rate": 4.085631189094553e-06, + "loss": 0.6116, + "step": 6165 + }, + { + "epoch": 1.7080332409972299, + "grad_norm": 0.7166582345962524, + "learning_rate": 4.08534953416697e-06, + "loss": 0.6211, + "step": 6166 + }, + { + "epoch": 1.708310249307479, + "grad_norm": 0.7106016874313354, + "learning_rate": 4.085067845578368e-06, + "loss": 0.6188, + "step": 6167 + }, + { + "epoch": 1.7085872576177286, + "grad_norm": 0.6873186230659485, + "learning_rate": 4.084786123334726e-06, + "loss": 0.5869, + "step": 6168 + }, + { + "epoch": 1.7088642659279778, + "grad_norm": 0.670590877532959, + "learning_rate": 4.084504367442028e-06, + "loss": 0.6009, + "step": 6169 + }, + { + "epoch": 1.709141274238227, + "grad_norm": 0.668232262134552, + "learning_rate": 4.084222577906257e-06, + "loss": 0.6048, + "step": 6170 + }, + { + "epoch": 1.7094182825484765, + "grad_norm": 0.6711857914924622, + "learning_rate": 4.083940754733394e-06, + "loss": 0.6295, + "step": 6171 + }, + { + "epoch": 1.7096952908587257, + "grad_norm": 0.7261093258857727, + "learning_rate": 4.083658897929425e-06, + "loss": 0.6218, + "step": 6172 + }, + { + "epoch": 1.709972299168975, + "grad_norm": 0.7620695233345032, + "learning_rate": 4.083377007500333e-06, + "loss": 0.6085, + "step": 6173 + }, + { + "epoch": 1.7102493074792244, + "grad_norm": 0.7104936242103577, + "learning_rate": 4.083095083452103e-06, + "loss": 0.6665, + "step": 6174 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6915191411972046, + "learning_rate": 4.082813125790722e-06, + "loss": 0.628, + "step": 6175 + }, + { + "epoch": 1.710803324099723, + "grad_norm": 0.6785016059875488, + "learning_rate": 4.082531134522176e-06, + "loss": 0.6056, + "step": 6176 + }, + { + "epoch": 1.7110803324099724, + "grad_norm": 0.7069738507270813, + "learning_rate": 4.082249109652453e-06, + "loss": 0.6552, + "step": 6177 + }, + { + "epoch": 1.7113573407202216, + "grad_norm": 0.6891471743583679, + "learning_rate": 4.081967051187541e-06, + "loss": 0.6242, + "step": 6178 + }, + { + "epoch": 1.7116343490304708, + "grad_norm": 0.7005449533462524, + "learning_rate": 4.0816849591334284e-06, + "loss": 0.6649, + "step": 6179 + }, + { + "epoch": 1.7119113573407203, + "grad_norm": 0.717957615852356, + "learning_rate": 4.081402833496105e-06, + "loss": 0.6159, + "step": 6180 + }, + { + "epoch": 1.7121883656509695, + "grad_norm": 0.6962898969650269, + "learning_rate": 4.08112067428156e-06, + "loss": 0.6841, + "step": 6181 + }, + { + "epoch": 1.7124653739612188, + "grad_norm": 0.677981972694397, + "learning_rate": 4.080838481495786e-06, + "loss": 0.595, + "step": 6182 + }, + { + "epoch": 1.7127423822714682, + "grad_norm": 0.6928604245185852, + "learning_rate": 4.080556255144775e-06, + "loss": 0.6615, + "step": 6183 + }, + { + "epoch": 1.7130193905817175, + "grad_norm": 0.7054057121276855, + "learning_rate": 4.0802739952345175e-06, + "loss": 0.5946, + "step": 6184 + }, + { + "epoch": 1.7132963988919667, + "grad_norm": 0.7249715328216553, + "learning_rate": 4.079991701771007e-06, + "loss": 0.6842, + "step": 6185 + }, + { + "epoch": 1.7135734072022162, + "grad_norm": 0.6929246187210083, + "learning_rate": 4.079709374760239e-06, + "loss": 0.6191, + "step": 6186 + }, + { + "epoch": 1.7138504155124654, + "grad_norm": 0.7572833299636841, + "learning_rate": 4.079427014208206e-06, + "loss": 0.6109, + "step": 6187 + }, + { + "epoch": 1.7141274238227147, + "grad_norm": 0.6429564356803894, + "learning_rate": 4.079144620120904e-06, + "loss": 0.5983, + "step": 6188 + }, + { + "epoch": 1.7144044321329641, + "grad_norm": 0.6844918727874756, + "learning_rate": 4.0788621925043294e-06, + "loss": 0.6029, + "step": 6189 + }, + { + "epoch": 1.7146814404432131, + "grad_norm": 0.6932576298713684, + "learning_rate": 4.078579731364478e-06, + "loss": 0.6118, + "step": 6190 + }, + { + "epoch": 1.7149584487534626, + "grad_norm": 0.691020131111145, + "learning_rate": 4.078297236707348e-06, + "loss": 0.6609, + "step": 6191 + }, + { + "epoch": 1.715235457063712, + "grad_norm": 0.7045336365699768, + "learning_rate": 4.078014708538938e-06, + "loss": 0.6435, + "step": 6192 + }, + { + "epoch": 1.715512465373961, + "grad_norm": 0.6789292097091675, + "learning_rate": 4.0777321468652444e-06, + "loss": 0.6268, + "step": 6193 + }, + { + "epoch": 1.7157894736842105, + "grad_norm": 0.6860518455505371, + "learning_rate": 4.077449551692268e-06, + "loss": 0.6457, + "step": 6194 + }, + { + "epoch": 1.71606648199446, + "grad_norm": 0.6922401189804077, + "learning_rate": 4.077166923026012e-06, + "loss": 0.6315, + "step": 6195 + }, + { + "epoch": 1.716343490304709, + "grad_norm": 0.7087437510490417, + "learning_rate": 4.076884260872472e-06, + "loss": 0.6008, + "step": 6196 + }, + { + "epoch": 1.7166204986149585, + "grad_norm": 0.6820953488349915, + "learning_rate": 4.076601565237653e-06, + "loss": 0.6221, + "step": 6197 + }, + { + "epoch": 1.7168975069252077, + "grad_norm": 0.7251583337783813, + "learning_rate": 4.076318836127556e-06, + "loss": 0.6201, + "step": 6198 + }, + { + "epoch": 1.717174515235457, + "grad_norm": 0.7204704880714417, + "learning_rate": 4.076036073548185e-06, + "loss": 0.6396, + "step": 6199 + }, + { + "epoch": 1.7174515235457064, + "grad_norm": 0.6726484894752502, + "learning_rate": 4.075753277505544e-06, + "loss": 0.6495, + "step": 6200 + }, + { + "epoch": 1.7177285318559556, + "grad_norm": 0.7026734352111816, + "learning_rate": 4.075470448005636e-06, + "loss": 0.6056, + "step": 6201 + }, + { + "epoch": 1.7180055401662049, + "grad_norm": 0.704633891582489, + "learning_rate": 4.075187585054468e-06, + "loss": 0.5745, + "step": 6202 + }, + { + "epoch": 1.7182825484764543, + "grad_norm": 0.7028330564498901, + "learning_rate": 4.0749046886580444e-06, + "loss": 0.6352, + "step": 6203 + }, + { + "epoch": 1.7185595567867036, + "grad_norm": 0.6898366808891296, + "learning_rate": 4.074621758822373e-06, + "loss": 0.6102, + "step": 6204 + }, + { + "epoch": 1.7188365650969528, + "grad_norm": 0.727149248123169, + "learning_rate": 4.074338795553459e-06, + "loss": 0.626, + "step": 6205 + }, + { + "epoch": 1.7191135734072023, + "grad_norm": 0.6894062161445618, + "learning_rate": 4.074055798857314e-06, + "loss": 0.6292, + "step": 6206 + }, + { + "epoch": 1.7193905817174515, + "grad_norm": 0.9477318525314331, + "learning_rate": 4.073772768739943e-06, + "loss": 0.6284, + "step": 6207 + }, + { + "epoch": 1.7196675900277008, + "grad_norm": 0.7123240232467651, + "learning_rate": 4.073489705207358e-06, + "loss": 0.6025, + "step": 6208 + }, + { + "epoch": 1.7199445983379502, + "grad_norm": 0.7271552085876465, + "learning_rate": 4.073206608265567e-06, + "loss": 0.6025, + "step": 6209 + }, + { + "epoch": 1.7202216066481995, + "grad_norm": 0.7003310918807983, + "learning_rate": 4.072923477920584e-06, + "loss": 0.6437, + "step": 6210 + }, + { + "epoch": 1.7204986149584487, + "grad_norm": 0.6652795672416687, + "learning_rate": 4.072640314178418e-06, + "loss": 0.5818, + "step": 6211 + }, + { + "epoch": 1.7207756232686982, + "grad_norm": 0.7066246867179871, + "learning_rate": 4.072357117045082e-06, + "loss": 0.65, + "step": 6212 + }, + { + "epoch": 1.7210526315789474, + "grad_norm": 0.7616447806358337, + "learning_rate": 4.072073886526589e-06, + "loss": 0.6075, + "step": 6213 + }, + { + "epoch": 1.7213296398891966, + "grad_norm": 0.7162469029426575, + "learning_rate": 4.071790622628953e-06, + "loss": 0.6344, + "step": 6214 + }, + { + "epoch": 1.721606648199446, + "grad_norm": 0.6894190311431885, + "learning_rate": 4.071507325358188e-06, + "loss": 0.5557, + "step": 6215 + }, + { + "epoch": 1.7218836565096953, + "grad_norm": 0.7193700075149536, + "learning_rate": 4.071223994720309e-06, + "loss": 0.6059, + "step": 6216 + }, + { + "epoch": 1.7221606648199446, + "grad_norm": 0.7540478706359863, + "learning_rate": 4.070940630721333e-06, + "loss": 0.6264, + "step": 6217 + }, + { + "epoch": 1.722437673130194, + "grad_norm": 0.6955430507659912, + "learning_rate": 4.070657233367274e-06, + "loss": 0.6087, + "step": 6218 + }, + { + "epoch": 1.7227146814404433, + "grad_norm": 0.6968384385108948, + "learning_rate": 4.070373802664153e-06, + "loss": 0.6091, + "step": 6219 + }, + { + "epoch": 1.7229916897506925, + "grad_norm": 0.7160845398902893, + "learning_rate": 4.070090338617985e-06, + "loss": 0.6538, + "step": 6220 + }, + { + "epoch": 1.723268698060942, + "grad_norm": 0.7012823820114136, + "learning_rate": 4.069806841234789e-06, + "loss": 0.6606, + "step": 6221 + }, + { + "epoch": 1.723545706371191, + "grad_norm": 0.6941177845001221, + "learning_rate": 4.069523310520585e-06, + "loss": 0.6361, + "step": 6222 + }, + { + "epoch": 1.7238227146814404, + "grad_norm": 0.6887818574905396, + "learning_rate": 4.069239746481394e-06, + "loss": 0.6145, + "step": 6223 + }, + { + "epoch": 1.72409972299169, + "grad_norm": 0.7334598302841187, + "learning_rate": 4.068956149123235e-06, + "loss": 0.6769, + "step": 6224 + }, + { + "epoch": 1.724376731301939, + "grad_norm": 0.697456419467926, + "learning_rate": 4.06867251845213e-06, + "loss": 0.5673, + "step": 6225 + }, + { + "epoch": 1.7246537396121884, + "grad_norm": 0.6590297222137451, + "learning_rate": 4.0683888544741024e-06, + "loss": 0.5588, + "step": 6226 + }, + { + "epoch": 1.7249307479224378, + "grad_norm": 0.7389965653419495, + "learning_rate": 4.068105157195173e-06, + "loss": 0.6293, + "step": 6227 + }, + { + "epoch": 1.7252077562326869, + "grad_norm": 0.6632516980171204, + "learning_rate": 4.067821426621368e-06, + "loss": 0.6334, + "step": 6228 + }, + { + "epoch": 1.7254847645429363, + "grad_norm": 0.7266767024993896, + "learning_rate": 4.0675376627587095e-06, + "loss": 0.6003, + "step": 6229 + }, + { + "epoch": 1.7257617728531855, + "grad_norm": 0.6739516854286194, + "learning_rate": 4.067253865613223e-06, + "loss": 0.6102, + "step": 6230 + }, + { + "epoch": 1.7260387811634348, + "grad_norm": 0.6888447403907776, + "learning_rate": 4.066970035190935e-06, + "loss": 0.627, + "step": 6231 + }, + { + "epoch": 1.7263157894736842, + "grad_norm": 0.7709148526191711, + "learning_rate": 4.066686171497872e-06, + "loss": 0.6326, + "step": 6232 + }, + { + "epoch": 1.7265927977839335, + "grad_norm": 0.6897366046905518, + "learning_rate": 4.06640227454006e-06, + "loss": 0.6276, + "step": 6233 + }, + { + "epoch": 1.7268698060941827, + "grad_norm": 0.6722098588943481, + "learning_rate": 4.066118344323528e-06, + "loss": 0.6011, + "step": 6234 + }, + { + "epoch": 1.7271468144044322, + "grad_norm": 0.7252819538116455, + "learning_rate": 4.065834380854305e-06, + "loss": 0.6208, + "step": 6235 + }, + { + "epoch": 1.7274238227146814, + "grad_norm": 0.7158248424530029, + "learning_rate": 4.065550384138417e-06, + "loss": 0.6293, + "step": 6236 + }, + { + "epoch": 1.7277008310249307, + "grad_norm": 0.7303574681282043, + "learning_rate": 4.065266354181898e-06, + "loss": 0.6289, + "step": 6237 + }, + { + "epoch": 1.7279778393351801, + "grad_norm": 0.6661691069602966, + "learning_rate": 4.064982290990777e-06, + "loss": 0.6345, + "step": 6238 + }, + { + "epoch": 1.7282548476454294, + "grad_norm": 0.7096925377845764, + "learning_rate": 4.064698194571085e-06, + "loss": 0.6219, + "step": 6239 + }, + { + "epoch": 1.7285318559556786, + "grad_norm": 0.7123353481292725, + "learning_rate": 4.064414064928854e-06, + "loss": 0.6726, + "step": 6240 + }, + { + "epoch": 1.728808864265928, + "grad_norm": 0.6776770353317261, + "learning_rate": 4.064129902070118e-06, + "loss": 0.6325, + "step": 6241 + }, + { + "epoch": 1.7290858725761773, + "grad_norm": 0.727475643157959, + "learning_rate": 4.063845706000911e-06, + "loss": 0.6176, + "step": 6242 + }, + { + "epoch": 1.7293628808864265, + "grad_norm": 0.6917515397071838, + "learning_rate": 4.063561476727265e-06, + "loss": 0.623, + "step": 6243 + }, + { + "epoch": 1.729639889196676, + "grad_norm": 0.6911443471908569, + "learning_rate": 4.0632772142552155e-06, + "loss": 0.6079, + "step": 6244 + }, + { + "epoch": 1.7299168975069252, + "grad_norm": 0.6569199562072754, + "learning_rate": 4.062992918590799e-06, + "loss": 0.6144, + "step": 6245 + }, + { + "epoch": 1.7301939058171745, + "grad_norm": 0.6976788640022278, + "learning_rate": 4.062708589740051e-06, + "loss": 0.6127, + "step": 6246 + }, + { + "epoch": 1.730470914127424, + "grad_norm": 0.6994149684906006, + "learning_rate": 4.06242422770901e-06, + "loss": 0.6107, + "step": 6247 + }, + { + "epoch": 1.7307479224376732, + "grad_norm": 0.7247564792633057, + "learning_rate": 4.062139832503711e-06, + "loss": 0.653, + "step": 6248 + }, + { + "epoch": 1.7310249307479224, + "grad_norm": 0.6729344129562378, + "learning_rate": 4.061855404130195e-06, + "loss": 0.6358, + "step": 6249 + }, + { + "epoch": 1.7313019390581719, + "grad_norm": 0.7238703370094299, + "learning_rate": 4.0615709425945e-06, + "loss": 0.6623, + "step": 6250 + }, + { + "epoch": 1.731578947368421, + "grad_norm": 0.6958584785461426, + "learning_rate": 4.061286447902667e-06, + "loss": 0.6162, + "step": 6251 + }, + { + "epoch": 1.7318559556786703, + "grad_norm": 0.7200596332550049, + "learning_rate": 4.061001920060735e-06, + "loss": 0.6552, + "step": 6252 + }, + { + "epoch": 1.7321329639889198, + "grad_norm": 0.6930348873138428, + "learning_rate": 4.0607173590747455e-06, + "loss": 0.5934, + "step": 6253 + }, + { + "epoch": 1.7324099722991688, + "grad_norm": 0.6601454615592957, + "learning_rate": 4.060432764950741e-06, + "loss": 0.6083, + "step": 6254 + }, + { + "epoch": 1.7326869806094183, + "grad_norm": 0.6737957000732422, + "learning_rate": 4.0601481376947634e-06, + "loss": 0.633, + "step": 6255 + }, + { + "epoch": 1.7329639889196677, + "grad_norm": 0.6880027651786804, + "learning_rate": 4.059863477312858e-06, + "loss": 0.607, + "step": 6256 + }, + { + "epoch": 1.7332409972299168, + "grad_norm": 0.6753184199333191, + "learning_rate": 4.059578783811067e-06, + "loss": 0.6336, + "step": 6257 + }, + { + "epoch": 1.7335180055401662, + "grad_norm": 0.6858214735984802, + "learning_rate": 4.059294057195435e-06, + "loss": 0.6702, + "step": 6258 + }, + { + "epoch": 1.7337950138504157, + "grad_norm": 0.6795182228088379, + "learning_rate": 4.059009297472009e-06, + "loss": 0.6329, + "step": 6259 + }, + { + "epoch": 1.7340720221606647, + "grad_norm": 0.6773170232772827, + "learning_rate": 4.058724504646834e-06, + "loss": 0.6339, + "step": 6260 + }, + { + "epoch": 1.7343490304709142, + "grad_norm": 0.6879174709320068, + "learning_rate": 4.058439678725957e-06, + "loss": 0.6261, + "step": 6261 + }, + { + "epoch": 1.7346260387811634, + "grad_norm": 0.7096613645553589, + "learning_rate": 4.058154819715426e-06, + "loss": 0.647, + "step": 6262 + }, + { + "epoch": 1.7349030470914126, + "grad_norm": 0.695104718208313, + "learning_rate": 4.05786992762129e-06, + "loss": 0.6142, + "step": 6263 + }, + { + "epoch": 1.735180055401662, + "grad_norm": 0.6822800636291504, + "learning_rate": 4.057585002449596e-06, + "loss": 0.6559, + "step": 6264 + }, + { + "epoch": 1.7354570637119113, + "grad_norm": 0.7518125176429749, + "learning_rate": 4.057300044206395e-06, + "loss": 0.636, + "step": 6265 + }, + { + "epoch": 1.7357340720221606, + "grad_norm": 0.6837979555130005, + "learning_rate": 4.057015052897737e-06, + "loss": 0.6183, + "step": 6266 + }, + { + "epoch": 1.73601108033241, + "grad_norm": 0.7142424583435059, + "learning_rate": 4.056730028529673e-06, + "loss": 0.6499, + "step": 6267 + }, + { + "epoch": 1.7362880886426593, + "grad_norm": 0.7081639170646667, + "learning_rate": 4.0564449711082555e-06, + "loss": 0.6267, + "step": 6268 + }, + { + "epoch": 1.7365650969529085, + "grad_norm": 0.6069878935813904, + "learning_rate": 4.0561598806395365e-06, + "loss": 0.5292, + "step": 6269 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.6596729755401611, + "learning_rate": 4.055874757129569e-06, + "loss": 0.591, + "step": 6270 + }, + { + "epoch": 1.7371191135734072, + "grad_norm": 0.7232509851455688, + "learning_rate": 4.055589600584407e-06, + "loss": 0.6362, + "step": 6271 + }, + { + "epoch": 1.7373961218836564, + "grad_norm": 0.6596758961677551, + "learning_rate": 4.055304411010106e-06, + "loss": 0.627, + "step": 6272 + }, + { + "epoch": 1.737673130193906, + "grad_norm": 0.6614773273468018, + "learning_rate": 4.05501918841272e-06, + "loss": 0.5894, + "step": 6273 + }, + { + "epoch": 1.7379501385041551, + "grad_norm": 0.7271131873130798, + "learning_rate": 4.054733932798306e-06, + "loss": 0.6117, + "step": 6274 + }, + { + "epoch": 1.7382271468144044, + "grad_norm": 0.6962509155273438, + "learning_rate": 4.0544486441729204e-06, + "loss": 0.6136, + "step": 6275 + }, + { + "epoch": 1.7385041551246538, + "grad_norm": 0.7124279737472534, + "learning_rate": 4.054163322542619e-06, + "loss": 0.6045, + "step": 6276 + }, + { + "epoch": 1.738781163434903, + "grad_norm": 0.7210063338279724, + "learning_rate": 4.053877967913463e-06, + "loss": 0.6177, + "step": 6277 + }, + { + "epoch": 1.7390581717451523, + "grad_norm": 0.7171337008476257, + "learning_rate": 4.053592580291509e-06, + "loss": 0.6144, + "step": 6278 + }, + { + "epoch": 1.7393351800554018, + "grad_norm": 0.6800563931465149, + "learning_rate": 4.053307159682817e-06, + "loss": 0.6012, + "step": 6279 + }, + { + "epoch": 1.739612188365651, + "grad_norm": 0.725743293762207, + "learning_rate": 4.053021706093447e-06, + "loss": 0.6271, + "step": 6280 + }, + { + "epoch": 1.7398891966759003, + "grad_norm": 0.7025867700576782, + "learning_rate": 4.05273621952946e-06, + "loss": 0.5914, + "step": 6281 + }, + { + "epoch": 1.7401662049861497, + "grad_norm": 0.6952527761459351, + "learning_rate": 4.0524506999969185e-06, + "loss": 0.6406, + "step": 6282 + }, + { + "epoch": 1.7404432132963987, + "grad_norm": 0.7095229029655457, + "learning_rate": 4.052165147501884e-06, + "loss": 0.6267, + "step": 6283 + }, + { + "epoch": 1.7407202216066482, + "grad_norm": 0.6685070991516113, + "learning_rate": 4.051879562050419e-06, + "loss": 0.6507, + "step": 6284 + }, + { + "epoch": 1.7409972299168976, + "grad_norm": 0.7472874522209167, + "learning_rate": 4.051593943648588e-06, + "loss": 0.6187, + "step": 6285 + }, + { + "epoch": 1.7412742382271467, + "grad_norm": 0.7431471943855286, + "learning_rate": 4.051308292302456e-06, + "loss": 0.6263, + "step": 6286 + }, + { + "epoch": 1.7415512465373961, + "grad_norm": 0.6761013269424438, + "learning_rate": 4.051022608018087e-06, + "loss": 0.6467, + "step": 6287 + }, + { + "epoch": 1.7418282548476456, + "grad_norm": 0.6962248086929321, + "learning_rate": 4.050736890801547e-06, + "loss": 0.6051, + "step": 6288 + }, + { + "epoch": 1.7421052631578946, + "grad_norm": 0.7127392292022705, + "learning_rate": 4.0504511406589025e-06, + "loss": 0.6463, + "step": 6289 + }, + { + "epoch": 1.742382271468144, + "grad_norm": 0.656398594379425, + "learning_rate": 4.050165357596222e-06, + "loss": 0.6134, + "step": 6290 + }, + { + "epoch": 1.7426592797783933, + "grad_norm": 0.6969537734985352, + "learning_rate": 4.04987954161957e-06, + "loss": 0.5537, + "step": 6291 + }, + { + "epoch": 1.7429362880886425, + "grad_norm": 0.628457248210907, + "learning_rate": 4.04959369273502e-06, + "loss": 0.6291, + "step": 6292 + }, + { + "epoch": 1.743213296398892, + "grad_norm": 0.7016260623931885, + "learning_rate": 4.049307810948636e-06, + "loss": 0.6569, + "step": 6293 + }, + { + "epoch": 1.7434903047091412, + "grad_norm": 0.7526865601539612, + "learning_rate": 4.049021896266493e-06, + "loss": 0.6649, + "step": 6294 + }, + { + "epoch": 1.7437673130193905, + "grad_norm": 0.7152197360992432, + "learning_rate": 4.048735948694658e-06, + "loss": 0.644, + "step": 6295 + }, + { + "epoch": 1.74404432132964, + "grad_norm": 0.6660102605819702, + "learning_rate": 4.048449968239204e-06, + "loss": 0.5963, + "step": 6296 + }, + { + "epoch": 1.7443213296398892, + "grad_norm": 0.6983559727668762, + "learning_rate": 4.048163954906203e-06, + "loss": 0.6267, + "step": 6297 + }, + { + "epoch": 1.7445983379501384, + "grad_norm": 0.677616536617279, + "learning_rate": 4.0478779087017276e-06, + "loss": 0.6113, + "step": 6298 + }, + { + "epoch": 1.7448753462603879, + "grad_norm": 0.6915004253387451, + "learning_rate": 4.047591829631852e-06, + "loss": 0.6493, + "step": 6299 + }, + { + "epoch": 1.745152354570637, + "grad_norm": 0.7380059957504272, + "learning_rate": 4.047305717702649e-06, + "loss": 0.6161, + "step": 6300 + }, + { + "epoch": 1.7454293628808863, + "grad_norm": 0.6928910613059998, + "learning_rate": 4.047019572920194e-06, + "loss": 0.6416, + "step": 6301 + }, + { + "epoch": 1.7457063711911358, + "grad_norm": 0.6989980340003967, + "learning_rate": 4.0467333952905635e-06, + "loss": 0.586, + "step": 6302 + }, + { + "epoch": 1.745983379501385, + "grad_norm": 0.6553187370300293, + "learning_rate": 4.046447184819833e-06, + "loss": 0.6156, + "step": 6303 + }, + { + "epoch": 1.7462603878116343, + "grad_norm": 0.6651337742805481, + "learning_rate": 4.046160941514079e-06, + "loss": 0.5633, + "step": 6304 + }, + { + "epoch": 1.7465373961218837, + "grad_norm": 0.6954764127731323, + "learning_rate": 4.0458746653793805e-06, + "loss": 0.6093, + "step": 6305 + }, + { + "epoch": 1.746814404432133, + "grad_norm": 0.7164486646652222, + "learning_rate": 4.045588356421814e-06, + "loss": 0.6432, + "step": 6306 + }, + { + "epoch": 1.7470914127423822, + "grad_norm": 0.6956983804702759, + "learning_rate": 4.045302014647461e-06, + "loss": 0.6357, + "step": 6307 + }, + { + "epoch": 1.7473684210526317, + "grad_norm": 0.6895577907562256, + "learning_rate": 4.0450156400623994e-06, + "loss": 0.6631, + "step": 6308 + }, + { + "epoch": 1.747645429362881, + "grad_norm": 0.7270070314407349, + "learning_rate": 4.044729232672711e-06, + "loss": 0.6687, + "step": 6309 + }, + { + "epoch": 1.7479224376731302, + "grad_norm": 0.6985939741134644, + "learning_rate": 4.044442792484475e-06, + "loss": 0.6499, + "step": 6310 + }, + { + "epoch": 1.7481994459833796, + "grad_norm": 0.7345278263092041, + "learning_rate": 4.044156319503774e-06, + "loss": 0.6322, + "step": 6311 + }, + { + "epoch": 1.7484764542936289, + "grad_norm": 0.7118226885795593, + "learning_rate": 4.0438698137366925e-06, + "loss": 0.6265, + "step": 6312 + }, + { + "epoch": 1.748753462603878, + "grad_norm": 0.685529887676239, + "learning_rate": 4.043583275189311e-06, + "loss": 0.6536, + "step": 6313 + }, + { + "epoch": 1.7490304709141276, + "grad_norm": 0.7099524736404419, + "learning_rate": 4.043296703867715e-06, + "loss": 0.6477, + "step": 6314 + }, + { + "epoch": 1.7493074792243766, + "grad_norm": 0.6660603880882263, + "learning_rate": 4.04301009977799e-06, + "loss": 0.593, + "step": 6315 + }, + { + "epoch": 1.749584487534626, + "grad_norm": 0.6841859221458435, + "learning_rate": 4.042723462926219e-06, + "loss": 0.6088, + "step": 6316 + }, + { + "epoch": 1.7498614958448755, + "grad_norm": 0.717239260673523, + "learning_rate": 4.042436793318489e-06, + "loss": 0.6679, + "step": 6317 + }, + { + "epoch": 1.7501385041551245, + "grad_norm": 0.6708807349205017, + "learning_rate": 4.042150090960888e-06, + "loss": 0.642, + "step": 6318 + }, + { + "epoch": 1.750415512465374, + "grad_norm": 0.7339285612106323, + "learning_rate": 4.041863355859502e-06, + "loss": 0.6353, + "step": 6319 + }, + { + "epoch": 1.7506925207756234, + "grad_norm": 0.6966876983642578, + "learning_rate": 4.041576588020419e-06, + "loss": 0.6094, + "step": 6320 + }, + { + "epoch": 1.7509695290858724, + "grad_norm": 0.6712494492530823, + "learning_rate": 4.041289787449728e-06, + "loss": 0.6468, + "step": 6321 + }, + { + "epoch": 1.751246537396122, + "grad_norm": 0.6606813669204712, + "learning_rate": 4.04100295415352e-06, + "loss": 0.5824, + "step": 6322 + }, + { + "epoch": 1.7515235457063711, + "grad_norm": 0.6949263215065002, + "learning_rate": 4.040716088137883e-06, + "loss": 0.6462, + "step": 6323 + }, + { + "epoch": 1.7518005540166204, + "grad_norm": 0.7004557251930237, + "learning_rate": 4.040429189408909e-06, + "loss": 0.6624, + "step": 6324 + }, + { + "epoch": 1.7520775623268698, + "grad_norm": 0.6793391704559326, + "learning_rate": 4.040142257972689e-06, + "loss": 0.6009, + "step": 6325 + }, + { + "epoch": 1.752354570637119, + "grad_norm": 0.6711333394050598, + "learning_rate": 4.039855293835316e-06, + "loss": 0.5837, + "step": 6326 + }, + { + "epoch": 1.7526315789473683, + "grad_norm": 0.7150370478630066, + "learning_rate": 4.039568297002884e-06, + "loss": 0.6158, + "step": 6327 + }, + { + "epoch": 1.7529085872576178, + "grad_norm": 0.7052631974220276, + "learning_rate": 4.039281267481484e-06, + "loss": 0.6346, + "step": 6328 + }, + { + "epoch": 1.753185595567867, + "grad_norm": 0.7503765225410461, + "learning_rate": 4.038994205277213e-06, + "loss": 0.6359, + "step": 6329 + }, + { + "epoch": 1.7534626038781163, + "grad_norm": 0.720745861530304, + "learning_rate": 4.038707110396164e-06, + "loss": 0.6278, + "step": 6330 + }, + { + "epoch": 1.7537396121883657, + "grad_norm": 0.6894276738166809, + "learning_rate": 4.038419982844433e-06, + "loss": 0.612, + "step": 6331 + }, + { + "epoch": 1.754016620498615, + "grad_norm": 0.6956282258033752, + "learning_rate": 4.038132822628118e-06, + "loss": 0.6444, + "step": 6332 + }, + { + "epoch": 1.7542936288088642, + "grad_norm": 0.6910063624382019, + "learning_rate": 4.0378456297533155e-06, + "loss": 0.6443, + "step": 6333 + }, + { + "epoch": 1.7545706371191137, + "grad_norm": 0.6689027547836304, + "learning_rate": 4.037558404226123e-06, + "loss": 0.6045, + "step": 6334 + }, + { + "epoch": 1.754847645429363, + "grad_norm": 0.6691604256629944, + "learning_rate": 4.037271146052639e-06, + "loss": 0.6199, + "step": 6335 + }, + { + "epoch": 1.7551246537396121, + "grad_norm": 0.7217824459075928, + "learning_rate": 4.036983855238962e-06, + "loss": 0.6079, + "step": 6336 + }, + { + "epoch": 1.7554016620498616, + "grad_norm": 0.6883172988891602, + "learning_rate": 4.036696531791193e-06, + "loss": 0.6142, + "step": 6337 + }, + { + "epoch": 1.7556786703601108, + "grad_norm": 0.6836881041526794, + "learning_rate": 4.036409175715434e-06, + "loss": 0.5989, + "step": 6338 + }, + { + "epoch": 1.75595567867036, + "grad_norm": 0.6560087203979492, + "learning_rate": 4.036121787017782e-06, + "loss": 0.6466, + "step": 6339 + }, + { + "epoch": 1.7562326869806095, + "grad_norm": 0.697339653968811, + "learning_rate": 4.035834365704343e-06, + "loss": 0.5821, + "step": 6340 + }, + { + "epoch": 1.7565096952908588, + "grad_norm": 0.6981000900268555, + "learning_rate": 4.035546911781219e-06, + "loss": 0.5742, + "step": 6341 + }, + { + "epoch": 1.756786703601108, + "grad_norm": 0.7049815654754639, + "learning_rate": 4.035259425254511e-06, + "loss": 0.6468, + "step": 6342 + }, + { + "epoch": 1.7570637119113575, + "grad_norm": 0.6798682808876038, + "learning_rate": 4.034971906130326e-06, + "loss": 0.6371, + "step": 6343 + }, + { + "epoch": 1.7573407202216067, + "grad_norm": 0.6624364852905273, + "learning_rate": 4.034684354414768e-06, + "loss": 0.6099, + "step": 6344 + }, + { + "epoch": 1.757617728531856, + "grad_norm": 0.7097635865211487, + "learning_rate": 4.03439677011394e-06, + "loss": 0.5877, + "step": 6345 + }, + { + "epoch": 1.7578947368421054, + "grad_norm": 0.7028236985206604, + "learning_rate": 4.034109153233952e-06, + "loss": 0.67, + "step": 6346 + }, + { + "epoch": 1.7581717451523544, + "grad_norm": 0.6697776913642883, + "learning_rate": 4.0338215037809085e-06, + "loss": 0.5897, + "step": 6347 + }, + { + "epoch": 1.7584487534626039, + "grad_norm": 0.7153249382972717, + "learning_rate": 4.033533821760917e-06, + "loss": 0.6275, + "step": 6348 + }, + { + "epoch": 1.7587257617728533, + "grad_norm": 0.7187476754188538, + "learning_rate": 4.033246107180086e-06, + "loss": 0.5889, + "step": 6349 + }, + { + "epoch": 1.7590027700831024, + "grad_norm": 0.7286702394485474, + "learning_rate": 4.032958360044526e-06, + "loss": 0.6521, + "step": 6350 + }, + { + "epoch": 1.7592797783933518, + "grad_norm": 0.7181779146194458, + "learning_rate": 4.032670580360345e-06, + "loss": 0.6534, + "step": 6351 + }, + { + "epoch": 1.7595567867036013, + "grad_norm": 0.7091637849807739, + "learning_rate": 4.032382768133652e-06, + "loss": 0.637, + "step": 6352 + }, + { + "epoch": 1.7598337950138503, + "grad_norm": 0.7320290803909302, + "learning_rate": 4.032094923370561e-06, + "loss": 0.6393, + "step": 6353 + }, + { + "epoch": 1.7601108033240997, + "grad_norm": 0.6917217373847961, + "learning_rate": 4.031807046077182e-06, + "loss": 0.6552, + "step": 6354 + }, + { + "epoch": 1.760387811634349, + "grad_norm": 0.6732197999954224, + "learning_rate": 4.031519136259628e-06, + "loss": 0.6087, + "step": 6355 + }, + { + "epoch": 1.7606648199445982, + "grad_norm": 0.7149708867073059, + "learning_rate": 4.031231193924011e-06, + "loss": 0.6239, + "step": 6356 + }, + { + "epoch": 1.7609418282548477, + "grad_norm": 0.6990062594413757, + "learning_rate": 4.0309432190764456e-06, + "loss": 0.6011, + "step": 6357 + }, + { + "epoch": 1.761218836565097, + "grad_norm": 0.7346669435501099, + "learning_rate": 4.030655211723046e-06, + "loss": 0.6554, + "step": 6358 + }, + { + "epoch": 1.7614958448753462, + "grad_norm": 0.7255174517631531, + "learning_rate": 4.030367171869927e-06, + "loss": 0.6156, + "step": 6359 + }, + { + "epoch": 1.7617728531855956, + "grad_norm": 0.7192071080207825, + "learning_rate": 4.030079099523205e-06, + "loss": 0.6886, + "step": 6360 + }, + { + "epoch": 1.7620498614958449, + "grad_norm": 0.7222265005111694, + "learning_rate": 4.029790994688997e-06, + "loss": 0.6347, + "step": 6361 + }, + { + "epoch": 1.762326869806094, + "grad_norm": 0.730577290058136, + "learning_rate": 4.029502857373419e-06, + "loss": 0.6324, + "step": 6362 + }, + { + "epoch": 1.7626038781163436, + "grad_norm": 0.6651086211204529, + "learning_rate": 4.02921468758259e-06, + "loss": 0.5205, + "step": 6363 + }, + { + "epoch": 1.7628808864265928, + "grad_norm": 0.7027130126953125, + "learning_rate": 4.028926485322627e-06, + "loss": 0.5416, + "step": 6364 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.7234609723091125, + "learning_rate": 4.0286382505996505e-06, + "loss": 0.6434, + "step": 6365 + }, + { + "epoch": 1.7634349030470915, + "grad_norm": 0.6769989132881165, + "learning_rate": 4.02834998341978e-06, + "loss": 0.6248, + "step": 6366 + }, + { + "epoch": 1.7637119113573407, + "grad_norm": 0.7025700807571411, + "learning_rate": 4.028061683789137e-06, + "loss": 0.6469, + "step": 6367 + }, + { + "epoch": 1.76398891966759, + "grad_norm": 0.6738390326499939, + "learning_rate": 4.027773351713843e-06, + "loss": 0.6481, + "step": 6368 + }, + { + "epoch": 1.7642659279778394, + "grad_norm": 0.7171605229377747, + "learning_rate": 4.027484987200017e-06, + "loss": 0.6232, + "step": 6369 + }, + { + "epoch": 1.7645429362880887, + "grad_norm": 0.707499623298645, + "learning_rate": 4.027196590253786e-06, + "loss": 0.637, + "step": 6370 + }, + { + "epoch": 1.764819944598338, + "grad_norm": 0.6891076564788818, + "learning_rate": 4.026908160881271e-06, + "loss": 0.6388, + "step": 6371 + }, + { + "epoch": 1.7650969529085874, + "grad_norm": 0.7460578083992004, + "learning_rate": 4.0266196990885955e-06, + "loss": 0.6599, + "step": 6372 + }, + { + "epoch": 1.7653739612188366, + "grad_norm": 0.7125904560089111, + "learning_rate": 4.026331204881886e-06, + "loss": 0.6781, + "step": 6373 + }, + { + "epoch": 1.7656509695290858, + "grad_norm": 0.7035360932350159, + "learning_rate": 4.026042678267267e-06, + "loss": 0.6337, + "step": 6374 + }, + { + "epoch": 1.7659279778393353, + "grad_norm": 0.6471675038337708, + "learning_rate": 4.025754119250865e-06, + "loss": 0.6037, + "step": 6375 + }, + { + "epoch": 1.7662049861495843, + "grad_norm": 0.6844245195388794, + "learning_rate": 4.025465527838806e-06, + "loss": 0.6409, + "step": 6376 + }, + { + "epoch": 1.7664819944598338, + "grad_norm": 0.626818060874939, + "learning_rate": 4.025176904037219e-06, + "loss": 0.5556, + "step": 6377 + }, + { + "epoch": 1.7667590027700832, + "grad_norm": 0.7065606713294983, + "learning_rate": 4.024888247852231e-06, + "loss": 0.6036, + "step": 6378 + }, + { + "epoch": 1.7670360110803323, + "grad_norm": 0.7180307507514954, + "learning_rate": 4.024599559289972e-06, + "loss": 0.6045, + "step": 6379 + }, + { + "epoch": 1.7673130193905817, + "grad_norm": 0.7368504405021667, + "learning_rate": 4.02431083835657e-06, + "loss": 0.7031, + "step": 6380 + }, + { + "epoch": 1.7675900277008312, + "grad_norm": 0.7280147075653076, + "learning_rate": 4.024022085058157e-06, + "loss": 0.6692, + "step": 6381 + }, + { + "epoch": 1.7678670360110802, + "grad_norm": 0.6778074502944946, + "learning_rate": 4.0237332994008625e-06, + "loss": 0.5773, + "step": 6382 + }, + { + "epoch": 1.7681440443213297, + "grad_norm": 0.6655060052871704, + "learning_rate": 4.0234444813908195e-06, + "loss": 0.5874, + "step": 6383 + }, + { + "epoch": 1.768421052631579, + "grad_norm": 0.6540441513061523, + "learning_rate": 4.0231556310341595e-06, + "loss": 0.5792, + "step": 6384 + }, + { + "epoch": 1.7686980609418281, + "grad_norm": 0.6953973174095154, + "learning_rate": 4.022866748337016e-06, + "loss": 0.6148, + "step": 6385 + }, + { + "epoch": 1.7689750692520776, + "grad_norm": 0.707272469997406, + "learning_rate": 4.022577833305522e-06, + "loss": 0.589, + "step": 6386 + }, + { + "epoch": 1.7692520775623268, + "grad_norm": 0.6879422664642334, + "learning_rate": 4.022288885945813e-06, + "loss": 0.623, + "step": 6387 + }, + { + "epoch": 1.769529085872576, + "grad_norm": 0.6887927651405334, + "learning_rate": 4.021999906264023e-06, + "loss": 0.6404, + "step": 6388 + }, + { + "epoch": 1.7698060941828255, + "grad_norm": 0.7096054553985596, + "learning_rate": 4.021710894266288e-06, + "loss": 0.6306, + "step": 6389 + }, + { + "epoch": 1.7700831024930748, + "grad_norm": 0.6833407878875732, + "learning_rate": 4.021421849958745e-06, + "loss": 0.6155, + "step": 6390 + }, + { + "epoch": 1.770360110803324, + "grad_norm": 0.6665039658546448, + "learning_rate": 4.0211327733475305e-06, + "loss": 0.6208, + "step": 6391 + }, + { + "epoch": 1.7706371191135735, + "grad_norm": 0.661353588104248, + "learning_rate": 4.020843664438783e-06, + "loss": 0.5745, + "step": 6392 + }, + { + "epoch": 1.7709141274238227, + "grad_norm": 0.6623826622962952, + "learning_rate": 4.020554523238641e-06, + "loss": 0.5878, + "step": 6393 + }, + { + "epoch": 1.771191135734072, + "grad_norm": 0.6898151636123657, + "learning_rate": 4.0202653497532425e-06, + "loss": 0.6625, + "step": 6394 + }, + { + "epoch": 1.7714681440443214, + "grad_norm": 0.7333918809890747, + "learning_rate": 4.019976143988729e-06, + "loss": 0.5955, + "step": 6395 + }, + { + "epoch": 1.7717451523545706, + "grad_norm": 0.6648381948471069, + "learning_rate": 4.01968690595124e-06, + "loss": 0.6256, + "step": 6396 + }, + { + "epoch": 1.7720221606648199, + "grad_norm": 0.6879984140396118, + "learning_rate": 4.019397635646918e-06, + "loss": 0.6013, + "step": 6397 + }, + { + "epoch": 1.7722991689750693, + "grad_norm": 0.7033804655075073, + "learning_rate": 4.019108333081903e-06, + "loss": 0.6112, + "step": 6398 + }, + { + "epoch": 1.7725761772853186, + "grad_norm": 0.6876200437545776, + "learning_rate": 4.018818998262339e-06, + "loss": 0.6513, + "step": 6399 + }, + { + "epoch": 1.7728531855955678, + "grad_norm": 0.7130957841873169, + "learning_rate": 4.01852963119437e-06, + "loss": 0.6725, + "step": 6400 + }, + { + "epoch": 1.7731301939058173, + "grad_norm": 0.7180396318435669, + "learning_rate": 4.018240231884137e-06, + "loss": 0.6402, + "step": 6401 + }, + { + "epoch": 1.7734072022160665, + "grad_norm": 0.6846297383308411, + "learning_rate": 4.017950800337789e-06, + "loss": 0.5692, + "step": 6402 + }, + { + "epoch": 1.7736842105263158, + "grad_norm": 0.666314423084259, + "learning_rate": 4.017661336561467e-06, + "loss": 0.6093, + "step": 6403 + }, + { + "epoch": 1.7739612188365652, + "grad_norm": 0.703360915184021, + "learning_rate": 4.0173718405613205e-06, + "loss": 0.6181, + "step": 6404 + }, + { + "epoch": 1.7742382271468145, + "grad_norm": 0.7042622566223145, + "learning_rate": 4.017082312343494e-06, + "loss": 0.6297, + "step": 6405 + }, + { + "epoch": 1.7745152354570637, + "grad_norm": 0.6913483738899231, + "learning_rate": 4.016792751914137e-06, + "loss": 0.628, + "step": 6406 + }, + { + "epoch": 1.7747922437673131, + "grad_norm": 0.7046165466308594, + "learning_rate": 4.016503159279395e-06, + "loss": 0.6544, + "step": 6407 + }, + { + "epoch": 1.7750692520775622, + "grad_norm": 0.7160999178886414, + "learning_rate": 4.016213534445419e-06, + "loss": 0.5549, + "step": 6408 + }, + { + "epoch": 1.7753462603878116, + "grad_norm": 0.706745445728302, + "learning_rate": 4.015923877418357e-06, + "loss": 0.6263, + "step": 6409 + }, + { + "epoch": 1.775623268698061, + "grad_norm": 0.6807883977890015, + "learning_rate": 4.015634188204361e-06, + "loss": 0.5961, + "step": 6410 + }, + { + "epoch": 1.77590027700831, + "grad_norm": 0.737921953201294, + "learning_rate": 4.01534446680958e-06, + "loss": 0.6938, + "step": 6411 + }, + { + "epoch": 1.7761772853185596, + "grad_norm": 0.6510135531425476, + "learning_rate": 4.015054713240166e-06, + "loss": 0.5828, + "step": 6412 + }, + { + "epoch": 1.776454293628809, + "grad_norm": 0.6969490051269531, + "learning_rate": 4.014764927502272e-06, + "loss": 0.6384, + "step": 6413 + }, + { + "epoch": 1.776731301939058, + "grad_norm": 0.7064840197563171, + "learning_rate": 4.01447510960205e-06, + "loss": 0.6255, + "step": 6414 + }, + { + "epoch": 1.7770083102493075, + "grad_norm": 0.7026253938674927, + "learning_rate": 4.014185259545654e-06, + "loss": 0.639, + "step": 6415 + }, + { + "epoch": 1.7772853185595567, + "grad_norm": 0.683268129825592, + "learning_rate": 4.013895377339238e-06, + "loss": 0.588, + "step": 6416 + }, + { + "epoch": 1.777562326869806, + "grad_norm": 0.6896020174026489, + "learning_rate": 4.013605462988957e-06, + "loss": 0.6428, + "step": 6417 + }, + { + "epoch": 1.7778393351800554, + "grad_norm": 0.7973686456680298, + "learning_rate": 4.013315516500966e-06, + "loss": 0.6393, + "step": 6418 + }, + { + "epoch": 1.7781163434903047, + "grad_norm": 0.6856012344360352, + "learning_rate": 4.0130255378814235e-06, + "loss": 0.5807, + "step": 6419 + }, + { + "epoch": 1.778393351800554, + "grad_norm": 0.7140977382659912, + "learning_rate": 4.0127355271364845e-06, + "loss": 0.6148, + "step": 6420 + }, + { + "epoch": 1.7786703601108034, + "grad_norm": 0.6942720413208008, + "learning_rate": 4.012445484272307e-06, + "loss": 0.642, + "step": 6421 + }, + { + "epoch": 1.7789473684210526, + "grad_norm": 0.7332590818405151, + "learning_rate": 4.0121554092950485e-06, + "loss": 0.604, + "step": 6422 + }, + { + "epoch": 1.7792243767313018, + "grad_norm": 0.7232838273048401, + "learning_rate": 4.01186530221087e-06, + "loss": 0.6172, + "step": 6423 + }, + { + "epoch": 1.7795013850415513, + "grad_norm": 0.7233272194862366, + "learning_rate": 4.011575163025931e-06, + "loss": 0.6529, + "step": 6424 + }, + { + "epoch": 1.7797783933518005, + "grad_norm": 0.6849462389945984, + "learning_rate": 4.0112849917463905e-06, + "loss": 0.6532, + "step": 6425 + }, + { + "epoch": 1.7800554016620498, + "grad_norm": 0.7064739465713501, + "learning_rate": 4.01099478837841e-06, + "loss": 0.6258, + "step": 6426 + }, + { + "epoch": 1.7803324099722992, + "grad_norm": 0.691041886806488, + "learning_rate": 4.010704552928151e-06, + "loss": 0.5841, + "step": 6427 + }, + { + "epoch": 1.7806094182825485, + "grad_norm": 0.6790480017662048, + "learning_rate": 4.010414285401777e-06, + "loss": 0.6473, + "step": 6428 + }, + { + "epoch": 1.7808864265927977, + "grad_norm": 0.6644400954246521, + "learning_rate": 4.0101239858054504e-06, + "loss": 0.6176, + "step": 6429 + }, + { + "epoch": 1.7811634349030472, + "grad_norm": 0.6953248381614685, + "learning_rate": 4.009833654145335e-06, + "loss": 0.6312, + "step": 6430 + }, + { + "epoch": 1.7814404432132964, + "grad_norm": 0.6625707745552063, + "learning_rate": 4.009543290427596e-06, + "loss": 0.6244, + "step": 6431 + }, + { + "epoch": 1.7817174515235457, + "grad_norm": 0.6987664699554443, + "learning_rate": 4.009252894658398e-06, + "loss": 0.6033, + "step": 6432 + }, + { + "epoch": 1.7819944598337951, + "grad_norm": 0.6788783669471741, + "learning_rate": 4.008962466843907e-06, + "loss": 0.6241, + "step": 6433 + }, + { + "epoch": 1.7822714681440444, + "grad_norm": 0.7290467023849487, + "learning_rate": 4.008672006990289e-06, + "loss": 0.637, + "step": 6434 + }, + { + "epoch": 1.7825484764542936, + "grad_norm": 0.6927595734596252, + "learning_rate": 4.008381515103711e-06, + "loss": 0.608, + "step": 6435 + }, + { + "epoch": 1.782825484764543, + "grad_norm": 0.7004909515380859, + "learning_rate": 4.008090991190341e-06, + "loss": 0.6295, + "step": 6436 + }, + { + "epoch": 1.7831024930747923, + "grad_norm": 0.6821027994155884, + "learning_rate": 4.007800435256349e-06, + "loss": 0.6072, + "step": 6437 + }, + { + "epoch": 1.7833795013850415, + "grad_norm": 0.7391160130500793, + "learning_rate": 4.007509847307903e-06, + "loss": 0.6398, + "step": 6438 + }, + { + "epoch": 1.783656509695291, + "grad_norm": 0.6901292204856873, + "learning_rate": 4.007219227351173e-06, + "loss": 0.6062, + "step": 6439 + }, + { + "epoch": 1.78393351800554, + "grad_norm": 0.7144735455513, + "learning_rate": 4.00692857539233e-06, + "loss": 0.6049, + "step": 6440 + }, + { + "epoch": 1.7842105263157895, + "grad_norm": 0.7031808495521545, + "learning_rate": 4.006637891437546e-06, + "loss": 0.6131, + "step": 6441 + }, + { + "epoch": 1.784487534626039, + "grad_norm": 0.6666425466537476, + "learning_rate": 4.006347175492991e-06, + "loss": 0.6234, + "step": 6442 + }, + { + "epoch": 1.784764542936288, + "grad_norm": 0.7020919919013977, + "learning_rate": 4.00605642756484e-06, + "loss": 0.5821, + "step": 6443 + }, + { + "epoch": 1.7850415512465374, + "grad_norm": 0.7614548206329346, + "learning_rate": 4.005765647659263e-06, + "loss": 0.6563, + "step": 6444 + }, + { + "epoch": 1.7853185595567869, + "grad_norm": 0.6820983290672302, + "learning_rate": 4.005474835782437e-06, + "loss": 0.6032, + "step": 6445 + }, + { + "epoch": 1.7855955678670359, + "grad_norm": 0.6977035403251648, + "learning_rate": 4.0051839919405355e-06, + "loss": 0.6408, + "step": 6446 + }, + { + "epoch": 1.7858725761772853, + "grad_norm": 0.6919875741004944, + "learning_rate": 4.004893116139735e-06, + "loss": 0.6158, + "step": 6447 + }, + { + "epoch": 1.7861495844875346, + "grad_norm": 0.6647782921791077, + "learning_rate": 4.004602208386209e-06, + "loss": 0.6182, + "step": 6448 + }, + { + "epoch": 1.7864265927977838, + "grad_norm": 0.6721411347389221, + "learning_rate": 4.004311268686136e-06, + "loss": 0.65, + "step": 6449 + }, + { + "epoch": 1.7867036011080333, + "grad_norm": 0.7224037051200867, + "learning_rate": 4.0040202970456936e-06, + "loss": 0.6441, + "step": 6450 + }, + { + "epoch": 1.7869806094182825, + "grad_norm": 0.6760212182998657, + "learning_rate": 4.003729293471059e-06, + "loss": 0.597, + "step": 6451 + }, + { + "epoch": 1.7872576177285318, + "grad_norm": 0.6822620034217834, + "learning_rate": 4.003438257968413e-06, + "loss": 0.6238, + "step": 6452 + }, + { + "epoch": 1.7875346260387812, + "grad_norm": 0.7053095102310181, + "learning_rate": 4.003147190543931e-06, + "loss": 0.6508, + "step": 6453 + }, + { + "epoch": 1.7878116343490305, + "grad_norm": 0.6813586950302124, + "learning_rate": 4.002856091203797e-06, + "loss": 0.6425, + "step": 6454 + }, + { + "epoch": 1.7880886426592797, + "grad_norm": 0.693324863910675, + "learning_rate": 4.00256495995419e-06, + "loss": 0.6425, + "step": 6455 + }, + { + "epoch": 1.7883656509695292, + "grad_norm": 0.6755707263946533, + "learning_rate": 4.002273796801292e-06, + "loss": 0.6227, + "step": 6456 + }, + { + "epoch": 1.7886426592797784, + "grad_norm": 0.6835914850234985, + "learning_rate": 4.001982601751285e-06, + "loss": 0.5766, + "step": 6457 + }, + { + "epoch": 1.7889196675900276, + "grad_norm": 0.6911206841468811, + "learning_rate": 4.001691374810352e-06, + "loss": 0.615, + "step": 6458 + }, + { + "epoch": 1.789196675900277, + "grad_norm": 0.7716637253761292, + "learning_rate": 4.001400115984675e-06, + "loss": 0.6435, + "step": 6459 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.671084463596344, + "learning_rate": 4.001108825280441e-06, + "loss": 0.6313, + "step": 6460 + }, + { + "epoch": 1.7897506925207756, + "grad_norm": 0.6706188321113586, + "learning_rate": 4.000817502703832e-06, + "loss": 0.6186, + "step": 6461 + }, + { + "epoch": 1.790027700831025, + "grad_norm": 0.7258620858192444, + "learning_rate": 4.000526148261035e-06, + "loss": 0.6582, + "step": 6462 + }, + { + "epoch": 1.7903047091412743, + "grad_norm": 0.6809777617454529, + "learning_rate": 4.000234761958235e-06, + "loss": 0.6067, + "step": 6463 + }, + { + "epoch": 1.7905817174515235, + "grad_norm": 0.7288227677345276, + "learning_rate": 3.999943343801621e-06, + "loss": 0.6414, + "step": 6464 + }, + { + "epoch": 1.790858725761773, + "grad_norm": 0.7226935029029846, + "learning_rate": 3.99965189379738e-06, + "loss": 0.6005, + "step": 6465 + }, + { + "epoch": 1.7911357340720222, + "grad_norm": 0.716136634349823, + "learning_rate": 3.999360411951698e-06, + "loss": 0.6223, + "step": 6466 + }, + { + "epoch": 1.7914127423822714, + "grad_norm": 0.6857314109802246, + "learning_rate": 3.999068898270767e-06, + "loss": 0.6374, + "step": 6467 + }, + { + "epoch": 1.791689750692521, + "grad_norm": 0.7114225625991821, + "learning_rate": 3.998777352760774e-06, + "loss": 0.6016, + "step": 6468 + }, + { + "epoch": 1.7919667590027701, + "grad_norm": 0.7072104811668396, + "learning_rate": 3.9984857754279105e-06, + "loss": 0.5931, + "step": 6469 + }, + { + "epoch": 1.7922437673130194, + "grad_norm": 0.7062807679176331, + "learning_rate": 3.9981941662783675e-06, + "loss": 0.6769, + "step": 6470 + }, + { + "epoch": 1.7925207756232688, + "grad_norm": 0.6915931701660156, + "learning_rate": 3.997902525318337e-06, + "loss": 0.5963, + "step": 6471 + }, + { + "epoch": 1.7927977839335179, + "grad_norm": 0.6964583992958069, + "learning_rate": 3.997610852554009e-06, + "loss": 0.6054, + "step": 6472 + }, + { + "epoch": 1.7930747922437673, + "grad_norm": 0.7309070229530334, + "learning_rate": 3.99731914799158e-06, + "loss": 0.5954, + "step": 6473 + }, + { + "epoch": 1.7933518005540168, + "grad_norm": 0.6806193590164185, + "learning_rate": 3.997027411637241e-06, + "loss": 0.6184, + "step": 6474 + }, + { + "epoch": 1.7936288088642658, + "grad_norm": 0.7490919828414917, + "learning_rate": 3.996735643497187e-06, + "loss": 0.6083, + "step": 6475 + }, + { + "epoch": 1.7939058171745152, + "grad_norm": 0.6906395554542542, + "learning_rate": 3.996443843577612e-06, + "loss": 0.5904, + "step": 6476 + }, + { + "epoch": 1.7941828254847645, + "grad_norm": 0.7120215892791748, + "learning_rate": 3.9961520118847145e-06, + "loss": 0.61, + "step": 6477 + }, + { + "epoch": 1.7944598337950137, + "grad_norm": 0.7228304147720337, + "learning_rate": 3.995860148424689e-06, + "loss": 0.6304, + "step": 6478 + }, + { + "epoch": 1.7947368421052632, + "grad_norm": 0.7273550629615784, + "learning_rate": 3.995568253203731e-06, + "loss": 0.651, + "step": 6479 + }, + { + "epoch": 1.7950138504155124, + "grad_norm": 0.6840648055076599, + "learning_rate": 3.99527632622804e-06, + "loss": 0.5863, + "step": 6480 + }, + { + "epoch": 1.7952908587257617, + "grad_norm": 0.7078438997268677, + "learning_rate": 3.994984367503815e-06, + "loss": 0.66, + "step": 6481 + }, + { + "epoch": 1.7955678670360111, + "grad_norm": 0.7198485732078552, + "learning_rate": 3.994692377037254e-06, + "loss": 0.5683, + "step": 6482 + }, + { + "epoch": 1.7958448753462604, + "grad_norm": 0.7349383234977722, + "learning_rate": 3.994400354834556e-06, + "loss": 0.5938, + "step": 6483 + }, + { + "epoch": 1.7961218836565096, + "grad_norm": 0.6952342391014099, + "learning_rate": 3.994108300901923e-06, + "loss": 0.6079, + "step": 6484 + }, + { + "epoch": 1.796398891966759, + "grad_norm": 0.7039951682090759, + "learning_rate": 3.993816215245555e-06, + "loss": 0.6323, + "step": 6485 + }, + { + "epoch": 1.7966759002770083, + "grad_norm": 0.7002725005149841, + "learning_rate": 3.993524097871654e-06, + "loss": 0.6215, + "step": 6486 + }, + { + "epoch": 1.7969529085872575, + "grad_norm": 0.6925804615020752, + "learning_rate": 3.993231948786421e-06, + "loss": 0.6016, + "step": 6487 + }, + { + "epoch": 1.797229916897507, + "grad_norm": 0.6799862384796143, + "learning_rate": 3.992939767996063e-06, + "loss": 0.6471, + "step": 6488 + }, + { + "epoch": 1.7975069252077562, + "grad_norm": 0.6949800252914429, + "learning_rate": 3.992647555506779e-06, + "loss": 0.6095, + "step": 6489 + }, + { + "epoch": 1.7977839335180055, + "grad_norm": 0.707348108291626, + "learning_rate": 3.9923553113247764e-06, + "loss": 0.5838, + "step": 6490 + }, + { + "epoch": 1.798060941828255, + "grad_norm": 0.7468195557594299, + "learning_rate": 3.9920630354562595e-06, + "loss": 0.6265, + "step": 6491 + }, + { + "epoch": 1.7983379501385042, + "grad_norm": 0.6571547389030457, + "learning_rate": 3.991770727907434e-06, + "loss": 0.6172, + "step": 6492 + }, + { + "epoch": 1.7986149584487534, + "grad_norm": 0.6744096279144287, + "learning_rate": 3.9914783886845066e-06, + "loss": 0.5914, + "step": 6493 + }, + { + "epoch": 1.7988919667590029, + "grad_norm": 0.7042905688285828, + "learning_rate": 3.991186017793683e-06, + "loss": 0.6326, + "step": 6494 + }, + { + "epoch": 1.799168975069252, + "grad_norm": 0.6672032475471497, + "learning_rate": 3.990893615241174e-06, + "loss": 0.5663, + "step": 6495 + }, + { + "epoch": 1.7994459833795013, + "grad_norm": 0.6901209950447083, + "learning_rate": 3.990601181033185e-06, + "loss": 0.636, + "step": 6496 + }, + { + "epoch": 1.7997229916897508, + "grad_norm": 0.6690481901168823, + "learning_rate": 3.990308715175927e-06, + "loss": 0.6141, + "step": 6497 + }, + { + "epoch": 1.8, + "grad_norm": 0.7317468523979187, + "learning_rate": 3.9900162176756085e-06, + "loss": 0.6357, + "step": 6498 + }, + { + "epoch": 1.8002770083102493, + "grad_norm": 0.6796091198921204, + "learning_rate": 3.9897236885384415e-06, + "loss": 0.5713, + "step": 6499 + }, + { + "epoch": 1.8005540166204987, + "grad_norm": 0.7144341468811035, + "learning_rate": 3.989431127770635e-06, + "loss": 0.6112, + "step": 6500 + }, + { + "epoch": 1.8008310249307478, + "grad_norm": 0.6569699048995972, + "learning_rate": 3.989138535378404e-06, + "loss": 0.5959, + "step": 6501 + }, + { + "epoch": 1.8011080332409972, + "grad_norm": 0.7061322927474976, + "learning_rate": 3.988845911367957e-06, + "loss": 0.6334, + "step": 6502 + }, + { + "epoch": 1.8013850415512467, + "grad_norm": 0.6743340492248535, + "learning_rate": 3.988553255745511e-06, + "loss": 0.5976, + "step": 6503 + }, + { + "epoch": 1.8016620498614957, + "grad_norm": 0.6997945308685303, + "learning_rate": 3.988260568517277e-06, + "loss": 0.639, + "step": 6504 + }, + { + "epoch": 1.8019390581717452, + "grad_norm": 0.6599920988082886, + "learning_rate": 3.98796784968947e-06, + "loss": 0.5883, + "step": 6505 + }, + { + "epoch": 1.8022160664819946, + "grad_norm": 0.6996617317199707, + "learning_rate": 3.9876750992683065e-06, + "loss": 0.6091, + "step": 6506 + }, + { + "epoch": 1.8024930747922436, + "grad_norm": 0.709029495716095, + "learning_rate": 3.987382317260001e-06, + "loss": 0.5815, + "step": 6507 + }, + { + "epoch": 1.802770083102493, + "grad_norm": 0.7156533598899841, + "learning_rate": 3.987089503670771e-06, + "loss": 0.6297, + "step": 6508 + }, + { + "epoch": 1.8030470914127423, + "grad_norm": 0.7280424237251282, + "learning_rate": 3.986796658506833e-06, + "loss": 0.6483, + "step": 6509 + }, + { + "epoch": 1.8033240997229916, + "grad_norm": 0.7090050578117371, + "learning_rate": 3.986503781774406e-06, + "loss": 0.5907, + "step": 6510 + }, + { + "epoch": 1.803601108033241, + "grad_norm": 0.6697266101837158, + "learning_rate": 3.986210873479706e-06, + "loss": 0.6084, + "step": 6511 + }, + { + "epoch": 1.8038781163434903, + "grad_norm": 0.7092403173446655, + "learning_rate": 3.985917933628955e-06, + "loss": 0.6487, + "step": 6512 + }, + { + "epoch": 1.8041551246537395, + "grad_norm": 0.7133142948150635, + "learning_rate": 3.985624962228371e-06, + "loss": 0.6381, + "step": 6513 + }, + { + "epoch": 1.804432132963989, + "grad_norm": 0.6582397818565369, + "learning_rate": 3.9853319592841755e-06, + "loss": 0.596, + "step": 6514 + }, + { + "epoch": 1.8047091412742382, + "grad_norm": 0.7293874621391296, + "learning_rate": 3.985038924802589e-06, + "loss": 0.6, + "step": 6515 + }, + { + "epoch": 1.8049861495844874, + "grad_norm": 0.6582150459289551, + "learning_rate": 3.984745858789835e-06, + "loss": 0.6201, + "step": 6516 + }, + { + "epoch": 1.805263157894737, + "grad_norm": 0.7221015095710754, + "learning_rate": 3.9844527612521335e-06, + "loss": 0.5975, + "step": 6517 + }, + { + "epoch": 1.8055401662049861, + "grad_norm": 0.706571102142334, + "learning_rate": 3.984159632195709e-06, + "loss": 0.6357, + "step": 6518 + }, + { + "epoch": 1.8058171745152354, + "grad_norm": 0.6357813477516174, + "learning_rate": 3.983866471626786e-06, + "loss": 0.5697, + "step": 6519 + }, + { + "epoch": 1.8060941828254848, + "grad_norm": 0.7110602855682373, + "learning_rate": 3.983573279551588e-06, + "loss": 0.6401, + "step": 6520 + }, + { + "epoch": 1.806371191135734, + "grad_norm": 0.6931176781654358, + "learning_rate": 3.983280055976341e-06, + "loss": 0.6113, + "step": 6521 + }, + { + "epoch": 1.8066481994459833, + "grad_norm": 0.6891133785247803, + "learning_rate": 3.98298680090727e-06, + "loss": 0.6379, + "step": 6522 + }, + { + "epoch": 1.8069252077562328, + "grad_norm": 0.659149706363678, + "learning_rate": 3.982693514350602e-06, + "loss": 0.5731, + "step": 6523 + }, + { + "epoch": 1.807202216066482, + "grad_norm": 0.6716209650039673, + "learning_rate": 3.982400196312565e-06, + "loss": 0.6329, + "step": 6524 + }, + { + "epoch": 1.8074792243767313, + "grad_norm": 0.6645585894584656, + "learning_rate": 3.982106846799386e-06, + "loss": 0.6286, + "step": 6525 + }, + { + "epoch": 1.8077562326869807, + "grad_norm": 0.7084228992462158, + "learning_rate": 3.981813465817293e-06, + "loss": 0.6175, + "step": 6526 + }, + { + "epoch": 1.80803324099723, + "grad_norm": 0.6945014595985413, + "learning_rate": 3.981520053372517e-06, + "loss": 0.5685, + "step": 6527 + }, + { + "epoch": 1.8083102493074792, + "grad_norm": 0.6847140789031982, + "learning_rate": 3.981226609471286e-06, + "loss": 0.5825, + "step": 6528 + }, + { + "epoch": 1.8085872576177286, + "grad_norm": 0.7313349843025208, + "learning_rate": 3.980933134119832e-06, + "loss": 0.625, + "step": 6529 + }, + { + "epoch": 1.8088642659279779, + "grad_norm": 0.7253093719482422, + "learning_rate": 3.980639627324386e-06, + "loss": 0.6793, + "step": 6530 + }, + { + "epoch": 1.8091412742382271, + "grad_norm": 0.675082802772522, + "learning_rate": 3.980346089091179e-06, + "loss": 0.6159, + "step": 6531 + }, + { + "epoch": 1.8094182825484766, + "grad_norm": 0.7030022740364075, + "learning_rate": 3.980052519426444e-06, + "loss": 0.6353, + "step": 6532 + }, + { + "epoch": 1.8096952908587256, + "grad_norm": 0.6523541808128357, + "learning_rate": 3.979758918336415e-06, + "loss": 0.6172, + "step": 6533 + }, + { + "epoch": 1.809972299168975, + "grad_norm": 0.6885202527046204, + "learning_rate": 3.979465285827325e-06, + "loss": 0.6017, + "step": 6534 + }, + { + "epoch": 1.8102493074792245, + "grad_norm": 0.7288880944252014, + "learning_rate": 3.979171621905409e-06, + "loss": 0.6022, + "step": 6535 + }, + { + "epoch": 1.8105263157894735, + "grad_norm": 0.6946640610694885, + "learning_rate": 3.978877926576901e-06, + "loss": 0.645, + "step": 6536 + }, + { + "epoch": 1.810803324099723, + "grad_norm": 0.7339047193527222, + "learning_rate": 3.978584199848039e-06, + "loss": 0.6403, + "step": 6537 + }, + { + "epoch": 1.8110803324099725, + "grad_norm": 0.6518142223358154, + "learning_rate": 3.9782904417250586e-06, + "loss": 0.566, + "step": 6538 + }, + { + "epoch": 1.8113573407202215, + "grad_norm": 0.6858428120613098, + "learning_rate": 3.977996652214197e-06, + "loss": 0.6623, + "step": 6539 + }, + { + "epoch": 1.811634349030471, + "grad_norm": 0.7024521231651306, + "learning_rate": 3.977702831321692e-06, + "loss": 0.672, + "step": 6540 + }, + { + "epoch": 1.8119113573407202, + "grad_norm": 0.7442790865898132, + "learning_rate": 3.977408979053781e-06, + "loss": 0.6212, + "step": 6541 + }, + { + "epoch": 1.8121883656509694, + "grad_norm": 0.6858800053596497, + "learning_rate": 3.977115095416706e-06, + "loss": 0.6096, + "step": 6542 + }, + { + "epoch": 1.8124653739612189, + "grad_norm": 0.7015238404273987, + "learning_rate": 3.9768211804167054e-06, + "loss": 0.6182, + "step": 6543 + }, + { + "epoch": 1.8127423822714681, + "grad_norm": 0.6668867468833923, + "learning_rate": 3.976527234060019e-06, + "loss": 0.6471, + "step": 6544 + }, + { + "epoch": 1.8130193905817173, + "grad_norm": 0.6626103520393372, + "learning_rate": 3.97623325635289e-06, + "loss": 0.6262, + "step": 6545 + }, + { + "epoch": 1.8132963988919668, + "grad_norm": 0.7281420826911926, + "learning_rate": 3.975939247301558e-06, + "loss": 0.6242, + "step": 6546 + }, + { + "epoch": 1.813573407202216, + "grad_norm": 0.7330793142318726, + "learning_rate": 3.975645206912267e-06, + "loss": 0.6098, + "step": 6547 + }, + { + "epoch": 1.8138504155124653, + "grad_norm": 0.6654852628707886, + "learning_rate": 3.97535113519126e-06, + "loss": 0.6616, + "step": 6548 + }, + { + "epoch": 1.8141274238227147, + "grad_norm": 0.7465592622756958, + "learning_rate": 3.975057032144779e-06, + "loss": 0.6399, + "step": 6549 + }, + { + "epoch": 1.814404432132964, + "grad_norm": 0.6892106533050537, + "learning_rate": 3.974762897779072e-06, + "loss": 0.6272, + "step": 6550 + }, + { + "epoch": 1.8146814404432132, + "grad_norm": 0.6937387585639954, + "learning_rate": 3.974468732100383e-06, + "loss": 0.6166, + "step": 6551 + }, + { + "epoch": 1.8149584487534627, + "grad_norm": 0.7007038593292236, + "learning_rate": 3.974174535114956e-06, + "loss": 0.629, + "step": 6552 + }, + { + "epoch": 1.815235457063712, + "grad_norm": 0.6984164714813232, + "learning_rate": 3.97388030682904e-06, + "loss": 0.6131, + "step": 6553 + }, + { + "epoch": 1.8155124653739612, + "grad_norm": 0.6829065084457397, + "learning_rate": 3.9735860472488806e-06, + "loss": 0.6543, + "step": 6554 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.7143397331237793, + "learning_rate": 3.973291756380727e-06, + "loss": 0.6692, + "step": 6555 + }, + { + "epoch": 1.8160664819944599, + "grad_norm": 0.7546378374099731, + "learning_rate": 3.972997434230826e-06, + "loss": 0.5824, + "step": 6556 + }, + { + "epoch": 1.816343490304709, + "grad_norm": 0.6992888450622559, + "learning_rate": 3.972703080805429e-06, + "loss": 0.6236, + "step": 6557 + }, + { + "epoch": 1.8166204986149586, + "grad_norm": 0.7112694978713989, + "learning_rate": 3.972408696110785e-06, + "loss": 0.648, + "step": 6558 + }, + { + "epoch": 1.8168975069252078, + "grad_norm": 0.7382538914680481, + "learning_rate": 3.972114280153143e-06, + "loss": 0.6204, + "step": 6559 + }, + { + "epoch": 1.817174515235457, + "grad_norm": 0.6987351179122925, + "learning_rate": 3.971819832938756e-06, + "loss": 0.5919, + "step": 6560 + }, + { + "epoch": 1.8174515235457065, + "grad_norm": 0.6935312151908875, + "learning_rate": 3.971525354473875e-06, + "loss": 0.6464, + "step": 6561 + }, + { + "epoch": 1.8177285318559557, + "grad_norm": 0.6827068328857422, + "learning_rate": 3.971230844764753e-06, + "loss": 0.613, + "step": 6562 + }, + { + "epoch": 1.818005540166205, + "grad_norm": 0.7268776297569275, + "learning_rate": 3.970936303817643e-06, + "loss": 0.6078, + "step": 6563 + }, + { + "epoch": 1.8182825484764544, + "grad_norm": 0.695104718208313, + "learning_rate": 3.970641731638799e-06, + "loss": 0.6382, + "step": 6564 + }, + { + "epoch": 1.8185595567867034, + "grad_norm": 0.6960898041725159, + "learning_rate": 3.9703471282344754e-06, + "loss": 0.6293, + "step": 6565 + }, + { + "epoch": 1.818836565096953, + "grad_norm": 0.7311190962791443, + "learning_rate": 3.970052493610928e-06, + "loss": 0.6128, + "step": 6566 + }, + { + "epoch": 1.8191135734072024, + "grad_norm": 0.6929040551185608, + "learning_rate": 3.969757827774412e-06, + "loss": 0.6017, + "step": 6567 + }, + { + "epoch": 1.8193905817174514, + "grad_norm": 0.6849446892738342, + "learning_rate": 3.969463130731183e-06, + "loss": 0.6455, + "step": 6568 + }, + { + "epoch": 1.8196675900277008, + "grad_norm": 0.685980498790741, + "learning_rate": 3.9691684024875e-06, + "loss": 0.6173, + "step": 6569 + }, + { + "epoch": 1.8199445983379503, + "grad_norm": 0.7096467614173889, + "learning_rate": 3.96887364304962e-06, + "loss": 0.5953, + "step": 6570 + }, + { + "epoch": 1.8202216066481993, + "grad_norm": 0.7245126962661743, + "learning_rate": 3.968578852423801e-06, + "loss": 0.6379, + "step": 6571 + }, + { + "epoch": 1.8204986149584488, + "grad_norm": 0.708109974861145, + "learning_rate": 3.9682840306163025e-06, + "loss": 0.5782, + "step": 6572 + }, + { + "epoch": 1.820775623268698, + "grad_norm": 0.7031292915344238, + "learning_rate": 3.967989177633385e-06, + "loss": 0.6398, + "step": 6573 + }, + { + "epoch": 1.8210526315789473, + "grad_norm": 0.6725582480430603, + "learning_rate": 3.9676942934813075e-06, + "loss": 0.5867, + "step": 6574 + }, + { + "epoch": 1.8213296398891967, + "grad_norm": 0.7046939730644226, + "learning_rate": 3.967399378166333e-06, + "loss": 0.6597, + "step": 6575 + }, + { + "epoch": 1.821606648199446, + "grad_norm": 0.7320119142532349, + "learning_rate": 3.967104431694722e-06, + "loss": 0.6644, + "step": 6576 + }, + { + "epoch": 1.8218836565096952, + "grad_norm": 0.664082944393158, + "learning_rate": 3.966809454072738e-06, + "loss": 0.6144, + "step": 6577 + }, + { + "epoch": 1.8221606648199447, + "grad_norm": 0.6912119388580322, + "learning_rate": 3.966514445306642e-06, + "loss": 0.5829, + "step": 6578 + }, + { + "epoch": 1.822437673130194, + "grad_norm": 0.6881527304649353, + "learning_rate": 3.9662194054027e-06, + "loss": 0.6185, + "step": 6579 + }, + { + "epoch": 1.8227146814404431, + "grad_norm": 0.6926375031471252, + "learning_rate": 3.965924334367176e-06, + "loss": 0.6179, + "step": 6580 + }, + { + "epoch": 1.8229916897506926, + "grad_norm": 0.6690463423728943, + "learning_rate": 3.965629232206335e-06, + "loss": 0.6289, + "step": 6581 + }, + { + "epoch": 1.8232686980609418, + "grad_norm": 0.7154456377029419, + "learning_rate": 3.965334098926441e-06, + "loss": 0.6325, + "step": 6582 + }, + { + "epoch": 1.823545706371191, + "grad_norm": 0.6615016460418701, + "learning_rate": 3.965038934533764e-06, + "loss": 0.5962, + "step": 6583 + }, + { + "epoch": 1.8238227146814405, + "grad_norm": 0.6844783425331116, + "learning_rate": 3.964743739034569e-06, + "loss": 0.6, + "step": 6584 + }, + { + "epoch": 1.8240997229916898, + "grad_norm": 0.802054762840271, + "learning_rate": 3.964448512435123e-06, + "loss": 0.612, + "step": 6585 + }, + { + "epoch": 1.824376731301939, + "grad_norm": 0.671402096748352, + "learning_rate": 3.964153254741695e-06, + "loss": 0.6258, + "step": 6586 + }, + { + "epoch": 1.8246537396121885, + "grad_norm": 0.6834940314292908, + "learning_rate": 3.963857965960555e-06, + "loss": 0.6245, + "step": 6587 + }, + { + "epoch": 1.8249307479224377, + "grad_norm": 0.7301408052444458, + "learning_rate": 3.963562646097972e-06, + "loss": 0.6207, + "step": 6588 + }, + { + "epoch": 1.825207756232687, + "grad_norm": 0.6522077918052673, + "learning_rate": 3.9632672951602166e-06, + "loss": 0.6154, + "step": 6589 + }, + { + "epoch": 1.8254847645429364, + "grad_norm": 0.7064364552497864, + "learning_rate": 3.9629719131535595e-06, + "loss": 0.6165, + "step": 6590 + }, + { + "epoch": 1.8257617728531856, + "grad_norm": 0.7141890525817871, + "learning_rate": 3.962676500084272e-06, + "loss": 0.6153, + "step": 6591 + }, + { + "epoch": 1.8260387811634349, + "grad_norm": 0.7095588445663452, + "learning_rate": 3.962381055958629e-06, + "loss": 0.6256, + "step": 6592 + }, + { + "epoch": 1.8263157894736843, + "grad_norm": 0.6960277557373047, + "learning_rate": 3.9620855807829e-06, + "loss": 0.6223, + "step": 6593 + }, + { + "epoch": 1.8265927977839334, + "grad_norm": 0.7400093078613281, + "learning_rate": 3.961790074563362e-06, + "loss": 0.6138, + "step": 6594 + }, + { + "epoch": 1.8268698060941828, + "grad_norm": 0.6910619735717773, + "learning_rate": 3.961494537306286e-06, + "loss": 0.6275, + "step": 6595 + }, + { + "epoch": 1.8271468144044323, + "grad_norm": 0.6945144534111023, + "learning_rate": 3.96119896901795e-06, + "loss": 0.6323, + "step": 6596 + }, + { + "epoch": 1.8274238227146813, + "grad_norm": 0.6521016955375671, + "learning_rate": 3.9609033697046275e-06, + "loss": 0.6486, + "step": 6597 + }, + { + "epoch": 1.8277008310249307, + "grad_norm": 0.6771376132965088, + "learning_rate": 3.960607739372596e-06, + "loss": 0.6332, + "step": 6598 + }, + { + "epoch": 1.8279778393351802, + "grad_norm": 0.6685909628868103, + "learning_rate": 3.960312078028132e-06, + "loss": 0.6175, + "step": 6599 + }, + { + "epoch": 1.8282548476454292, + "grad_norm": 0.7308417558670044, + "learning_rate": 3.9600163856775135e-06, + "loss": 0.6134, + "step": 6600 + }, + { + "epoch": 1.8285318559556787, + "grad_norm": 0.670316219329834, + "learning_rate": 3.959720662327018e-06, + "loss": 0.5853, + "step": 6601 + }, + { + "epoch": 1.828808864265928, + "grad_norm": 0.7013362646102905, + "learning_rate": 3.959424907982927e-06, + "loss": 0.647, + "step": 6602 + }, + { + "epoch": 1.8290858725761772, + "grad_norm": 0.6852704286575317, + "learning_rate": 3.959129122651516e-06, + "loss": 0.5743, + "step": 6603 + }, + { + "epoch": 1.8293628808864266, + "grad_norm": 0.6822395920753479, + "learning_rate": 3.958833306339069e-06, + "loss": 0.6286, + "step": 6604 + }, + { + "epoch": 1.8296398891966759, + "grad_norm": 0.7460020184516907, + "learning_rate": 3.9585374590518654e-06, + "loss": 0.6033, + "step": 6605 + }, + { + "epoch": 1.829916897506925, + "grad_norm": 0.641464352607727, + "learning_rate": 3.958241580796186e-06, + "loss": 0.5939, + "step": 6606 + }, + { + "epoch": 1.8301939058171746, + "grad_norm": 0.7084401249885559, + "learning_rate": 3.957945671578315e-06, + "loss": 0.6111, + "step": 6607 + }, + { + "epoch": 1.8304709141274238, + "grad_norm": 0.717842161655426, + "learning_rate": 3.957649731404533e-06, + "loss": 0.6218, + "step": 6608 + }, + { + "epoch": 1.830747922437673, + "grad_norm": 0.6891061663627625, + "learning_rate": 3.9573537602811256e-06, + "loss": 0.631, + "step": 6609 + }, + { + "epoch": 1.8310249307479225, + "grad_norm": 0.6817660927772522, + "learning_rate": 3.957057758214376e-06, + "loss": 0.6099, + "step": 6610 + }, + { + "epoch": 1.8313019390581717, + "grad_norm": 0.6792852282524109, + "learning_rate": 3.95676172521057e-06, + "loss": 0.5927, + "step": 6611 + }, + { + "epoch": 1.831578947368421, + "grad_norm": 0.7312888503074646, + "learning_rate": 3.9564656612759904e-06, + "loss": 0.6568, + "step": 6612 + }, + { + "epoch": 1.8318559556786704, + "grad_norm": 0.6890841126441956, + "learning_rate": 3.956169566416928e-06, + "loss": 0.6552, + "step": 6613 + }, + { + "epoch": 1.8321329639889197, + "grad_norm": 0.688114583492279, + "learning_rate": 3.955873440639665e-06, + "loss": 0.6047, + "step": 6614 + }, + { + "epoch": 1.832409972299169, + "grad_norm": 0.7114209532737732, + "learning_rate": 3.955577283950491e-06, + "loss": 0.6245, + "step": 6615 + }, + { + "epoch": 1.8326869806094184, + "grad_norm": 0.7178516983985901, + "learning_rate": 3.9552810963556955e-06, + "loss": 0.6064, + "step": 6616 + }, + { + "epoch": 1.8329639889196676, + "grad_norm": 0.6884858012199402, + "learning_rate": 3.954984877861565e-06, + "loss": 0.6177, + "step": 6617 + }, + { + "epoch": 1.8332409972299168, + "grad_norm": 0.6996463537216187, + "learning_rate": 3.95468862847439e-06, + "loss": 0.6554, + "step": 6618 + }, + { + "epoch": 1.8335180055401663, + "grad_norm": 0.7320404648780823, + "learning_rate": 3.95439234820046e-06, + "loss": 0.632, + "step": 6619 + }, + { + "epoch": 1.8337950138504155, + "grad_norm": 0.6940857172012329, + "learning_rate": 3.954096037046068e-06, + "loss": 0.6096, + "step": 6620 + }, + { + "epoch": 1.8340720221606648, + "grad_norm": 0.7151522636413574, + "learning_rate": 3.953799695017503e-06, + "loss": 0.6366, + "step": 6621 + }, + { + "epoch": 1.8343490304709142, + "grad_norm": 0.7152765989303589, + "learning_rate": 3.9535033221210574e-06, + "loss": 0.619, + "step": 6622 + }, + { + "epoch": 1.8346260387811635, + "grad_norm": 0.6934623718261719, + "learning_rate": 3.953206918363024e-06, + "loss": 0.629, + "step": 6623 + }, + { + "epoch": 1.8349030470914127, + "grad_norm": 0.7444363236427307, + "learning_rate": 3.952910483749698e-06, + "loss": 0.6337, + "step": 6624 + }, + { + "epoch": 1.8351800554016622, + "grad_norm": 0.7064351439476013, + "learning_rate": 3.952614018287371e-06, + "loss": 0.6309, + "step": 6625 + }, + { + "epoch": 1.8354570637119112, + "grad_norm": 0.7176104784011841, + "learning_rate": 3.9523175219823394e-06, + "loss": 0.6072, + "step": 6626 + }, + { + "epoch": 1.8357340720221607, + "grad_norm": 0.7179607152938843, + "learning_rate": 3.952020994840898e-06, + "loss": 0.6372, + "step": 6627 + }, + { + "epoch": 1.8360110803324101, + "grad_norm": 0.7062599062919617, + "learning_rate": 3.951724436869343e-06, + "loss": 0.6095, + "step": 6628 + }, + { + "epoch": 1.8362880886426591, + "grad_norm": 0.6884281635284424, + "learning_rate": 3.95142784807397e-06, + "loss": 0.5208, + "step": 6629 + }, + { + "epoch": 1.8365650969529086, + "grad_norm": 0.7327911257743835, + "learning_rate": 3.9511312284610785e-06, + "loss": 0.6686, + "step": 6630 + }, + { + "epoch": 1.836842105263158, + "grad_norm": 0.6697911024093628, + "learning_rate": 3.950834578036964e-06, + "loss": 0.6197, + "step": 6631 + }, + { + "epoch": 1.837119113573407, + "grad_norm": 0.6918233036994934, + "learning_rate": 3.950537896807926e-06, + "loss": 0.6175, + "step": 6632 + }, + { + "epoch": 1.8373961218836565, + "grad_norm": 0.7282374501228333, + "learning_rate": 3.950241184780266e-06, + "loss": 0.6044, + "step": 6633 + }, + { + "epoch": 1.8376731301939058, + "grad_norm": 0.6883581876754761, + "learning_rate": 3.94994444196028e-06, + "loss": 0.6207, + "step": 6634 + }, + { + "epoch": 1.837950138504155, + "grad_norm": 0.658677339553833, + "learning_rate": 3.949647668354273e-06, + "loss": 0.564, + "step": 6635 + }, + { + "epoch": 1.8382271468144045, + "grad_norm": 0.7224382758140564, + "learning_rate": 3.949350863968543e-06, + "loss": 0.651, + "step": 6636 + }, + { + "epoch": 1.8385041551246537, + "grad_norm": 0.675486147403717, + "learning_rate": 3.949054028809393e-06, + "loss": 0.6152, + "step": 6637 + }, + { + "epoch": 1.838781163434903, + "grad_norm": 0.7212086319923401, + "learning_rate": 3.9487571628831255e-06, + "loss": 0.6426, + "step": 6638 + }, + { + "epoch": 1.8390581717451524, + "grad_norm": 0.6830087900161743, + "learning_rate": 3.948460266196043e-06, + "loss": 0.6306, + "step": 6639 + }, + { + "epoch": 1.8393351800554016, + "grad_norm": 0.7165666222572327, + "learning_rate": 3.948163338754452e-06, + "loss": 0.6318, + "step": 6640 + }, + { + "epoch": 1.8396121883656509, + "grad_norm": 0.689459502696991, + "learning_rate": 3.947866380564654e-06, + "loss": 0.6096, + "step": 6641 + }, + { + "epoch": 1.8398891966759003, + "grad_norm": 0.6928114295005798, + "learning_rate": 3.9475693916329536e-06, + "loss": 0.6445, + "step": 6642 + }, + { + "epoch": 1.8401662049861496, + "grad_norm": 0.6756209135055542, + "learning_rate": 3.94727237196566e-06, + "loss": 0.6097, + "step": 6643 + }, + { + "epoch": 1.8404432132963988, + "grad_norm": 0.7100651860237122, + "learning_rate": 3.9469753215690785e-06, + "loss": 0.6057, + "step": 6644 + }, + { + "epoch": 1.8407202216066483, + "grad_norm": 0.6669239401817322, + "learning_rate": 3.946678240449515e-06, + "loss": 0.5732, + "step": 6645 + }, + { + "epoch": 1.8409972299168975, + "grad_norm": 0.7072580456733704, + "learning_rate": 3.946381128613278e-06, + "loss": 0.5744, + "step": 6646 + }, + { + "epoch": 1.8412742382271468, + "grad_norm": 0.6962587833404541, + "learning_rate": 3.946083986066676e-06, + "loss": 0.6353, + "step": 6647 + }, + { + "epoch": 1.8415512465373962, + "grad_norm": 0.7108308672904968, + "learning_rate": 3.945786812816018e-06, + "loss": 0.5871, + "step": 6648 + }, + { + "epoch": 1.8418282548476455, + "grad_norm": 0.7100950479507446, + "learning_rate": 3.945489608867614e-06, + "loss": 0.6274, + "step": 6649 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.6926645040512085, + "learning_rate": 3.945192374227774e-06, + "loss": 0.5946, + "step": 6650 + }, + { + "epoch": 1.8423822714681442, + "grad_norm": 0.7518139481544495, + "learning_rate": 3.94489510890281e-06, + "loss": 0.6722, + "step": 6651 + }, + { + "epoch": 1.8426592797783934, + "grad_norm": 0.7210699915885925, + "learning_rate": 3.944597812899033e-06, + "loss": 0.6258, + "step": 6652 + }, + { + "epoch": 1.8429362880886426, + "grad_norm": 0.7152932286262512, + "learning_rate": 3.944300486222754e-06, + "loss": 0.5441, + "step": 6653 + }, + { + "epoch": 1.843213296398892, + "grad_norm": 0.7112598419189453, + "learning_rate": 3.9440031288802885e-06, + "loss": 0.6072, + "step": 6654 + }, + { + "epoch": 1.8434903047091413, + "grad_norm": 0.6846300959587097, + "learning_rate": 3.943705740877949e-06, + "loss": 0.6604, + "step": 6655 + }, + { + "epoch": 1.8437673130193906, + "grad_norm": 0.7109817266464233, + "learning_rate": 3.943408322222049e-06, + "loss": 0.5605, + "step": 6656 + }, + { + "epoch": 1.84404432132964, + "grad_norm": 0.669664740562439, + "learning_rate": 3.943110872918905e-06, + "loss": 0.5983, + "step": 6657 + }, + { + "epoch": 1.844321329639889, + "grad_norm": 0.6921775937080383, + "learning_rate": 3.942813392974832e-06, + "loss": 0.6385, + "step": 6658 + }, + { + "epoch": 1.8445983379501385, + "grad_norm": 0.6840718388557434, + "learning_rate": 3.942515882396145e-06, + "loss": 0.6221, + "step": 6659 + }, + { + "epoch": 1.844875346260388, + "grad_norm": 0.7287196516990662, + "learning_rate": 3.942218341189163e-06, + "loss": 0.6248, + "step": 6660 + }, + { + "epoch": 1.845152354570637, + "grad_norm": 0.7228674292564392, + "learning_rate": 3.941920769360202e-06, + "loss": 0.6277, + "step": 6661 + }, + { + "epoch": 1.8454293628808864, + "grad_norm": 0.688456654548645, + "learning_rate": 3.941623166915582e-06, + "loss": 0.5955, + "step": 6662 + }, + { + "epoch": 1.845706371191136, + "grad_norm": 0.6912646293640137, + "learning_rate": 3.941325533861619e-06, + "loss": 0.6411, + "step": 6663 + }, + { + "epoch": 1.845983379501385, + "grad_norm": 0.6859648823738098, + "learning_rate": 3.941027870204634e-06, + "loss": 0.6436, + "step": 6664 + }, + { + "epoch": 1.8462603878116344, + "grad_norm": 0.7074818015098572, + "learning_rate": 3.94073017595095e-06, + "loss": 0.6248, + "step": 6665 + }, + { + "epoch": 1.8465373961218836, + "grad_norm": 0.7277980446815491, + "learning_rate": 3.9404324511068825e-06, + "loss": 0.6328, + "step": 6666 + }, + { + "epoch": 1.8468144044321329, + "grad_norm": 0.6855519413948059, + "learning_rate": 3.940134695678757e-06, + "loss": 0.6123, + "step": 6667 + }, + { + "epoch": 1.8470914127423823, + "grad_norm": 0.7200990915298462, + "learning_rate": 3.939836909672893e-06, + "loss": 0.6347, + "step": 6668 + }, + { + "epoch": 1.8473684210526315, + "grad_norm": 0.73331218957901, + "learning_rate": 3.939539093095616e-06, + "loss": 0.6002, + "step": 6669 + }, + { + "epoch": 1.8476454293628808, + "grad_norm": 0.6926178932189941, + "learning_rate": 3.9392412459532466e-06, + "loss": 0.6748, + "step": 6670 + }, + { + "epoch": 1.8479224376731302, + "grad_norm": 0.704434871673584, + "learning_rate": 3.93894336825211e-06, + "loss": 0.6212, + "step": 6671 + }, + { + "epoch": 1.8481994459833795, + "grad_norm": 0.6933774948120117, + "learning_rate": 3.938645459998532e-06, + "loss": 0.6593, + "step": 6672 + }, + { + "epoch": 1.8484764542936287, + "grad_norm": 0.7267084121704102, + "learning_rate": 3.938347521198837e-06, + "loss": 0.646, + "step": 6673 + }, + { + "epoch": 1.8487534626038782, + "grad_norm": 0.6638674736022949, + "learning_rate": 3.938049551859351e-06, + "loss": 0.6442, + "step": 6674 + }, + { + "epoch": 1.8490304709141274, + "grad_norm": 0.7296227216720581, + "learning_rate": 3.9377515519864e-06, + "loss": 0.6231, + "step": 6675 + }, + { + "epoch": 1.8493074792243767, + "grad_norm": 0.6873810887336731, + "learning_rate": 3.937453521586312e-06, + "loss": 0.6157, + "step": 6676 + }, + { + "epoch": 1.8495844875346261, + "grad_norm": 0.6810946464538574, + "learning_rate": 3.937155460665415e-06, + "loss": 0.6375, + "step": 6677 + }, + { + "epoch": 1.8498614958448754, + "grad_norm": 0.6755537986755371, + "learning_rate": 3.936857369230037e-06, + "loss": 0.5826, + "step": 6678 + }, + { + "epoch": 1.8501385041551246, + "grad_norm": 0.6881328225135803, + "learning_rate": 3.936559247286509e-06, + "loss": 0.6105, + "step": 6679 + }, + { + "epoch": 1.850415512465374, + "grad_norm": 0.7300723791122437, + "learning_rate": 3.936261094841159e-06, + "loss": 0.622, + "step": 6680 + }, + { + "epoch": 1.8506925207756233, + "grad_norm": 0.7086427807807922, + "learning_rate": 3.935962911900317e-06, + "loss": 0.6495, + "step": 6681 + }, + { + "epoch": 1.8509695290858725, + "grad_norm": 0.6515028476715088, + "learning_rate": 3.9356646984703175e-06, + "loss": 0.6109, + "step": 6682 + }, + { + "epoch": 1.851246537396122, + "grad_norm": 0.72528076171875, + "learning_rate": 3.935366454557488e-06, + "loss": 0.6304, + "step": 6683 + }, + { + "epoch": 1.8515235457063712, + "grad_norm": 0.6935245394706726, + "learning_rate": 3.935068180168165e-06, + "loss": 0.6425, + "step": 6684 + }, + { + "epoch": 1.8518005540166205, + "grad_norm": 0.7298848628997803, + "learning_rate": 3.934769875308678e-06, + "loss": 0.6371, + "step": 6685 + }, + { + "epoch": 1.85207756232687, + "grad_norm": 0.7072490453720093, + "learning_rate": 3.934471539985364e-06, + "loss": 0.5982, + "step": 6686 + }, + { + "epoch": 1.8523545706371192, + "grad_norm": 0.7231565117835999, + "learning_rate": 3.934173174204555e-06, + "loss": 0.647, + "step": 6687 + }, + { + "epoch": 1.8526315789473684, + "grad_norm": 0.7302079796791077, + "learning_rate": 3.933874777972589e-06, + "loss": 0.6555, + "step": 6688 + }, + { + "epoch": 1.8529085872576179, + "grad_norm": 0.6899899840354919, + "learning_rate": 3.933576351295797e-06, + "loss": 0.5908, + "step": 6689 + }, + { + "epoch": 1.8531855955678669, + "grad_norm": 0.6647062301635742, + "learning_rate": 3.933277894180519e-06, + "loss": 0.6456, + "step": 6690 + }, + { + "epoch": 1.8534626038781163, + "grad_norm": 0.7126433253288269, + "learning_rate": 3.932979406633091e-06, + "loss": 0.6684, + "step": 6691 + }, + { + "epoch": 1.8537396121883658, + "grad_norm": 0.7348646521568298, + "learning_rate": 3.932680888659852e-06, + "loss": 0.6495, + "step": 6692 + }, + { + "epoch": 1.8540166204986148, + "grad_norm": 0.6930987238883972, + "learning_rate": 3.932382340267138e-06, + "loss": 0.6132, + "step": 6693 + }, + { + "epoch": 1.8542936288088643, + "grad_norm": 0.7253433465957642, + "learning_rate": 3.93208376146129e-06, + "loss": 0.644, + "step": 6694 + }, + { + "epoch": 1.8545706371191135, + "grad_norm": 0.679956316947937, + "learning_rate": 3.931785152248645e-06, + "loss": 0.6581, + "step": 6695 + }, + { + "epoch": 1.8548476454293628, + "grad_norm": 0.742157518863678, + "learning_rate": 3.931486512635546e-06, + "loss": 0.6561, + "step": 6696 + }, + { + "epoch": 1.8551246537396122, + "grad_norm": 0.7009865045547485, + "learning_rate": 3.931187842628332e-06, + "loss": 0.6122, + "step": 6697 + }, + { + "epoch": 1.8554016620498615, + "grad_norm": 0.704265832901001, + "learning_rate": 3.930889142233346e-06, + "loss": 0.6643, + "step": 6698 + }, + { + "epoch": 1.8556786703601107, + "grad_norm": 0.687735915184021, + "learning_rate": 3.9305904114569295e-06, + "loss": 0.6302, + "step": 6699 + }, + { + "epoch": 1.8559556786703602, + "grad_norm": 0.6906552910804749, + "learning_rate": 3.930291650305424e-06, + "loss": 0.6103, + "step": 6700 + }, + { + "epoch": 1.8562326869806094, + "grad_norm": 0.6972749829292297, + "learning_rate": 3.929992858785176e-06, + "loss": 0.624, + "step": 6701 + }, + { + "epoch": 1.8565096952908586, + "grad_norm": 0.74320387840271, + "learning_rate": 3.929694036902526e-06, + "loss": 0.691, + "step": 6702 + }, + { + "epoch": 1.856786703601108, + "grad_norm": 0.6657626032829285, + "learning_rate": 3.929395184663822e-06, + "loss": 0.616, + "step": 6703 + }, + { + "epoch": 1.8570637119113573, + "grad_norm": 0.681754469871521, + "learning_rate": 3.929096302075407e-06, + "loss": 0.6234, + "step": 6704 + }, + { + "epoch": 1.8573407202216066, + "grad_norm": 0.7159410119056702, + "learning_rate": 3.928797389143628e-06, + "loss": 0.6834, + "step": 6705 + }, + { + "epoch": 1.857617728531856, + "grad_norm": 0.6623358726501465, + "learning_rate": 3.9284984458748325e-06, + "loss": 0.5949, + "step": 6706 + }, + { + "epoch": 1.8578947368421053, + "grad_norm": 0.6965116858482361, + "learning_rate": 3.928199472275367e-06, + "loss": 0.6455, + "step": 6707 + }, + { + "epoch": 1.8581717451523545, + "grad_norm": 0.6975403428077698, + "learning_rate": 3.9279004683515785e-06, + "loss": 0.6341, + "step": 6708 + }, + { + "epoch": 1.858448753462604, + "grad_norm": 0.7415390014648438, + "learning_rate": 3.927601434109816e-06, + "loss": 0.6179, + "step": 6709 + }, + { + "epoch": 1.8587257617728532, + "grad_norm": 0.7149662971496582, + "learning_rate": 3.927302369556431e-06, + "loss": 0.6419, + "step": 6710 + }, + { + "epoch": 1.8590027700831024, + "grad_norm": 0.756695032119751, + "learning_rate": 3.927003274697772e-06, + "loss": 0.6325, + "step": 6711 + }, + { + "epoch": 1.859279778393352, + "grad_norm": 0.692173182964325, + "learning_rate": 3.926704149540188e-06, + "loss": 0.6093, + "step": 6712 + }, + { + "epoch": 1.8595567867036011, + "grad_norm": 0.7355810403823853, + "learning_rate": 3.9264049940900315e-06, + "loss": 0.6555, + "step": 6713 + }, + { + "epoch": 1.8598337950138504, + "grad_norm": 0.7171083688735962, + "learning_rate": 3.926105808353655e-06, + "loss": 0.6567, + "step": 6714 + }, + { + "epoch": 1.8601108033240998, + "grad_norm": 0.6942392587661743, + "learning_rate": 3.9258065923374104e-06, + "loss": 0.6598, + "step": 6715 + }, + { + "epoch": 1.860387811634349, + "grad_norm": 0.7160379886627197, + "learning_rate": 3.925507346047651e-06, + "loss": 0.6218, + "step": 6716 + }, + { + "epoch": 1.8606648199445983, + "grad_norm": 0.6698644757270813, + "learning_rate": 3.925208069490729e-06, + "loss": 0.6812, + "step": 6717 + }, + { + "epoch": 1.8609418282548478, + "grad_norm": 0.6921928524971008, + "learning_rate": 3.924908762673001e-06, + "loss": 0.6248, + "step": 6718 + }, + { + "epoch": 1.8612188365650968, + "grad_norm": 0.6881176233291626, + "learning_rate": 3.924609425600822e-06, + "loss": 0.5868, + "step": 6719 + }, + { + "epoch": 1.8614958448753463, + "grad_norm": 0.6716845035552979, + "learning_rate": 3.924310058280545e-06, + "loss": 0.6136, + "step": 6720 + }, + { + "epoch": 1.8617728531855957, + "grad_norm": 0.7087962031364441, + "learning_rate": 3.924010660718529e-06, + "loss": 0.6524, + "step": 6721 + }, + { + "epoch": 1.8620498614958447, + "grad_norm": 0.688025951385498, + "learning_rate": 3.92371123292113e-06, + "loss": 0.574, + "step": 6722 + }, + { + "epoch": 1.8623268698060942, + "grad_norm": 0.7003433108329773, + "learning_rate": 3.923411774894707e-06, + "loss": 0.6171, + "step": 6723 + }, + { + "epoch": 1.8626038781163436, + "grad_norm": 0.7128757238388062, + "learning_rate": 3.923112286645615e-06, + "loss": 0.6208, + "step": 6724 + }, + { + "epoch": 1.8628808864265927, + "grad_norm": 0.6693975925445557, + "learning_rate": 3.922812768180216e-06, + "loss": 0.5917, + "step": 6725 + }, + { + "epoch": 1.8631578947368421, + "grad_norm": 0.7504602074623108, + "learning_rate": 3.922513219504869e-06, + "loss": 0.6333, + "step": 6726 + }, + { + "epoch": 1.8634349030470914, + "grad_norm": 0.6776396632194519, + "learning_rate": 3.922213640625933e-06, + "loss": 0.6532, + "step": 6727 + }, + { + "epoch": 1.8637119113573406, + "grad_norm": 0.7138490080833435, + "learning_rate": 3.921914031549769e-06, + "loss": 0.6541, + "step": 6728 + }, + { + "epoch": 1.86398891966759, + "grad_norm": 0.6899004578590393, + "learning_rate": 3.92161439228274e-06, + "loss": 0.6395, + "step": 6729 + }, + { + "epoch": 1.8642659279778393, + "grad_norm": 0.6944865584373474, + "learning_rate": 3.921314722831205e-06, + "loss": 0.5886, + "step": 6730 + }, + { + "epoch": 1.8645429362880885, + "grad_norm": 0.7385538220405579, + "learning_rate": 3.9210150232015305e-06, + "loss": 0.6616, + "step": 6731 + }, + { + "epoch": 1.864819944598338, + "grad_norm": 0.7503032684326172, + "learning_rate": 3.920715293400079e-06, + "loss": 0.612, + "step": 6732 + }, + { + "epoch": 1.8650969529085872, + "grad_norm": 0.690791130065918, + "learning_rate": 3.920415533433212e-06, + "loss": 0.6164, + "step": 6733 + }, + { + "epoch": 1.8653739612188365, + "grad_norm": 0.7362712025642395, + "learning_rate": 3.920115743307297e-06, + "loss": 0.5862, + "step": 6734 + }, + { + "epoch": 1.865650969529086, + "grad_norm": 0.7012346982955933, + "learning_rate": 3.919815923028697e-06, + "loss": 0.6398, + "step": 6735 + }, + { + "epoch": 1.8659279778393352, + "grad_norm": 0.6703203916549683, + "learning_rate": 3.9195160726037805e-06, + "loss": 0.6499, + "step": 6736 + }, + { + "epoch": 1.8662049861495844, + "grad_norm": 0.7024924755096436, + "learning_rate": 3.919216192038912e-06, + "loss": 0.6364, + "step": 6737 + }, + { + "epoch": 1.8664819944598339, + "grad_norm": 0.6496801972389221, + "learning_rate": 3.918916281340459e-06, + "loss": 0.5743, + "step": 6738 + }, + { + "epoch": 1.866759002770083, + "grad_norm": 0.7026786804199219, + "learning_rate": 3.918616340514791e-06, + "loss": 0.6141, + "step": 6739 + }, + { + "epoch": 1.8670360110803323, + "grad_norm": 0.7047402262687683, + "learning_rate": 3.918316369568275e-06, + "loss": 0.6274, + "step": 6740 + }, + { + "epoch": 1.8673130193905818, + "grad_norm": 0.7136473655700684, + "learning_rate": 3.918016368507279e-06, + "loss": 0.6346, + "step": 6741 + }, + { + "epoch": 1.867590027700831, + "grad_norm": 0.6642279624938965, + "learning_rate": 3.917716337338176e-06, + "loss": 0.6038, + "step": 6742 + }, + { + "epoch": 1.8678670360110803, + "grad_norm": 0.6472945809364319, + "learning_rate": 3.917416276067334e-06, + "loss": 0.6186, + "step": 6743 + }, + { + "epoch": 1.8681440443213297, + "grad_norm": 0.7538015842437744, + "learning_rate": 3.917116184701125e-06, + "loss": 0.6696, + "step": 6744 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.6811779141426086, + "learning_rate": 3.916816063245921e-06, + "loss": 0.5859, + "step": 6745 + }, + { + "epoch": 1.8686980609418282, + "grad_norm": 0.6205077767372131, + "learning_rate": 3.916515911708093e-06, + "loss": 0.4999, + "step": 6746 + }, + { + "epoch": 1.8689750692520777, + "grad_norm": 0.69630366563797, + "learning_rate": 3.916215730094015e-06, + "loss": 0.6043, + "step": 6747 + }, + { + "epoch": 1.869252077562327, + "grad_norm": 0.6904569268226624, + "learning_rate": 3.91591551841006e-06, + "loss": 0.6165, + "step": 6748 + }, + { + "epoch": 1.8695290858725762, + "grad_norm": 0.6998686790466309, + "learning_rate": 3.9156152766626036e-06, + "loss": 0.5871, + "step": 6749 + }, + { + "epoch": 1.8698060941828256, + "grad_norm": 0.6952171921730042, + "learning_rate": 3.915315004858019e-06, + "loss": 0.63, + "step": 6750 + }, + { + "epoch": 1.8700831024930746, + "grad_norm": 0.7425678372383118, + "learning_rate": 3.915014703002683e-06, + "loss": 0.634, + "step": 6751 + }, + { + "epoch": 1.870360110803324, + "grad_norm": 0.6801332831382751, + "learning_rate": 3.914714371102971e-06, + "loss": 0.6623, + "step": 6752 + }, + { + "epoch": 1.8706371191135736, + "grad_norm": 0.7020564079284668, + "learning_rate": 3.914414009165261e-06, + "loss": 0.5887, + "step": 6753 + }, + { + "epoch": 1.8709141274238226, + "grad_norm": 0.7230193018913269, + "learning_rate": 3.914113617195928e-06, + "loss": 0.5858, + "step": 6754 + }, + { + "epoch": 1.871191135734072, + "grad_norm": 0.6767334938049316, + "learning_rate": 3.9138131952013535e-06, + "loss": 0.6358, + "step": 6755 + }, + { + "epoch": 1.8714681440443215, + "grad_norm": 0.6952727437019348, + "learning_rate": 3.913512743187913e-06, + "loss": 0.618, + "step": 6756 + }, + { + "epoch": 1.8717451523545705, + "grad_norm": 0.7013217210769653, + "learning_rate": 3.913212261161988e-06, + "loss": 0.597, + "step": 6757 + }, + { + "epoch": 1.87202216066482, + "grad_norm": 0.6710230112075806, + "learning_rate": 3.912911749129957e-06, + "loss": 0.6258, + "step": 6758 + }, + { + "epoch": 1.8722991689750692, + "grad_norm": 0.6895468235015869, + "learning_rate": 3.912611207098202e-06, + "loss": 0.6536, + "step": 6759 + }, + { + "epoch": 1.8725761772853184, + "grad_norm": 0.7238779067993164, + "learning_rate": 3.9123106350731035e-06, + "loss": 0.6386, + "step": 6760 + }, + { + "epoch": 1.872853185595568, + "grad_norm": 0.705436110496521, + "learning_rate": 3.912010033061043e-06, + "loss": 0.6253, + "step": 6761 + }, + { + "epoch": 1.8731301939058171, + "grad_norm": 0.7319697737693787, + "learning_rate": 3.9117094010684046e-06, + "loss": 0.6386, + "step": 6762 + }, + { + "epoch": 1.8734072022160664, + "grad_norm": 0.6451613903045654, + "learning_rate": 3.911408739101569e-06, + "loss": 0.622, + "step": 6763 + }, + { + "epoch": 1.8736842105263158, + "grad_norm": 0.7184947729110718, + "learning_rate": 3.911108047166924e-06, + "loss": 0.6283, + "step": 6764 + }, + { + "epoch": 1.873961218836565, + "grad_norm": 0.7210145592689514, + "learning_rate": 3.91080732527085e-06, + "loss": 0.589, + "step": 6765 + }, + { + "epoch": 1.8742382271468143, + "grad_norm": 0.6852824091911316, + "learning_rate": 3.910506573419734e-06, + "loss": 0.6923, + "step": 6766 + }, + { + "epoch": 1.8745152354570638, + "grad_norm": 0.6745520234107971, + "learning_rate": 3.91020579161996e-06, + "loss": 0.6075, + "step": 6767 + }, + { + "epoch": 1.874792243767313, + "grad_norm": 0.6957927346229553, + "learning_rate": 3.9099049798779175e-06, + "loss": 0.6381, + "step": 6768 + }, + { + "epoch": 1.8750692520775623, + "grad_norm": 0.7385134100914001, + "learning_rate": 3.9096041381999915e-06, + "loss": 0.6347, + "step": 6769 + }, + { + "epoch": 1.8753462603878117, + "grad_norm": 0.6691296696662903, + "learning_rate": 3.90930326659257e-06, + "loss": 0.6456, + "step": 6770 + }, + { + "epoch": 1.875623268698061, + "grad_norm": 0.7090333700180054, + "learning_rate": 3.90900236506204e-06, + "loss": 0.6446, + "step": 6771 + }, + { + "epoch": 1.8759002770083102, + "grad_norm": 0.6893540024757385, + "learning_rate": 3.908701433614793e-06, + "loss": 0.5945, + "step": 6772 + }, + { + "epoch": 1.8761772853185597, + "grad_norm": 0.7020983695983887, + "learning_rate": 3.908400472257216e-06, + "loss": 0.565, + "step": 6773 + }, + { + "epoch": 1.876454293628809, + "grad_norm": 0.705759584903717, + "learning_rate": 3.908099480995701e-06, + "loss": 0.6329, + "step": 6774 + }, + { + "epoch": 1.8767313019390581, + "grad_norm": 0.7019346952438354, + "learning_rate": 3.907798459836638e-06, + "loss": 0.6426, + "step": 6775 + }, + { + "epoch": 1.8770083102493076, + "grad_norm": 0.8493272066116333, + "learning_rate": 3.907497408786418e-06, + "loss": 0.6011, + "step": 6776 + }, + { + "epoch": 1.8772853185595568, + "grad_norm": 0.6631448864936829, + "learning_rate": 3.9071963278514345e-06, + "loss": 0.6352, + "step": 6777 + }, + { + "epoch": 1.877562326869806, + "grad_norm": 0.7411801218986511, + "learning_rate": 3.906895217038079e-06, + "loss": 0.6191, + "step": 6778 + }, + { + "epoch": 1.8778393351800555, + "grad_norm": 0.7120534181594849, + "learning_rate": 3.906594076352746e-06, + "loss": 0.6346, + "step": 6779 + }, + { + "epoch": 1.8781163434903048, + "grad_norm": 0.6764026284217834, + "learning_rate": 3.906292905801828e-06, + "loss": 0.5605, + "step": 6780 + }, + { + "epoch": 1.878393351800554, + "grad_norm": 0.7547695636749268, + "learning_rate": 3.90599170539172e-06, + "loss": 0.6406, + "step": 6781 + }, + { + "epoch": 1.8786703601108035, + "grad_norm": 0.688042938709259, + "learning_rate": 3.9056904751288185e-06, + "loss": 0.6302, + "step": 6782 + }, + { + "epoch": 1.8789473684210525, + "grad_norm": 0.7339063882827759, + "learning_rate": 3.905389215019518e-06, + "loss": 0.6323, + "step": 6783 + }, + { + "epoch": 1.879224376731302, + "grad_norm": 0.7102497816085815, + "learning_rate": 3.905087925070214e-06, + "loss": 0.5828, + "step": 6784 + }, + { + "epoch": 1.8795013850415514, + "grad_norm": 0.7145693898200989, + "learning_rate": 3.9047866052873075e-06, + "loss": 0.6135, + "step": 6785 + }, + { + "epoch": 1.8797783933518004, + "grad_norm": 0.7017257213592529, + "learning_rate": 3.904485255677193e-06, + "loss": 0.6143, + "step": 6786 + }, + { + "epoch": 1.8800554016620499, + "grad_norm": 0.6663693785667419, + "learning_rate": 3.90418387624627e-06, + "loss": 0.6185, + "step": 6787 + }, + { + "epoch": 1.8803324099722993, + "grad_norm": 0.7595977783203125, + "learning_rate": 3.903882467000938e-06, + "loss": 0.6048, + "step": 6788 + }, + { + "epoch": 1.8806094182825484, + "grad_norm": 0.7022473812103271, + "learning_rate": 3.9035810279475945e-06, + "loss": 0.6308, + "step": 6789 + }, + { + "epoch": 1.8808864265927978, + "grad_norm": 0.7909603714942932, + "learning_rate": 3.903279559092642e-06, + "loss": 0.6012, + "step": 6790 + }, + { + "epoch": 1.881163434903047, + "grad_norm": 0.6895849704742432, + "learning_rate": 3.902978060442482e-06, + "loss": 0.634, + "step": 6791 + }, + { + "epoch": 1.8814404432132963, + "grad_norm": 0.7165695428848267, + "learning_rate": 3.902676532003513e-06, + "loss": 0.6076, + "step": 6792 + }, + { + "epoch": 1.8817174515235457, + "grad_norm": 0.7227752804756165, + "learning_rate": 3.902374973782141e-06, + "loss": 0.6158, + "step": 6793 + }, + { + "epoch": 1.881994459833795, + "grad_norm": 0.716238260269165, + "learning_rate": 3.9020733857847665e-06, + "loss": 0.6021, + "step": 6794 + }, + { + "epoch": 1.8822714681440442, + "grad_norm": 0.7465752959251404, + "learning_rate": 3.901771768017793e-06, + "loss": 0.6563, + "step": 6795 + }, + { + "epoch": 1.8825484764542937, + "grad_norm": 0.7316117882728577, + "learning_rate": 3.901470120487626e-06, + "loss": 0.6235, + "step": 6796 + }, + { + "epoch": 1.882825484764543, + "grad_norm": 0.7191451191902161, + "learning_rate": 3.901168443200669e-06, + "loss": 0.668, + "step": 6797 + }, + { + "epoch": 1.8831024930747922, + "grad_norm": 0.7118086218833923, + "learning_rate": 3.900866736163326e-06, + "loss": 0.5964, + "step": 6798 + }, + { + "epoch": 1.8833795013850416, + "grad_norm": 0.6637589931488037, + "learning_rate": 3.900564999382007e-06, + "loss": 0.6217, + "step": 6799 + }, + { + "epoch": 1.8836565096952909, + "grad_norm": 0.7318509221076965, + "learning_rate": 3.900263232863117e-06, + "loss": 0.5931, + "step": 6800 + }, + { + "epoch": 1.88393351800554, + "grad_norm": 0.6680594086647034, + "learning_rate": 3.899961436613061e-06, + "loss": 0.631, + "step": 6801 + }, + { + "epoch": 1.8842105263157896, + "grad_norm": 0.6916776299476624, + "learning_rate": 3.8996596106382486e-06, + "loss": 0.6443, + "step": 6802 + }, + { + "epoch": 1.8844875346260388, + "grad_norm": 0.6689826846122742, + "learning_rate": 3.899357754945089e-06, + "loss": 0.6351, + "step": 6803 + }, + { + "epoch": 1.884764542936288, + "grad_norm": 0.6716510057449341, + "learning_rate": 3.899055869539991e-06, + "loss": 0.5975, + "step": 6804 + }, + { + "epoch": 1.8850415512465375, + "grad_norm": 0.7200577259063721, + "learning_rate": 3.898753954429363e-06, + "loss": 0.6257, + "step": 6805 + }, + { + "epoch": 1.8853185595567867, + "grad_norm": 0.7464181780815125, + "learning_rate": 3.898452009619616e-06, + "loss": 0.6395, + "step": 6806 + }, + { + "epoch": 1.885595567867036, + "grad_norm": 0.6617922186851501, + "learning_rate": 3.8981500351171645e-06, + "loss": 0.5896, + "step": 6807 + }, + { + "epoch": 1.8858725761772854, + "grad_norm": 0.7187881469726562, + "learning_rate": 3.897848030928415e-06, + "loss": 0.6358, + "step": 6808 + }, + { + "epoch": 1.8861495844875347, + "grad_norm": 0.6788371205329895, + "learning_rate": 3.897545997059782e-06, + "loss": 0.5784, + "step": 6809 + }, + { + "epoch": 1.886426592797784, + "grad_norm": 0.7442015409469604, + "learning_rate": 3.897243933517679e-06, + "loss": 0.663, + "step": 6810 + }, + { + "epoch": 1.8867036011080334, + "grad_norm": 0.6612447500228882, + "learning_rate": 3.89694184030852e-06, + "loss": 0.6382, + "step": 6811 + }, + { + "epoch": 1.8869806094182824, + "grad_norm": 0.6605403423309326, + "learning_rate": 3.896639717438717e-06, + "loss": 0.5846, + "step": 6812 + }, + { + "epoch": 1.8872576177285318, + "grad_norm": 0.6534563302993774, + "learning_rate": 3.896337564914687e-06, + "loss": 0.5978, + "step": 6813 + }, + { + "epoch": 1.8875346260387813, + "grad_norm": 0.669242262840271, + "learning_rate": 3.896035382742844e-06, + "loss": 0.6266, + "step": 6814 + }, + { + "epoch": 1.8878116343490303, + "grad_norm": 0.6762076616287231, + "learning_rate": 3.895733170929606e-06, + "loss": 0.628, + "step": 6815 + }, + { + "epoch": 1.8880886426592798, + "grad_norm": 0.6946333646774292, + "learning_rate": 3.8954309294813865e-06, + "loss": 0.6718, + "step": 6816 + }, + { + "epoch": 1.8883656509695292, + "grad_norm": 0.6655415296554565, + "learning_rate": 3.895128658404605e-06, + "loss": 0.5833, + "step": 6817 + }, + { + "epoch": 1.8886426592797783, + "grad_norm": 0.6595086455345154, + "learning_rate": 3.894826357705681e-06, + "loss": 0.5797, + "step": 6818 + }, + { + "epoch": 1.8889196675900277, + "grad_norm": 0.6730854511260986, + "learning_rate": 3.894524027391031e-06, + "loss": 0.6075, + "step": 6819 + }, + { + "epoch": 1.889196675900277, + "grad_norm": 0.6942018866539001, + "learning_rate": 3.894221667467074e-06, + "loss": 0.627, + "step": 6820 + }, + { + "epoch": 1.8894736842105262, + "grad_norm": 0.7450907230377197, + "learning_rate": 3.893919277940231e-06, + "loss": 0.5793, + "step": 6821 + }, + { + "epoch": 1.8897506925207757, + "grad_norm": 0.6789368987083435, + "learning_rate": 3.893616858816921e-06, + "loss": 0.5696, + "step": 6822 + }, + { + "epoch": 1.890027700831025, + "grad_norm": 0.6660521626472473, + "learning_rate": 3.893314410103568e-06, + "loss": 0.5933, + "step": 6823 + }, + { + "epoch": 1.8903047091412741, + "grad_norm": 0.7321332693099976, + "learning_rate": 3.8930119318065905e-06, + "loss": 0.6425, + "step": 6824 + }, + { + "epoch": 1.8905817174515236, + "grad_norm": 0.7297572493553162, + "learning_rate": 3.892709423932412e-06, + "loss": 0.6211, + "step": 6825 + }, + { + "epoch": 1.8908587257617728, + "grad_norm": 0.6849077939987183, + "learning_rate": 3.892406886487457e-06, + "loss": 0.6179, + "step": 6826 + }, + { + "epoch": 1.891135734072022, + "grad_norm": 0.6857331395149231, + "learning_rate": 3.892104319478148e-06, + "loss": 0.6557, + "step": 6827 + }, + { + "epoch": 1.8914127423822715, + "grad_norm": 0.7214446663856506, + "learning_rate": 3.891801722910909e-06, + "loss": 0.6169, + "step": 6828 + }, + { + "epoch": 1.8916897506925208, + "grad_norm": 0.7125465869903564, + "learning_rate": 3.891499096792165e-06, + "loss": 0.5925, + "step": 6829 + }, + { + "epoch": 1.89196675900277, + "grad_norm": 0.7323399782180786, + "learning_rate": 3.891196441128342e-06, + "loss": 0.6084, + "step": 6830 + }, + { + "epoch": 1.8922437673130195, + "grad_norm": 0.7022059559822083, + "learning_rate": 3.890893755925865e-06, + "loss": 0.6626, + "step": 6831 + }, + { + "epoch": 1.8925207756232687, + "grad_norm": 0.7302187085151672, + "learning_rate": 3.890591041191162e-06, + "loss": 0.614, + "step": 6832 + }, + { + "epoch": 1.892797783933518, + "grad_norm": 0.7071456909179688, + "learning_rate": 3.89028829693066e-06, + "loss": 0.6617, + "step": 6833 + }, + { + "epoch": 1.8930747922437674, + "grad_norm": 0.7259766459465027, + "learning_rate": 3.889985523150788e-06, + "loss": 0.633, + "step": 6834 + }, + { + "epoch": 1.8933518005540166, + "grad_norm": 0.7103668451309204, + "learning_rate": 3.889682719857972e-06, + "loss": 0.641, + "step": 6835 + }, + { + "epoch": 1.8936288088642659, + "grad_norm": 0.7698994278907776, + "learning_rate": 3.889379887058644e-06, + "loss": 0.6379, + "step": 6836 + }, + { + "epoch": 1.8939058171745153, + "grad_norm": 0.7132614850997925, + "learning_rate": 3.889077024759233e-06, + "loss": 0.6053, + "step": 6837 + }, + { + "epoch": 1.8941828254847646, + "grad_norm": 0.7450546622276306, + "learning_rate": 3.8887741329661695e-06, + "loss": 0.6214, + "step": 6838 + }, + { + "epoch": 1.8944598337950138, + "grad_norm": 0.6779670715332031, + "learning_rate": 3.888471211685885e-06, + "loss": 0.6601, + "step": 6839 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.6860229969024658, + "learning_rate": 3.888168260924809e-06, + "loss": 0.6084, + "step": 6840 + }, + { + "epoch": 1.8950138504155125, + "grad_norm": 0.7304345369338989, + "learning_rate": 3.887865280689378e-06, + "loss": 0.619, + "step": 6841 + }, + { + "epoch": 1.8952908587257618, + "grad_norm": 0.6954903602600098, + "learning_rate": 3.887562270986021e-06, + "loss": 0.6084, + "step": 6842 + }, + { + "epoch": 1.8955678670360112, + "grad_norm": 0.6936541795730591, + "learning_rate": 3.887259231821176e-06, + "loss": 0.6134, + "step": 6843 + }, + { + "epoch": 1.8958448753462602, + "grad_norm": 0.7285112142562866, + "learning_rate": 3.886956163201272e-06, + "loss": 0.6549, + "step": 6844 + }, + { + "epoch": 1.8961218836565097, + "grad_norm": 0.7493306398391724, + "learning_rate": 3.886653065132749e-06, + "loss": 0.6692, + "step": 6845 + }, + { + "epoch": 1.8963988919667591, + "grad_norm": 0.7186040878295898, + "learning_rate": 3.886349937622039e-06, + "loss": 0.6429, + "step": 6846 + }, + { + "epoch": 1.8966759002770082, + "grad_norm": 0.6854330897331238, + "learning_rate": 3.8860467806755795e-06, + "loss": 0.6766, + "step": 6847 + }, + { + "epoch": 1.8969529085872576, + "grad_norm": 0.684589684009552, + "learning_rate": 3.885743594299808e-06, + "loss": 0.6185, + "step": 6848 + }, + { + "epoch": 1.897229916897507, + "grad_norm": 0.6987202167510986, + "learning_rate": 3.88544037850116e-06, + "loss": 0.5872, + "step": 6849 + }, + { + "epoch": 1.897506925207756, + "grad_norm": 0.6784823536872864, + "learning_rate": 3.885137133286076e-06, + "loss": 0.6039, + "step": 6850 + }, + { + "epoch": 1.8977839335180056, + "grad_norm": 0.6691726446151733, + "learning_rate": 3.8848338586609934e-06, + "loss": 0.6003, + "step": 6851 + }, + { + "epoch": 1.8980609418282548, + "grad_norm": 0.6847113966941833, + "learning_rate": 3.884530554632351e-06, + "loss": 0.6125, + "step": 6852 + }, + { + "epoch": 1.898337950138504, + "grad_norm": 0.667030394077301, + "learning_rate": 3.884227221206589e-06, + "loss": 0.5736, + "step": 6853 + }, + { + "epoch": 1.8986149584487535, + "grad_norm": 0.7349780201911926, + "learning_rate": 3.883923858390149e-06, + "loss": 0.6526, + "step": 6854 + }, + { + "epoch": 1.8988919667590027, + "grad_norm": 0.6886972188949585, + "learning_rate": 3.88362046618947e-06, + "loss": 0.6634, + "step": 6855 + }, + { + "epoch": 1.899168975069252, + "grad_norm": 0.7415412664413452, + "learning_rate": 3.883317044610996e-06, + "loss": 0.6238, + "step": 6856 + }, + { + "epoch": 1.8994459833795014, + "grad_norm": 0.7218773365020752, + "learning_rate": 3.883013593661168e-06, + "loss": 0.6392, + "step": 6857 + }, + { + "epoch": 1.8997229916897507, + "grad_norm": 0.741144061088562, + "learning_rate": 3.88271011334643e-06, + "loss": 0.6259, + "step": 6858 + }, + { + "epoch": 1.9, + "grad_norm": 0.7077538371086121, + "learning_rate": 3.882406603673226e-06, + "loss": 0.6401, + "step": 6859 + }, + { + "epoch": 1.9002770083102494, + "grad_norm": 0.6991947293281555, + "learning_rate": 3.882103064647999e-06, + "loss": 0.61, + "step": 6860 + }, + { + "epoch": 1.9005540166204986, + "grad_norm": 0.7192123532295227, + "learning_rate": 3.881799496277194e-06, + "loss": 0.6193, + "step": 6861 + }, + { + "epoch": 1.9008310249307478, + "grad_norm": 0.7007842659950256, + "learning_rate": 3.8814958985672564e-06, + "loss": 0.6135, + "step": 6862 + }, + { + "epoch": 1.9011080332409973, + "grad_norm": 0.7086727619171143, + "learning_rate": 3.881192271524634e-06, + "loss": 0.6553, + "step": 6863 + }, + { + "epoch": 1.9013850415512465, + "grad_norm": 0.7087380290031433, + "learning_rate": 3.880888615155772e-06, + "loss": 0.6604, + "step": 6864 + }, + { + "epoch": 1.9016620498614958, + "grad_norm": 0.693657398223877, + "learning_rate": 3.880584929467119e-06, + "loss": 0.641, + "step": 6865 + }, + { + "epoch": 1.9019390581717452, + "grad_norm": 0.6947059035301208, + "learning_rate": 3.880281214465122e-06, + "loss": 0.6528, + "step": 6866 + }, + { + "epoch": 1.9022160664819945, + "grad_norm": 0.7018727660179138, + "learning_rate": 3.879977470156229e-06, + "loss": 0.6823, + "step": 6867 + }, + { + "epoch": 1.9024930747922437, + "grad_norm": 0.7383748888969421, + "learning_rate": 3.879673696546891e-06, + "loss": 0.6329, + "step": 6868 + }, + { + "epoch": 1.9027700831024932, + "grad_norm": 0.7053066492080688, + "learning_rate": 3.8793698936435574e-06, + "loss": 0.5968, + "step": 6869 + }, + { + "epoch": 1.9030470914127424, + "grad_norm": 0.7110251784324646, + "learning_rate": 3.8790660614526776e-06, + "loss": 0.6363, + "step": 6870 + }, + { + "epoch": 1.9033240997229917, + "grad_norm": 0.6861403584480286, + "learning_rate": 3.878762199980705e-06, + "loss": 0.6327, + "step": 6871 + }, + { + "epoch": 1.9036011080332411, + "grad_norm": 0.7011222243309021, + "learning_rate": 3.878458309234089e-06, + "loss": 0.608, + "step": 6872 + }, + { + "epoch": 1.9038781163434904, + "grad_norm": 0.7192626595497131, + "learning_rate": 3.8781543892192825e-06, + "loss": 0.6199, + "step": 6873 + }, + { + "epoch": 1.9041551246537396, + "grad_norm": 0.6861461400985718, + "learning_rate": 3.877850439942739e-06, + "loss": 0.601, + "step": 6874 + }, + { + "epoch": 1.904432132963989, + "grad_norm": 0.6651129722595215, + "learning_rate": 3.877546461410912e-06, + "loss": 0.6312, + "step": 6875 + }, + { + "epoch": 1.904709141274238, + "grad_norm": 0.6886206269264221, + "learning_rate": 3.8772424536302565e-06, + "loss": 0.6376, + "step": 6876 + }, + { + "epoch": 1.9049861495844875, + "grad_norm": 0.7215523719787598, + "learning_rate": 3.876938416607227e-06, + "loss": 0.6144, + "step": 6877 + }, + { + "epoch": 1.905263157894737, + "grad_norm": 0.7637720108032227, + "learning_rate": 3.876634350348278e-06, + "loss": 0.6296, + "step": 6878 + }, + { + "epoch": 1.905540166204986, + "grad_norm": 0.7078676819801331, + "learning_rate": 3.876330254859865e-06, + "loss": 0.6593, + "step": 6879 + }, + { + "epoch": 1.9058171745152355, + "grad_norm": 0.6875743865966797, + "learning_rate": 3.876026130148447e-06, + "loss": 0.5992, + "step": 6880 + }, + { + "epoch": 1.906094182825485, + "grad_norm": 0.6959920525550842, + "learning_rate": 3.87572197622048e-06, + "loss": 0.6229, + "step": 6881 + }, + { + "epoch": 1.906371191135734, + "grad_norm": 0.7063423991203308, + "learning_rate": 3.875417793082423e-06, + "loss": 0.5705, + "step": 6882 + }, + { + "epoch": 1.9066481994459834, + "grad_norm": 0.6862561702728271, + "learning_rate": 3.875113580740732e-06, + "loss": 0.5985, + "step": 6883 + }, + { + "epoch": 1.9069252077562326, + "grad_norm": 0.6900488138198853, + "learning_rate": 3.874809339201871e-06, + "loss": 0.6148, + "step": 6884 + }, + { + "epoch": 1.9072022160664819, + "grad_norm": 0.7383355498313904, + "learning_rate": 3.874505068472294e-06, + "loss": 0.5925, + "step": 6885 + }, + { + "epoch": 1.9074792243767313, + "grad_norm": 0.7217558026313782, + "learning_rate": 3.874200768558466e-06, + "loss": 0.6417, + "step": 6886 + }, + { + "epoch": 1.9077562326869806, + "grad_norm": 0.6995415687561035, + "learning_rate": 3.873896439466845e-06, + "loss": 0.6171, + "step": 6887 + }, + { + "epoch": 1.9080332409972298, + "grad_norm": 0.6916835904121399, + "learning_rate": 3.873592081203895e-06, + "loss": 0.6121, + "step": 6888 + }, + { + "epoch": 1.9083102493074793, + "grad_norm": 0.6758975386619568, + "learning_rate": 3.873287693776077e-06, + "loss": 0.5845, + "step": 6889 + }, + { + "epoch": 1.9085872576177285, + "grad_norm": 0.6973399519920349, + "learning_rate": 3.872983277189854e-06, + "loss": 0.652, + "step": 6890 + }, + { + "epoch": 1.9088642659279778, + "grad_norm": 0.6787953972816467, + "learning_rate": 3.8726788314516915e-06, + "loss": 0.6738, + "step": 6891 + }, + { + "epoch": 1.9091412742382272, + "grad_norm": 0.6787337064743042, + "learning_rate": 3.87237435656805e-06, + "loss": 0.6276, + "step": 6892 + }, + { + "epoch": 1.9094182825484765, + "grad_norm": 0.6608779430389404, + "learning_rate": 3.872069852545397e-06, + "loss": 0.5507, + "step": 6893 + }, + { + "epoch": 1.9096952908587257, + "grad_norm": 0.7295418381690979, + "learning_rate": 3.871765319390197e-06, + "loss": 0.618, + "step": 6894 + }, + { + "epoch": 1.9099722991689752, + "grad_norm": 0.6533291935920715, + "learning_rate": 3.8714607571089165e-06, + "loss": 0.6097, + "step": 6895 + }, + { + "epoch": 1.9102493074792244, + "grad_norm": 0.6750076413154602, + "learning_rate": 3.871156165708021e-06, + "loss": 0.6211, + "step": 6896 + }, + { + "epoch": 1.9105263157894736, + "grad_norm": 0.7045161724090576, + "learning_rate": 3.870851545193979e-06, + "loss": 0.6372, + "step": 6897 + }, + { + "epoch": 1.910803324099723, + "grad_norm": 0.7043505907058716, + "learning_rate": 3.870546895573258e-06, + "loss": 0.6611, + "step": 6898 + }, + { + "epoch": 1.9110803324099723, + "grad_norm": 0.7087211012840271, + "learning_rate": 3.8702422168523265e-06, + "loss": 0.6516, + "step": 6899 + }, + { + "epoch": 1.9113573407202216, + "grad_norm": 0.6823852062225342, + "learning_rate": 3.869937509037654e-06, + "loss": 0.6354, + "step": 6900 + }, + { + "epoch": 1.911634349030471, + "grad_norm": 0.7137317657470703, + "learning_rate": 3.869632772135708e-06, + "loss": 0.6512, + "step": 6901 + }, + { + "epoch": 1.9119113573407203, + "grad_norm": 0.6764719486236572, + "learning_rate": 3.869328006152963e-06, + "loss": 0.6574, + "step": 6902 + }, + { + "epoch": 1.9121883656509695, + "grad_norm": 0.7081059217453003, + "learning_rate": 3.869023211095886e-06, + "loss": 0.612, + "step": 6903 + }, + { + "epoch": 1.912465373961219, + "grad_norm": 0.7032474279403687, + "learning_rate": 3.868718386970951e-06, + "loss": 0.6544, + "step": 6904 + }, + { + "epoch": 1.912742382271468, + "grad_norm": 0.7301380038261414, + "learning_rate": 3.868413533784629e-06, + "loss": 0.6419, + "step": 6905 + }, + { + "epoch": 1.9130193905817174, + "grad_norm": 0.7018072009086609, + "learning_rate": 3.8681086515433916e-06, + "loss": 0.6219, + "step": 6906 + }, + { + "epoch": 1.913296398891967, + "grad_norm": 0.7180678248405457, + "learning_rate": 3.867803740253715e-06, + "loss": 0.6432, + "step": 6907 + }, + { + "epoch": 1.913573407202216, + "grad_norm": 0.7180802226066589, + "learning_rate": 3.867498799922073e-06, + "loss": 0.646, + "step": 6908 + }, + { + "epoch": 1.9138504155124654, + "grad_norm": 0.654554009437561, + "learning_rate": 3.867193830554938e-06, + "loss": 0.612, + "step": 6909 + }, + { + "epoch": 1.9141274238227148, + "grad_norm": 0.6916167736053467, + "learning_rate": 3.866888832158787e-06, + "loss": 0.6228, + "step": 6910 + }, + { + "epoch": 1.9144044321329639, + "grad_norm": 0.7020214796066284, + "learning_rate": 3.866583804740095e-06, + "loss": 0.6184, + "step": 6911 + }, + { + "epoch": 1.9146814404432133, + "grad_norm": 0.7106627821922302, + "learning_rate": 3.866278748305339e-06, + "loss": 0.6465, + "step": 6912 + }, + { + "epoch": 1.9149584487534625, + "grad_norm": 0.6954221725463867, + "learning_rate": 3.865973662860997e-06, + "loss": 0.6271, + "step": 6913 + }, + { + "epoch": 1.9152354570637118, + "grad_norm": 0.6910799741744995, + "learning_rate": 3.865668548413545e-06, + "loss": 0.5818, + "step": 6914 + }, + { + "epoch": 1.9155124653739612, + "grad_norm": 0.6691675782203674, + "learning_rate": 3.865363404969462e-06, + "loss": 0.5956, + "step": 6915 + }, + { + "epoch": 1.9157894736842105, + "grad_norm": 0.7135936617851257, + "learning_rate": 3.865058232535228e-06, + "loss": 0.633, + "step": 6916 + }, + { + "epoch": 1.9160664819944597, + "grad_norm": 0.7141229510307312, + "learning_rate": 3.864753031117321e-06, + "loss": 0.6207, + "step": 6917 + }, + { + "epoch": 1.9163434903047092, + "grad_norm": 0.7615339756011963, + "learning_rate": 3.864447800722222e-06, + "loss": 0.6407, + "step": 6918 + }, + { + "epoch": 1.9166204986149584, + "grad_norm": 0.6961625814437866, + "learning_rate": 3.864142541356411e-06, + "loss": 0.618, + "step": 6919 + }, + { + "epoch": 1.9168975069252077, + "grad_norm": 0.6443533301353455, + "learning_rate": 3.863837253026372e-06, + "loss": 0.547, + "step": 6920 + }, + { + "epoch": 1.9171745152354571, + "grad_norm": 0.700282096862793, + "learning_rate": 3.863531935738585e-06, + "loss": 0.678, + "step": 6921 + }, + { + "epoch": 1.9174515235457064, + "grad_norm": 0.7155157923698425, + "learning_rate": 3.863226589499531e-06, + "loss": 0.6291, + "step": 6922 + }, + { + "epoch": 1.9177285318559556, + "grad_norm": 0.7124128341674805, + "learning_rate": 3.862921214315698e-06, + "loss": 0.6286, + "step": 6923 + }, + { + "epoch": 1.918005540166205, + "grad_norm": 0.7020770907402039, + "learning_rate": 3.862615810193565e-06, + "loss": 0.6362, + "step": 6924 + }, + { + "epoch": 1.9182825484764543, + "grad_norm": 0.703712522983551, + "learning_rate": 3.86231037713962e-06, + "loss": 0.6393, + "step": 6925 + }, + { + "epoch": 1.9185595567867035, + "grad_norm": 0.6949636340141296, + "learning_rate": 3.862004915160346e-06, + "loss": 0.645, + "step": 6926 + }, + { + "epoch": 1.918836565096953, + "grad_norm": 0.682137668132782, + "learning_rate": 3.861699424262229e-06, + "loss": 0.6339, + "step": 6927 + }, + { + "epoch": 1.9191135734072022, + "grad_norm": 0.7261094450950623, + "learning_rate": 3.861393904451756e-06, + "loss": 0.6081, + "step": 6928 + }, + { + "epoch": 1.9193905817174515, + "grad_norm": 0.7021178007125854, + "learning_rate": 3.861088355735414e-06, + "loss": 0.6645, + "step": 6929 + }, + { + "epoch": 1.919667590027701, + "grad_norm": 0.6526563763618469, + "learning_rate": 3.86078277811969e-06, + "loss": 0.5589, + "step": 6930 + }, + { + "epoch": 1.9199445983379502, + "grad_norm": 0.7094777226448059, + "learning_rate": 3.860477171611073e-06, + "loss": 0.6477, + "step": 6931 + }, + { + "epoch": 1.9202216066481994, + "grad_norm": 0.705771803855896, + "learning_rate": 3.860171536216052e-06, + "loss": 0.6519, + "step": 6932 + }, + { + "epoch": 1.9204986149584489, + "grad_norm": 0.6879554390907288, + "learning_rate": 3.859865871941115e-06, + "loss": 0.6272, + "step": 6933 + }, + { + "epoch": 1.920775623268698, + "grad_norm": 0.7064213156700134, + "learning_rate": 3.859560178792753e-06, + "loss": 0.6137, + "step": 6934 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.694358229637146, + "learning_rate": 3.8592544567774556e-06, + "loss": 0.644, + "step": 6935 + }, + { + "epoch": 1.9213296398891968, + "grad_norm": 0.6568576097488403, + "learning_rate": 3.858948705901716e-06, + "loss": 0.5892, + "step": 6936 + }, + { + "epoch": 1.9216066481994458, + "grad_norm": 0.6796807050704956, + "learning_rate": 3.858642926172025e-06, + "loss": 0.574, + "step": 6937 + }, + { + "epoch": 1.9218836565096953, + "grad_norm": 0.66985684633255, + "learning_rate": 3.8583371175948745e-06, + "loss": 0.6177, + "step": 6938 + }, + { + "epoch": 1.9221606648199447, + "grad_norm": 0.7518061399459839, + "learning_rate": 3.858031280176759e-06, + "loss": 0.6234, + "step": 6939 + }, + { + "epoch": 1.9224376731301938, + "grad_norm": 0.6907498240470886, + "learning_rate": 3.857725413924171e-06, + "loss": 0.6326, + "step": 6940 + }, + { + "epoch": 1.9227146814404432, + "grad_norm": 0.6684120893478394, + "learning_rate": 3.8574195188436046e-06, + "loss": 0.6297, + "step": 6941 + }, + { + "epoch": 1.9229916897506927, + "grad_norm": 0.6897550225257874, + "learning_rate": 3.857113594941556e-06, + "loss": 0.6339, + "step": 6942 + }, + { + "epoch": 1.9232686980609417, + "grad_norm": 0.7206911444664001, + "learning_rate": 3.85680764222452e-06, + "loss": 0.6439, + "step": 6943 + }, + { + "epoch": 1.9235457063711912, + "grad_norm": 0.7342230677604675, + "learning_rate": 3.856501660698993e-06, + "loss": 0.6193, + "step": 6944 + }, + { + "epoch": 1.9238227146814404, + "grad_norm": 0.6811208724975586, + "learning_rate": 3.856195650371471e-06, + "loss": 0.5903, + "step": 6945 + }, + { + "epoch": 1.9240997229916896, + "grad_norm": 0.6885085701942444, + "learning_rate": 3.855889611248452e-06, + "loss": 0.6008, + "step": 6946 + }, + { + "epoch": 1.924376731301939, + "grad_norm": 0.7090029716491699, + "learning_rate": 3.855583543336436e-06, + "loss": 0.6232, + "step": 6947 + }, + { + "epoch": 1.9246537396121883, + "grad_norm": 0.7263821959495544, + "learning_rate": 3.855277446641917e-06, + "loss": 0.6485, + "step": 6948 + }, + { + "epoch": 1.9249307479224376, + "grad_norm": 0.7115022540092468, + "learning_rate": 3.854971321171398e-06, + "loss": 0.6102, + "step": 6949 + }, + { + "epoch": 1.925207756232687, + "grad_norm": 0.6860071420669556, + "learning_rate": 3.854665166931377e-06, + "loss": 0.5767, + "step": 6950 + }, + { + "epoch": 1.9254847645429363, + "grad_norm": 0.6789618730545044, + "learning_rate": 3.854358983928356e-06, + "loss": 0.6297, + "step": 6951 + }, + { + "epoch": 1.9257617728531855, + "grad_norm": 0.688520073890686, + "learning_rate": 3.854052772168834e-06, + "loss": 0.6187, + "step": 6952 + }, + { + "epoch": 1.926038781163435, + "grad_norm": 0.7230477333068848, + "learning_rate": 3.853746531659315e-06, + "loss": 0.6277, + "step": 6953 + }, + { + "epoch": 1.9263157894736842, + "grad_norm": 0.687503457069397, + "learning_rate": 3.853440262406299e-06, + "loss": 0.6624, + "step": 6954 + }, + { + "epoch": 1.9265927977839334, + "grad_norm": 0.6723766922950745, + "learning_rate": 3.853133964416291e-06, + "loss": 0.574, + "step": 6955 + }, + { + "epoch": 1.926869806094183, + "grad_norm": 0.7176601886749268, + "learning_rate": 3.852827637695793e-06, + "loss": 0.6213, + "step": 6956 + }, + { + "epoch": 1.9271468144044321, + "grad_norm": 0.6974349021911621, + "learning_rate": 3.852521282251309e-06, + "loss": 0.6563, + "step": 6957 + }, + { + "epoch": 1.9274238227146814, + "grad_norm": 0.6568154692649841, + "learning_rate": 3.852214898089344e-06, + "loss": 0.6399, + "step": 6958 + }, + { + "epoch": 1.9277008310249308, + "grad_norm": 0.688548743724823, + "learning_rate": 3.851908485216404e-06, + "loss": 0.6376, + "step": 6959 + }, + { + "epoch": 1.92797783933518, + "grad_norm": 0.7333968281745911, + "learning_rate": 3.8516020436389945e-06, + "loss": 0.6203, + "step": 6960 + }, + { + "epoch": 1.9282548476454293, + "grad_norm": 0.6940326690673828, + "learning_rate": 3.851295573363622e-06, + "loss": 0.605, + "step": 6961 + }, + { + "epoch": 1.9285318559556788, + "grad_norm": 0.7149107456207275, + "learning_rate": 3.8509890743967935e-06, + "loss": 0.6015, + "step": 6962 + }, + { + "epoch": 1.928808864265928, + "grad_norm": 0.7581908106803894, + "learning_rate": 3.850682546745017e-06, + "loss": 0.6426, + "step": 6963 + }, + { + "epoch": 1.9290858725761773, + "grad_norm": 0.6629204750061035, + "learning_rate": 3.8503759904148005e-06, + "loss": 0.5729, + "step": 6964 + }, + { + "epoch": 1.9293628808864267, + "grad_norm": 0.6844896674156189, + "learning_rate": 3.850069405412654e-06, + "loss": 0.6621, + "step": 6965 + }, + { + "epoch": 1.929639889196676, + "grad_norm": 0.725421130657196, + "learning_rate": 3.849762791745085e-06, + "loss": 0.6268, + "step": 6966 + }, + { + "epoch": 1.9299168975069252, + "grad_norm": 0.7454162836074829, + "learning_rate": 3.8494561494186064e-06, + "loss": 0.6373, + "step": 6967 + }, + { + "epoch": 1.9301939058171746, + "grad_norm": 0.7131611108779907, + "learning_rate": 3.849149478439727e-06, + "loss": 0.6257, + "step": 6968 + }, + { + "epoch": 1.9304709141274237, + "grad_norm": 0.7022071480751038, + "learning_rate": 3.848842778814959e-06, + "loss": 0.6146, + "step": 6969 + }, + { + "epoch": 1.9307479224376731, + "grad_norm": 0.699440598487854, + "learning_rate": 3.848536050550814e-06, + "loss": 0.5895, + "step": 6970 + }, + { + "epoch": 1.9310249307479226, + "grad_norm": 0.6625146865844727, + "learning_rate": 3.8482292936538045e-06, + "loss": 0.5855, + "step": 6971 + }, + { + "epoch": 1.9313019390581716, + "grad_norm": 0.7169764041900635, + "learning_rate": 3.847922508130445e-06, + "loss": 0.6225, + "step": 6972 + }, + { + "epoch": 1.931578947368421, + "grad_norm": 0.6792388558387756, + "learning_rate": 3.847615693987248e-06, + "loss": 0.6174, + "step": 6973 + }, + { + "epoch": 1.9318559556786705, + "grad_norm": 0.7312890887260437, + "learning_rate": 3.847308851230728e-06, + "loss": 0.6168, + "step": 6974 + }, + { + "epoch": 1.9321329639889195, + "grad_norm": 0.6699267625808716, + "learning_rate": 3.847001979867401e-06, + "loss": 0.5454, + "step": 6975 + }, + { + "epoch": 1.932409972299169, + "grad_norm": 0.6831119060516357, + "learning_rate": 3.846695079903782e-06, + "loss": 0.5931, + "step": 6976 + }, + { + "epoch": 1.9326869806094182, + "grad_norm": 0.6708670854568481, + "learning_rate": 3.846388151346387e-06, + "loss": 0.5962, + "step": 6977 + }, + { + "epoch": 1.9329639889196675, + "grad_norm": 0.6922425627708435, + "learning_rate": 3.846081194201734e-06, + "loss": 0.6173, + "step": 6978 + }, + { + "epoch": 1.933240997229917, + "grad_norm": 0.6881274580955505, + "learning_rate": 3.845774208476339e-06, + "loss": 0.5995, + "step": 6979 + }, + { + "epoch": 1.9335180055401662, + "grad_norm": 0.6586433053016663, + "learning_rate": 3.845467194176721e-06, + "loss": 0.5384, + "step": 6980 + }, + { + "epoch": 1.9337950138504154, + "grad_norm": 0.7081720232963562, + "learning_rate": 3.845160151309399e-06, + "loss": 0.6032, + "step": 6981 + }, + { + "epoch": 1.9340720221606649, + "grad_norm": 0.681023120880127, + "learning_rate": 3.8448530798808905e-06, + "loss": 0.5974, + "step": 6982 + }, + { + "epoch": 1.934349030470914, + "grad_norm": 0.6930094361305237, + "learning_rate": 3.844545979897719e-06, + "loss": 0.6471, + "step": 6983 + }, + { + "epoch": 1.9346260387811633, + "grad_norm": 0.6955491900444031, + "learning_rate": 3.844238851366401e-06, + "loss": 0.59, + "step": 6984 + }, + { + "epoch": 1.9349030470914128, + "grad_norm": 0.6596637964248657, + "learning_rate": 3.843931694293459e-06, + "loss": 0.6269, + "step": 6985 + }, + { + "epoch": 1.935180055401662, + "grad_norm": 0.7362352013587952, + "learning_rate": 3.843624508685416e-06, + "loss": 0.6512, + "step": 6986 + }, + { + "epoch": 1.9354570637119113, + "grad_norm": 0.6971285343170166, + "learning_rate": 3.843317294548793e-06, + "loss": 0.618, + "step": 6987 + }, + { + "epoch": 1.9357340720221607, + "grad_norm": 0.7472339868545532, + "learning_rate": 3.843010051890114e-06, + "loss": 0.6397, + "step": 6988 + }, + { + "epoch": 1.93601108033241, + "grad_norm": 0.6800471544265747, + "learning_rate": 3.842702780715902e-06, + "loss": 0.5976, + "step": 6989 + }, + { + "epoch": 1.9362880886426592, + "grad_norm": 0.7063587307929993, + "learning_rate": 3.842395481032681e-06, + "loss": 0.6294, + "step": 6990 + }, + { + "epoch": 1.9365650969529087, + "grad_norm": 0.7280341386795044, + "learning_rate": 3.842088152846975e-06, + "loss": 0.6331, + "step": 6991 + }, + { + "epoch": 1.936842105263158, + "grad_norm": 0.6882317066192627, + "learning_rate": 3.841780796165312e-06, + "loss": 0.6022, + "step": 6992 + }, + { + "epoch": 1.9371191135734072, + "grad_norm": 0.6916040182113647, + "learning_rate": 3.8414734109942145e-06, + "loss": 0.6238, + "step": 6993 + }, + { + "epoch": 1.9373961218836566, + "grad_norm": 0.7397603988647461, + "learning_rate": 3.841165997340211e-06, + "loss": 0.6635, + "step": 6994 + }, + { + "epoch": 1.9376731301939059, + "grad_norm": 0.6798705458641052, + "learning_rate": 3.840858555209828e-06, + "loss": 0.6374, + "step": 6995 + }, + { + "epoch": 1.937950138504155, + "grad_norm": 0.7833229899406433, + "learning_rate": 3.8405510846095945e-06, + "loss": 0.6667, + "step": 6996 + }, + { + "epoch": 1.9382271468144046, + "grad_norm": 0.7077188491821289, + "learning_rate": 3.840243585546037e-06, + "loss": 0.5957, + "step": 6997 + }, + { + "epoch": 1.9385041551246538, + "grad_norm": 0.6946340799331665, + "learning_rate": 3.8399360580256876e-06, + "loss": 0.6245, + "step": 6998 + }, + { + "epoch": 1.938781163434903, + "grad_norm": 0.6952667832374573, + "learning_rate": 3.839628502055072e-06, + "loss": 0.6248, + "step": 6999 + }, + { + "epoch": 1.9390581717451525, + "grad_norm": 0.6928011775016785, + "learning_rate": 3.839320917640722e-06, + "loss": 0.6322, + "step": 7000 + }, + { + "epoch": 1.9393351800554015, + "grad_norm": 0.6780809164047241, + "learning_rate": 3.83901330478917e-06, + "loss": 0.5967, + "step": 7001 + }, + { + "epoch": 1.939612188365651, + "grad_norm": 0.6768255829811096, + "learning_rate": 3.838705663506947e-06, + "loss": 0.6307, + "step": 7002 + }, + { + "epoch": 1.9398891966759004, + "grad_norm": 0.6668100357055664, + "learning_rate": 3.838397993800582e-06, + "loss": 0.626, + "step": 7003 + }, + { + "epoch": 1.9401662049861494, + "grad_norm": 0.6606988906860352, + "learning_rate": 3.83809029567661e-06, + "loss": 0.5936, + "step": 7004 + }, + { + "epoch": 1.940443213296399, + "grad_norm": 0.7164947986602783, + "learning_rate": 3.837782569141565e-06, + "loss": 0.581, + "step": 7005 + }, + { + "epoch": 1.9407202216066484, + "grad_norm": 0.7346347570419312, + "learning_rate": 3.837474814201979e-06, + "loss": 0.6123, + "step": 7006 + }, + { + "epoch": 1.9409972299168974, + "grad_norm": 0.7159395813941956, + "learning_rate": 3.837167030864387e-06, + "loss": 0.6525, + "step": 7007 + }, + { + "epoch": 1.9412742382271468, + "grad_norm": 0.7122390866279602, + "learning_rate": 3.8368592191353246e-06, + "loss": 0.6176, + "step": 7008 + }, + { + "epoch": 1.941551246537396, + "grad_norm": 0.7087534666061401, + "learning_rate": 3.836551379021326e-06, + "loss": 0.6226, + "step": 7009 + }, + { + "epoch": 1.9418282548476453, + "grad_norm": 0.7045443058013916, + "learning_rate": 3.83624351052893e-06, + "loss": 0.6429, + "step": 7010 + }, + { + "epoch": 1.9421052631578948, + "grad_norm": 0.6841949820518494, + "learning_rate": 3.835935613664671e-06, + "loss": 0.6247, + "step": 7011 + }, + { + "epoch": 1.942382271468144, + "grad_norm": 0.7041621208190918, + "learning_rate": 3.835627688435088e-06, + "loss": 0.6402, + "step": 7012 + }, + { + "epoch": 1.9426592797783933, + "grad_norm": 0.7085411548614502, + "learning_rate": 3.835319734846717e-06, + "loss": 0.6208, + "step": 7013 + }, + { + "epoch": 1.9429362880886427, + "grad_norm": 0.7151727676391602, + "learning_rate": 3.835011752906099e-06, + "loss": 0.6507, + "step": 7014 + }, + { + "epoch": 1.943213296398892, + "grad_norm": 0.6990612149238586, + "learning_rate": 3.834703742619772e-06, + "loss": 0.5722, + "step": 7015 + }, + { + "epoch": 1.9434903047091412, + "grad_norm": 0.728863000869751, + "learning_rate": 3.8343957039942755e-06, + "loss": 0.6511, + "step": 7016 + }, + { + "epoch": 1.9437673130193907, + "grad_norm": 0.681682288646698, + "learning_rate": 3.834087637036152e-06, + "loss": 0.5959, + "step": 7017 + }, + { + "epoch": 1.94404432132964, + "grad_norm": 0.7117037177085876, + "learning_rate": 3.833779541751941e-06, + "loss": 0.6421, + "step": 7018 + }, + { + "epoch": 1.9443213296398891, + "grad_norm": 0.6765846610069275, + "learning_rate": 3.833471418148183e-06, + "loss": 0.5888, + "step": 7019 + }, + { + "epoch": 1.9445983379501386, + "grad_norm": 0.7212615013122559, + "learning_rate": 3.833163266231421e-06, + "loss": 0.6119, + "step": 7020 + }, + { + "epoch": 1.9448753462603878, + "grad_norm": 0.6902775168418884, + "learning_rate": 3.832855086008199e-06, + "loss": 0.6158, + "step": 7021 + }, + { + "epoch": 1.945152354570637, + "grad_norm": 0.7205889821052551, + "learning_rate": 3.832546877485061e-06, + "loss": 0.6345, + "step": 7022 + }, + { + "epoch": 1.9454293628808865, + "grad_norm": 0.6866292357444763, + "learning_rate": 3.832238640668549e-06, + "loss": 0.6169, + "step": 7023 + }, + { + "epoch": 1.9457063711911358, + "grad_norm": 0.7034695148468018, + "learning_rate": 3.8319303755652084e-06, + "loss": 0.6795, + "step": 7024 + }, + { + "epoch": 1.945983379501385, + "grad_norm": 0.7042098045349121, + "learning_rate": 3.8316220821815845e-06, + "loss": 0.629, + "step": 7025 + }, + { + "epoch": 1.9462603878116345, + "grad_norm": 0.685293972492218, + "learning_rate": 3.831313760524223e-06, + "loss": 0.6395, + "step": 7026 + }, + { + "epoch": 1.9465373961218837, + "grad_norm": 0.6994853019714355, + "learning_rate": 3.831005410599671e-06, + "loss": 0.5892, + "step": 7027 + }, + { + "epoch": 1.946814404432133, + "grad_norm": 0.714125394821167, + "learning_rate": 3.830697032414474e-06, + "loss": 0.6438, + "step": 7028 + }, + { + "epoch": 1.9470914127423824, + "grad_norm": 0.6867514848709106, + "learning_rate": 3.830388625975182e-06, + "loss": 0.6684, + "step": 7029 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.7274505496025085, + "learning_rate": 3.830080191288342e-06, + "loss": 0.6044, + "step": 7030 + }, + { + "epoch": 1.9476454293628809, + "grad_norm": 0.7258849143981934, + "learning_rate": 3.829771728360502e-06, + "loss": 0.5791, + "step": 7031 + }, + { + "epoch": 1.9479224376731303, + "grad_norm": 0.786751925945282, + "learning_rate": 3.829463237198213e-06, + "loss": 0.6339, + "step": 7032 + }, + { + "epoch": 1.9481994459833794, + "grad_norm": 0.7100234627723694, + "learning_rate": 3.829154717808025e-06, + "loss": 0.6027, + "step": 7033 + }, + { + "epoch": 1.9484764542936288, + "grad_norm": 0.7123561501502991, + "learning_rate": 3.828846170196487e-06, + "loss": 0.6743, + "step": 7034 + }, + { + "epoch": 1.9487534626038783, + "grad_norm": 0.6572225689888, + "learning_rate": 3.828537594370152e-06, + "loss": 0.6214, + "step": 7035 + }, + { + "epoch": 1.9490304709141273, + "grad_norm": 0.7247850894927979, + "learning_rate": 3.828228990335571e-06, + "loss": 0.634, + "step": 7036 + }, + { + "epoch": 1.9493074792243767, + "grad_norm": 0.6873048543930054, + "learning_rate": 3.827920358099297e-06, + "loss": 0.6139, + "step": 7037 + }, + { + "epoch": 1.949584487534626, + "grad_norm": 0.6822011470794678, + "learning_rate": 3.827611697667881e-06, + "loss": 0.5926, + "step": 7038 + }, + { + "epoch": 1.9498614958448752, + "grad_norm": 0.699113130569458, + "learning_rate": 3.827303009047879e-06, + "loss": 0.5752, + "step": 7039 + }, + { + "epoch": 1.9501385041551247, + "grad_norm": 0.6774667501449585, + "learning_rate": 3.826994292245846e-06, + "loss": 0.6233, + "step": 7040 + }, + { + "epoch": 1.950415512465374, + "grad_norm": 0.6731832027435303, + "learning_rate": 3.826685547268334e-06, + "loss": 0.6007, + "step": 7041 + }, + { + "epoch": 1.9506925207756232, + "grad_norm": 0.6880195736885071, + "learning_rate": 3.8263767741219e-06, + "loss": 0.5603, + "step": 7042 + }, + { + "epoch": 1.9509695290858726, + "grad_norm": 0.6659010052680969, + "learning_rate": 3.8260679728131e-06, + "loss": 0.6244, + "step": 7043 + }, + { + "epoch": 1.9512465373961219, + "grad_norm": 0.6810696125030518, + "learning_rate": 3.825759143348491e-06, + "loss": 0.6181, + "step": 7044 + }, + { + "epoch": 1.951523545706371, + "grad_norm": 0.7221637964248657, + "learning_rate": 3.825450285734629e-06, + "loss": 0.6393, + "step": 7045 + }, + { + "epoch": 1.9518005540166206, + "grad_norm": 0.6795384883880615, + "learning_rate": 3.825141399978073e-06, + "loss": 0.6128, + "step": 7046 + }, + { + "epoch": 1.9520775623268698, + "grad_norm": 0.666467547416687, + "learning_rate": 3.824832486085381e-06, + "loss": 0.6414, + "step": 7047 + }, + { + "epoch": 1.952354570637119, + "grad_norm": 0.6998277306556702, + "learning_rate": 3.824523544063113e-06, + "loss": 0.6423, + "step": 7048 + }, + { + "epoch": 1.9526315789473685, + "grad_norm": 0.7023756504058838, + "learning_rate": 3.824214573917826e-06, + "loss": 0.6358, + "step": 7049 + }, + { + "epoch": 1.9529085872576177, + "grad_norm": 0.6664944887161255, + "learning_rate": 3.8239055756560824e-06, + "loss": 0.6213, + "step": 7050 + }, + { + "epoch": 1.953185595567867, + "grad_norm": 0.7106117606163025, + "learning_rate": 3.823596549284443e-06, + "loss": 0.6264, + "step": 7051 + }, + { + "epoch": 1.9534626038781164, + "grad_norm": 0.673440158367157, + "learning_rate": 3.823287494809469e-06, + "loss": 0.6037, + "step": 7052 + }, + { + "epoch": 1.9537396121883657, + "grad_norm": 0.714493453502655, + "learning_rate": 3.822978412237721e-06, + "loss": 0.6296, + "step": 7053 + }, + { + "epoch": 1.954016620498615, + "grad_norm": 0.7021950483322144, + "learning_rate": 3.822669301575764e-06, + "loss": 0.5957, + "step": 7054 + }, + { + "epoch": 1.9542936288088644, + "grad_norm": 0.6818373799324036, + "learning_rate": 3.822360162830159e-06, + "loss": 0.576, + "step": 7055 + }, + { + "epoch": 1.9545706371191136, + "grad_norm": 0.6853843331336975, + "learning_rate": 3.8220509960074724e-06, + "loss": 0.6438, + "step": 7056 + }, + { + "epoch": 1.9548476454293628, + "grad_norm": 0.728827178478241, + "learning_rate": 3.821741801114266e-06, + "loss": 0.6795, + "step": 7057 + }, + { + "epoch": 1.9551246537396123, + "grad_norm": 0.7150498628616333, + "learning_rate": 3.821432578157105e-06, + "loss": 0.627, + "step": 7058 + }, + { + "epoch": 1.9554016620498615, + "grad_norm": 0.6961706876754761, + "learning_rate": 3.8211233271425575e-06, + "loss": 0.6272, + "step": 7059 + }, + { + "epoch": 1.9556786703601108, + "grad_norm": 0.7418909072875977, + "learning_rate": 3.820814048077186e-06, + "loss": 0.651, + "step": 7060 + }, + { + "epoch": 1.9559556786703602, + "grad_norm": 0.7025729417800903, + "learning_rate": 3.8205047409675614e-06, + "loss": 0.6511, + "step": 7061 + }, + { + "epoch": 1.9562326869806093, + "grad_norm": 0.7075372338294983, + "learning_rate": 3.820195405820248e-06, + "loss": 0.6339, + "step": 7062 + }, + { + "epoch": 1.9565096952908587, + "grad_norm": 0.7278872728347778, + "learning_rate": 3.819886042641814e-06, + "loss": 0.6402, + "step": 7063 + }, + { + "epoch": 1.9567867036011082, + "grad_norm": 0.7034096121788025, + "learning_rate": 3.819576651438829e-06, + "loss": 0.6072, + "step": 7064 + }, + { + "epoch": 1.9570637119113572, + "grad_norm": 0.6780850887298584, + "learning_rate": 3.8192672322178625e-06, + "loss": 0.6045, + "step": 7065 + }, + { + "epoch": 1.9573407202216067, + "grad_norm": 0.6713101863861084, + "learning_rate": 3.818957784985483e-06, + "loss": 0.6397, + "step": 7066 + }, + { + "epoch": 1.9576177285318561, + "grad_norm": 0.6640244722366333, + "learning_rate": 3.818648309748262e-06, + "loss": 0.5437, + "step": 7067 + }, + { + "epoch": 1.9578947368421051, + "grad_norm": 0.6953721642494202, + "learning_rate": 3.81833880651277e-06, + "loss": 0.6341, + "step": 7068 + }, + { + "epoch": 1.9581717451523546, + "grad_norm": 0.6671434640884399, + "learning_rate": 3.818029275285578e-06, + "loss": 0.5959, + "step": 7069 + }, + { + "epoch": 1.9584487534626038, + "grad_norm": 0.7431301474571228, + "learning_rate": 3.817719716073258e-06, + "loss": 0.5948, + "step": 7070 + }, + { + "epoch": 1.958725761772853, + "grad_norm": 0.677884578704834, + "learning_rate": 3.817410128882384e-06, + "loss": 0.5916, + "step": 7071 + }, + { + "epoch": 1.9590027700831025, + "grad_norm": 0.6750688552856445, + "learning_rate": 3.817100513719529e-06, + "loss": 0.6194, + "step": 7072 + }, + { + "epoch": 1.9592797783933518, + "grad_norm": 0.6718873381614685, + "learning_rate": 3.816790870591267e-06, + "loss": 0.6022, + "step": 7073 + }, + { + "epoch": 1.959556786703601, + "grad_norm": 0.7275941967964172, + "learning_rate": 3.816481199504171e-06, + "loss": 0.599, + "step": 7074 + }, + { + "epoch": 1.9598337950138505, + "grad_norm": 0.730496346950531, + "learning_rate": 3.816171500464818e-06, + "loss": 0.6444, + "step": 7075 + }, + { + "epoch": 1.9601108033240997, + "grad_norm": 0.7540499567985535, + "learning_rate": 3.815861773479782e-06, + "loss": 0.6436, + "step": 7076 + }, + { + "epoch": 1.960387811634349, + "grad_norm": 0.6974166631698608, + "learning_rate": 3.815552018555641e-06, + "loss": 0.5722, + "step": 7077 + }, + { + "epoch": 1.9606648199445984, + "grad_norm": 0.6711729764938354, + "learning_rate": 3.815242235698972e-06, + "loss": 0.6126, + "step": 7078 + }, + { + "epoch": 1.9609418282548476, + "grad_norm": 0.74052494764328, + "learning_rate": 3.814932424916349e-06, + "loss": 0.6235, + "step": 7079 + }, + { + "epoch": 1.9612188365650969, + "grad_norm": 0.692291796207428, + "learning_rate": 3.8146225862143542e-06, + "loss": 0.5607, + "step": 7080 + }, + { + "epoch": 1.9614958448753463, + "grad_norm": 0.7283056378364563, + "learning_rate": 3.8143127195995648e-06, + "loss": 0.6665, + "step": 7081 + }, + { + "epoch": 1.9617728531855956, + "grad_norm": 0.6930385828018188, + "learning_rate": 3.81400282507856e-06, + "loss": 0.5921, + "step": 7082 + }, + { + "epoch": 1.9620498614958448, + "grad_norm": 0.7250744700431824, + "learning_rate": 3.8136929026579188e-06, + "loss": 0.6461, + "step": 7083 + }, + { + "epoch": 1.9623268698060943, + "grad_norm": 0.6802274584770203, + "learning_rate": 3.8133829523442234e-06, + "loss": 0.572, + "step": 7084 + }, + { + "epoch": 1.9626038781163435, + "grad_norm": 0.6919212937355042, + "learning_rate": 3.8130729741440536e-06, + "loss": 0.5794, + "step": 7085 + }, + { + "epoch": 1.9628808864265928, + "grad_norm": 0.7192142605781555, + "learning_rate": 3.812762968063991e-06, + "loss": 0.6523, + "step": 7086 + }, + { + "epoch": 1.9631578947368422, + "grad_norm": 0.689793050289154, + "learning_rate": 3.8124529341106188e-06, + "loss": 0.5646, + "step": 7087 + }, + { + "epoch": 1.9634349030470915, + "grad_norm": 0.6961142420768738, + "learning_rate": 3.8121428722905183e-06, + "loss": 0.5875, + "step": 7088 + }, + { + "epoch": 1.9637119113573407, + "grad_norm": 0.7265340089797974, + "learning_rate": 3.8118327826102747e-06, + "loss": 0.6319, + "step": 7089 + }, + { + "epoch": 1.9639889196675901, + "grad_norm": 0.6598079204559326, + "learning_rate": 3.8115226650764708e-06, + "loss": 0.5717, + "step": 7090 + }, + { + "epoch": 1.9642659279778394, + "grad_norm": 0.7008779644966125, + "learning_rate": 3.8112125196956917e-06, + "loss": 0.6106, + "step": 7091 + }, + { + "epoch": 1.9645429362880886, + "grad_norm": 0.6735950708389282, + "learning_rate": 3.810902346474521e-06, + "loss": 0.6352, + "step": 7092 + }, + { + "epoch": 1.964819944598338, + "grad_norm": 0.7206198573112488, + "learning_rate": 3.810592145419547e-06, + "loss": 0.6003, + "step": 7093 + }, + { + "epoch": 1.965096952908587, + "grad_norm": 0.7294063568115234, + "learning_rate": 3.810281916537355e-06, + "loss": 0.6278, + "step": 7094 + }, + { + "epoch": 1.9653739612188366, + "grad_norm": 0.7309246063232422, + "learning_rate": 3.8099716598345317e-06, + "loss": 0.5992, + "step": 7095 + }, + { + "epoch": 1.965650969529086, + "grad_norm": 0.7022836804389954, + "learning_rate": 3.8096613753176635e-06, + "loss": 0.5967, + "step": 7096 + }, + { + "epoch": 1.965927977839335, + "grad_norm": 0.7502058148384094, + "learning_rate": 3.809351062993342e-06, + "loss": 0.607, + "step": 7097 + }, + { + "epoch": 1.9662049861495845, + "grad_norm": 0.7289679050445557, + "learning_rate": 3.809040722868152e-06, + "loss": 0.589, + "step": 7098 + }, + { + "epoch": 1.966481994459834, + "grad_norm": 0.6707630157470703, + "learning_rate": 3.8087303549486844e-06, + "loss": 0.5773, + "step": 7099 + }, + { + "epoch": 1.966759002770083, + "grad_norm": 0.7455605864524841, + "learning_rate": 3.8084199592415305e-06, + "loss": 0.6105, + "step": 7100 + }, + { + "epoch": 1.9670360110803324, + "grad_norm": 0.7032577395439148, + "learning_rate": 3.8081095357532787e-06, + "loss": 0.6293, + "step": 7101 + }, + { + "epoch": 1.9673130193905817, + "grad_norm": 0.6860726475715637, + "learning_rate": 3.807799084490521e-06, + "loss": 0.6318, + "step": 7102 + }, + { + "epoch": 1.967590027700831, + "grad_norm": 0.7498553991317749, + "learning_rate": 3.807488605459849e-06, + "loss": 0.6472, + "step": 7103 + }, + { + "epoch": 1.9678670360110804, + "grad_norm": 0.7227993607521057, + "learning_rate": 3.807178098667855e-06, + "loss": 0.594, + "step": 7104 + }, + { + "epoch": 1.9681440443213296, + "grad_norm": 0.721440851688385, + "learning_rate": 3.8068675641211316e-06, + "loss": 0.6256, + "step": 7105 + }, + { + "epoch": 1.9684210526315788, + "grad_norm": 0.7207381725311279, + "learning_rate": 3.8065570018262733e-06, + "loss": 0.582, + "step": 7106 + }, + { + "epoch": 1.9686980609418283, + "grad_norm": 0.676328718662262, + "learning_rate": 3.806246411789872e-06, + "loss": 0.6137, + "step": 7107 + }, + { + "epoch": 1.9689750692520775, + "grad_norm": 0.7162710428237915, + "learning_rate": 3.805935794018525e-06, + "loss": 0.6427, + "step": 7108 + }, + { + "epoch": 1.9692520775623268, + "grad_norm": 0.7399466037750244, + "learning_rate": 3.8056251485188244e-06, + "loss": 0.6157, + "step": 7109 + }, + { + "epoch": 1.9695290858725762, + "grad_norm": 0.6657085418701172, + "learning_rate": 3.80531447529737e-06, + "loss": 0.5755, + "step": 7110 + }, + { + "epoch": 1.9698060941828255, + "grad_norm": 0.6843543648719788, + "learning_rate": 3.805003774360754e-06, + "loss": 0.6252, + "step": 7111 + }, + { + "epoch": 1.9700831024930747, + "grad_norm": 0.7047277092933655, + "learning_rate": 3.804693045715576e-06, + "loss": 0.6521, + "step": 7112 + }, + { + "epoch": 1.9703601108033242, + "grad_norm": 0.7122973203659058, + "learning_rate": 3.804382289368432e-06, + "loss": 0.637, + "step": 7113 + }, + { + "epoch": 1.9706371191135734, + "grad_norm": 0.7243662476539612, + "learning_rate": 3.8040715053259213e-06, + "loss": 0.6483, + "step": 7114 + }, + { + "epoch": 1.9709141274238227, + "grad_norm": 0.6891406774520874, + "learning_rate": 3.8037606935946425e-06, + "loss": 0.6481, + "step": 7115 + }, + { + "epoch": 1.9711911357340721, + "grad_norm": 0.6714333295822144, + "learning_rate": 3.803449854181195e-06, + "loss": 0.6116, + "step": 7116 + }, + { + "epoch": 1.9714681440443214, + "grad_norm": 0.7190486788749695, + "learning_rate": 3.8031389870921785e-06, + "loss": 0.6091, + "step": 7117 + }, + { + "epoch": 1.9717451523545706, + "grad_norm": 0.7250654697418213, + "learning_rate": 3.8028280923341927e-06, + "loss": 0.6217, + "step": 7118 + }, + { + "epoch": 1.97202216066482, + "grad_norm": 0.738471508026123, + "learning_rate": 3.802517169913841e-06, + "loss": 0.6285, + "step": 7119 + }, + { + "epoch": 1.9722991689750693, + "grad_norm": 0.6984786987304688, + "learning_rate": 3.802206219837722e-06, + "loss": 0.633, + "step": 7120 + }, + { + "epoch": 1.9725761772853185, + "grad_norm": 0.747661292552948, + "learning_rate": 3.8018952421124393e-06, + "loss": 0.5987, + "step": 7121 + }, + { + "epoch": 1.972853185595568, + "grad_norm": 0.6906194090843201, + "learning_rate": 3.8015842367445967e-06, + "loss": 0.5797, + "step": 7122 + }, + { + "epoch": 1.973130193905817, + "grad_norm": 0.7184000611305237, + "learning_rate": 3.8012732037407975e-06, + "loss": 0.6018, + "step": 7123 + }, + { + "epoch": 1.9734072022160665, + "grad_norm": 0.6846181750297546, + "learning_rate": 3.8009621431076436e-06, + "loss": 0.6387, + "step": 7124 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.699408233165741, + "learning_rate": 3.8006510548517416e-06, + "loss": 0.6615, + "step": 7125 + }, + { + "epoch": 1.973961218836565, + "grad_norm": 0.6856897473335266, + "learning_rate": 3.800339938979697e-06, + "loss": 0.6589, + "step": 7126 + }, + { + "epoch": 1.9742382271468144, + "grad_norm": 0.6751475930213928, + "learning_rate": 3.8000287954981134e-06, + "loss": 0.6341, + "step": 7127 + }, + { + "epoch": 1.9745152354570639, + "grad_norm": 0.7153899669647217, + "learning_rate": 3.799717624413599e-06, + "loss": 0.6402, + "step": 7128 + }, + { + "epoch": 1.9747922437673129, + "grad_norm": 0.6692858934402466, + "learning_rate": 3.7994064257327613e-06, + "loss": 0.6092, + "step": 7129 + }, + { + "epoch": 1.9750692520775623, + "grad_norm": 0.7093313932418823, + "learning_rate": 3.7990951994622062e-06, + "loss": 0.6155, + "step": 7130 + }, + { + "epoch": 1.9753462603878116, + "grad_norm": 0.7412492632865906, + "learning_rate": 3.7987839456085424e-06, + "loss": 0.6334, + "step": 7131 + }, + { + "epoch": 1.9756232686980608, + "grad_norm": 0.721476674079895, + "learning_rate": 3.7984726641783793e-06, + "loss": 0.6307, + "step": 7132 + }, + { + "epoch": 1.9759002770083103, + "grad_norm": 0.6939054727554321, + "learning_rate": 3.7981613551783246e-06, + "loss": 0.5683, + "step": 7133 + }, + { + "epoch": 1.9761772853185595, + "grad_norm": 0.7083145380020142, + "learning_rate": 3.7978500186149902e-06, + "loss": 0.5428, + "step": 7134 + }, + { + "epoch": 1.9764542936288088, + "grad_norm": 0.6860464811325073, + "learning_rate": 3.7975386544949848e-06, + "loss": 0.6463, + "step": 7135 + }, + { + "epoch": 1.9767313019390582, + "grad_norm": 0.6742307543754578, + "learning_rate": 3.7972272628249206e-06, + "loss": 0.606, + "step": 7136 + }, + { + "epoch": 1.9770083102493075, + "grad_norm": 0.6865066289901733, + "learning_rate": 3.7969158436114083e-06, + "loss": 0.667, + "step": 7137 + }, + { + "epoch": 1.9772853185595567, + "grad_norm": 0.716090202331543, + "learning_rate": 3.796604396861061e-06, + "loss": 0.6379, + "step": 7138 + }, + { + "epoch": 1.9775623268698062, + "grad_norm": 0.7939145565032959, + "learning_rate": 3.796292922580492e-06, + "loss": 0.6028, + "step": 7139 + }, + { + "epoch": 1.9778393351800554, + "grad_norm": 0.7300482988357544, + "learning_rate": 3.7959814207763134e-06, + "loss": 0.6127, + "step": 7140 + }, + { + "epoch": 1.9781163434903046, + "grad_norm": 0.6948409080505371, + "learning_rate": 3.795669891455139e-06, + "loss": 0.6754, + "step": 7141 + }, + { + "epoch": 1.978393351800554, + "grad_norm": 0.724706768989563, + "learning_rate": 3.7953583346235843e-06, + "loss": 0.6248, + "step": 7142 + }, + { + "epoch": 1.9786703601108033, + "grad_norm": 0.749442994594574, + "learning_rate": 3.795046750288265e-06, + "loss": 0.6157, + "step": 7143 + }, + { + "epoch": 1.9789473684210526, + "grad_norm": 0.6890953779220581, + "learning_rate": 3.7947351384557953e-06, + "loss": 0.6027, + "step": 7144 + }, + { + "epoch": 1.979224376731302, + "grad_norm": 0.6894163489341736, + "learning_rate": 3.794423499132792e-06, + "loss": 0.6555, + "step": 7145 + }, + { + "epoch": 1.9795013850415513, + "grad_norm": 0.7086030840873718, + "learning_rate": 3.7941118323258717e-06, + "loss": 0.6216, + "step": 7146 + }, + { + "epoch": 1.9797783933518005, + "grad_norm": 0.731605589389801, + "learning_rate": 3.7938001380416536e-06, + "loss": 0.614, + "step": 7147 + }, + { + "epoch": 1.98005540166205, + "grad_norm": 0.7541550397872925, + "learning_rate": 3.7934884162867546e-06, + "loss": 0.616, + "step": 7148 + }, + { + "epoch": 1.9803324099722992, + "grad_norm": 0.710583508014679, + "learning_rate": 3.793176667067793e-06, + "loss": 0.6436, + "step": 7149 + }, + { + "epoch": 1.9806094182825484, + "grad_norm": 0.7192587852478027, + "learning_rate": 3.7928648903913877e-06, + "loss": 0.6342, + "step": 7150 + }, + { + "epoch": 1.980886426592798, + "grad_norm": 0.6838364601135254, + "learning_rate": 3.79255308626416e-06, + "loss": 0.6229, + "step": 7151 + }, + { + "epoch": 1.9811634349030471, + "grad_norm": 0.6993123888969421, + "learning_rate": 3.7922412546927288e-06, + "loss": 0.6653, + "step": 7152 + }, + { + "epoch": 1.9814404432132964, + "grad_norm": 0.6950284838676453, + "learning_rate": 3.791929395683716e-06, + "loss": 0.6516, + "step": 7153 + }, + { + "epoch": 1.9817174515235458, + "grad_norm": 0.7114899754524231, + "learning_rate": 3.7916175092437436e-06, + "loss": 0.6277, + "step": 7154 + }, + { + "epoch": 1.9819944598337949, + "grad_norm": 0.7198684215545654, + "learning_rate": 3.791305595379432e-06, + "loss": 0.5895, + "step": 7155 + }, + { + "epoch": 1.9822714681440443, + "grad_norm": 0.7094388008117676, + "learning_rate": 3.7909936540974052e-06, + "loss": 0.6192, + "step": 7156 + }, + { + "epoch": 1.9825484764542938, + "grad_norm": 0.7088209390640259, + "learning_rate": 3.7906816854042866e-06, + "loss": 0.6554, + "step": 7157 + }, + { + "epoch": 1.9828254847645428, + "grad_norm": 0.7191725969314575, + "learning_rate": 3.7903696893067e-06, + "loss": 0.6065, + "step": 7158 + }, + { + "epoch": 1.9831024930747922, + "grad_norm": 0.6958176493644714, + "learning_rate": 3.7900576658112687e-06, + "loss": 0.5931, + "step": 7159 + }, + { + "epoch": 1.9833795013850417, + "grad_norm": 0.7286674380302429, + "learning_rate": 3.789745614924619e-06, + "loss": 0.5997, + "step": 7160 + }, + { + "epoch": 1.9836565096952907, + "grad_norm": 0.6912921071052551, + "learning_rate": 3.789433536653377e-06, + "loss": 0.5836, + "step": 7161 + }, + { + "epoch": 1.9839335180055402, + "grad_norm": 0.7034099698066711, + "learning_rate": 3.789121431004168e-06, + "loss": 0.6483, + "step": 7162 + }, + { + "epoch": 1.9842105263157894, + "grad_norm": 0.7091851830482483, + "learning_rate": 3.7888092979836176e-06, + "loss": 0.642, + "step": 7163 + }, + { + "epoch": 1.9844875346260387, + "grad_norm": 0.7511205077171326, + "learning_rate": 3.7884971375983563e-06, + "loss": 0.6504, + "step": 7164 + }, + { + "epoch": 1.9847645429362881, + "grad_norm": 0.6957712769508362, + "learning_rate": 3.7881849498550094e-06, + "loss": 0.5966, + "step": 7165 + }, + { + "epoch": 1.9850415512465374, + "grad_norm": 0.7258434891700745, + "learning_rate": 3.7878727347602067e-06, + "loss": 0.6163, + "step": 7166 + }, + { + "epoch": 1.9853185595567866, + "grad_norm": 0.6509557366371155, + "learning_rate": 3.7875604923205768e-06, + "loss": 0.5864, + "step": 7167 + }, + { + "epoch": 1.985595567867036, + "grad_norm": 0.682807445526123, + "learning_rate": 3.7872482225427496e-06, + "loss": 0.6128, + "step": 7168 + }, + { + "epoch": 1.9858725761772853, + "grad_norm": 0.6814695596694946, + "learning_rate": 3.7869359254333555e-06, + "loss": 0.5586, + "step": 7169 + }, + { + "epoch": 1.9861495844875345, + "grad_norm": 0.7576179504394531, + "learning_rate": 3.7866236009990253e-06, + "loss": 0.6315, + "step": 7170 + }, + { + "epoch": 1.986426592797784, + "grad_norm": 0.6948550343513489, + "learning_rate": 3.78631124924639e-06, + "loss": 0.6757, + "step": 7171 + }, + { + "epoch": 1.9867036011080332, + "grad_norm": 0.6997304558753967, + "learning_rate": 3.7859988701820818e-06, + "loss": 0.5999, + "step": 7172 + }, + { + "epoch": 1.9869806094182825, + "grad_norm": 0.7092477083206177, + "learning_rate": 3.7856864638127346e-06, + "loss": 0.595, + "step": 7173 + }, + { + "epoch": 1.987257617728532, + "grad_norm": 0.6907538175582886, + "learning_rate": 3.7853740301449792e-06, + "loss": 0.6108, + "step": 7174 + }, + { + "epoch": 1.9875346260387812, + "grad_norm": 0.6745933294296265, + "learning_rate": 3.7850615691854513e-06, + "loss": 0.6013, + "step": 7175 + }, + { + "epoch": 1.9878116343490304, + "grad_norm": 0.6901183724403381, + "learning_rate": 3.7847490809407848e-06, + "loss": 0.6337, + "step": 7176 + }, + { + "epoch": 1.9880886426592799, + "grad_norm": 0.680350124835968, + "learning_rate": 3.784436565417614e-06, + "loss": 0.6413, + "step": 7177 + }, + { + "epoch": 1.988365650969529, + "grad_norm": 0.6782408952713013, + "learning_rate": 3.784124022622575e-06, + "loss": 0.615, + "step": 7178 + }, + { + "epoch": 1.9886426592797783, + "grad_norm": 0.7361845374107361, + "learning_rate": 3.783811452562304e-06, + "loss": 0.588, + "step": 7179 + }, + { + "epoch": 1.9889196675900278, + "grad_norm": 0.6791418790817261, + "learning_rate": 3.7834988552434365e-06, + "loss": 0.5931, + "step": 7180 + }, + { + "epoch": 1.989196675900277, + "grad_norm": 0.6929588913917542, + "learning_rate": 3.7831862306726114e-06, + "loss": 0.6364, + "step": 7181 + }, + { + "epoch": 1.9894736842105263, + "grad_norm": 0.6642131805419922, + "learning_rate": 3.7828735788564654e-06, + "loss": 0.621, + "step": 7182 + }, + { + "epoch": 1.9897506925207757, + "grad_norm": 0.6879885792732239, + "learning_rate": 3.7825608998016372e-06, + "loss": 0.5924, + "step": 7183 + }, + { + "epoch": 1.990027700831025, + "grad_norm": 0.6691209673881531, + "learning_rate": 3.782248193514766e-06, + "loss": 0.6207, + "step": 7184 + }, + { + "epoch": 1.9903047091412742, + "grad_norm": 0.7109494209289551, + "learning_rate": 3.7819354600024903e-06, + "loss": 0.6008, + "step": 7185 + }, + { + "epoch": 1.9905817174515237, + "grad_norm": 0.7236234545707703, + "learning_rate": 3.7816226992714522e-06, + "loss": 0.638, + "step": 7186 + }, + { + "epoch": 1.9908587257617727, + "grad_norm": 0.7347348928451538, + "learning_rate": 3.7813099113282907e-06, + "loss": 0.6286, + "step": 7187 + }, + { + "epoch": 1.9911357340720222, + "grad_norm": 0.7098999619483948, + "learning_rate": 3.7809970961796484e-06, + "loss": 0.6073, + "step": 7188 + }, + { + "epoch": 1.9914127423822716, + "grad_norm": 0.7165220379829407, + "learning_rate": 3.7806842538321654e-06, + "loss": 0.6491, + "step": 7189 + }, + { + "epoch": 1.9916897506925206, + "grad_norm": 0.6704548597335815, + "learning_rate": 3.780371384292486e-06, + "loss": 0.5961, + "step": 7190 + }, + { + "epoch": 1.99196675900277, + "grad_norm": 0.6767783761024475, + "learning_rate": 3.780058487567252e-06, + "loss": 0.6109, + "step": 7191 + }, + { + "epoch": 1.9922437673130196, + "grad_norm": 0.6728404760360718, + "learning_rate": 3.7797455636631076e-06, + "loss": 0.613, + "step": 7192 + }, + { + "epoch": 1.9925207756232686, + "grad_norm": 0.7355282306671143, + "learning_rate": 3.7794326125866964e-06, + "loss": 0.6164, + "step": 7193 + }, + { + "epoch": 1.992797783933518, + "grad_norm": 0.7722780704498291, + "learning_rate": 3.7791196343446634e-06, + "loss": 0.6603, + "step": 7194 + }, + { + "epoch": 1.9930747922437673, + "grad_norm": 0.6995906233787537, + "learning_rate": 3.778806628943655e-06, + "loss": 0.6094, + "step": 7195 + }, + { + "epoch": 1.9933518005540165, + "grad_norm": 0.7133497595787048, + "learning_rate": 3.7784935963903157e-06, + "loss": 0.5914, + "step": 7196 + }, + { + "epoch": 1.993628808864266, + "grad_norm": 0.6836190223693848, + "learning_rate": 3.7781805366912918e-06, + "loss": 0.6134, + "step": 7197 + }, + { + "epoch": 1.9939058171745152, + "grad_norm": 0.7232199311256409, + "learning_rate": 3.777867449853232e-06, + "loss": 0.611, + "step": 7198 + }, + { + "epoch": 1.9941828254847644, + "grad_norm": 0.7082121968269348, + "learning_rate": 3.777554335882782e-06, + "loss": 0.6699, + "step": 7199 + }, + { + "epoch": 1.994459833795014, + "grad_norm": 0.6821137070655823, + "learning_rate": 3.777241194786591e-06, + "loss": 0.5794, + "step": 7200 + }, + { + "epoch": 1.9947368421052631, + "grad_norm": 0.7654339075088501, + "learning_rate": 3.776928026571308e-06, + "loss": 0.6198, + "step": 7201 + }, + { + "epoch": 1.9950138504155124, + "grad_norm": 0.6920021176338196, + "learning_rate": 3.776614831243582e-06, + "loss": 0.6145, + "step": 7202 + }, + { + "epoch": 1.9952908587257618, + "grad_norm": 0.7104975581169128, + "learning_rate": 3.7763016088100634e-06, + "loss": 0.6675, + "step": 7203 + }, + { + "epoch": 1.995567867036011, + "grad_norm": 0.7255128026008606, + "learning_rate": 3.7759883592774018e-06, + "loss": 0.6338, + "step": 7204 + }, + { + "epoch": 1.9958448753462603, + "grad_norm": 0.7115368843078613, + "learning_rate": 3.77567508265225e-06, + "loss": 0.6179, + "step": 7205 + }, + { + "epoch": 1.9961218836565098, + "grad_norm": 0.6810135245323181, + "learning_rate": 3.775361778941257e-06, + "loss": 0.5555, + "step": 7206 + }, + { + "epoch": 1.996398891966759, + "grad_norm": 0.7034701108932495, + "learning_rate": 3.7750484481510775e-06, + "loss": 0.6148, + "step": 7207 + }, + { + "epoch": 1.9966759002770083, + "grad_norm": 0.722679078578949, + "learning_rate": 3.7747350902883627e-06, + "loss": 0.5851, + "step": 7208 + }, + { + "epoch": 1.9969529085872577, + "grad_norm": 0.686889111995697, + "learning_rate": 3.7744217053597663e-06, + "loss": 0.596, + "step": 7209 + }, + { + "epoch": 1.997229916897507, + "grad_norm": 0.6772050261497498, + "learning_rate": 3.774108293371943e-06, + "loss": 0.6377, + "step": 7210 + }, + { + "epoch": 1.9975069252077562, + "grad_norm": 0.7226324677467346, + "learning_rate": 3.7737948543315473e-06, + "loss": 0.648, + "step": 7211 + }, + { + "epoch": 1.9977839335180057, + "grad_norm": 0.6985737681388855, + "learning_rate": 3.7734813882452336e-06, + "loss": 0.6217, + "step": 7212 + }, + { + "epoch": 1.9980609418282549, + "grad_norm": 0.6721072793006897, + "learning_rate": 3.7731678951196576e-06, + "loss": 0.6291, + "step": 7213 + }, + { + "epoch": 1.9983379501385041, + "grad_norm": 0.6800287961959839, + "learning_rate": 3.7728543749614766e-06, + "loss": 0.6149, + "step": 7214 + }, + { + "epoch": 1.9986149584487536, + "grad_norm": 0.6427153944969177, + "learning_rate": 3.772540827777346e-06, + "loss": 0.5985, + "step": 7215 + }, + { + "epoch": 1.9988919667590028, + "grad_norm": 0.7035733461380005, + "learning_rate": 3.7722272535739247e-06, + "loss": 0.6356, + "step": 7216 + }, + { + "epoch": 1.999168975069252, + "grad_norm": 0.6934232711791992, + "learning_rate": 3.7719136523578687e-06, + "loss": 0.6313, + "step": 7217 + }, + { + "epoch": 1.9994459833795015, + "grad_norm": 0.6813242435455322, + "learning_rate": 3.771600024135839e-06, + "loss": 0.602, + "step": 7218 + }, + { + "epoch": 1.9997229916897505, + "grad_norm": 0.7163416147232056, + "learning_rate": 3.771286368914493e-06, + "loss": 0.6378, + "step": 7219 + }, + { + "epoch": 2.0, + "grad_norm": 0.7049331665039062, + "learning_rate": 3.7709726867004905e-06, + "loss": 0.6352, + "step": 7220 + }, + { + "epoch": 2.0002770083102495, + "grad_norm": 0.6854837536811829, + "learning_rate": 3.7706589775004926e-06, + "loss": 0.6146, + "step": 7221 + }, + { + "epoch": 2.0005540166204985, + "grad_norm": 0.7240393161773682, + "learning_rate": 3.7703452413211596e-06, + "loss": 0.621, + "step": 7222 + }, + { + "epoch": 2.000831024930748, + "grad_norm": 0.6688227653503418, + "learning_rate": 3.770031478169153e-06, + "loss": 0.6052, + "step": 7223 + }, + { + "epoch": 2.0011080332409974, + "grad_norm": 0.6882873773574829, + "learning_rate": 3.7697176880511353e-06, + "loss": 0.5827, + "step": 7224 + }, + { + "epoch": 2.0013850415512464, + "grad_norm": 0.6904376745223999, + "learning_rate": 3.7694038709737674e-06, + "loss": 0.5836, + "step": 7225 + }, + { + "epoch": 2.001662049861496, + "grad_norm": 0.7087603211402893, + "learning_rate": 3.769090026943714e-06, + "loss": 0.5383, + "step": 7226 + }, + { + "epoch": 2.0019390581717453, + "grad_norm": 0.6635582447052002, + "learning_rate": 3.7687761559676397e-06, + "loss": 0.5963, + "step": 7227 + }, + { + "epoch": 2.0022160664819943, + "grad_norm": 0.6937400698661804, + "learning_rate": 3.7684622580522057e-06, + "loss": 0.5906, + "step": 7228 + }, + { + "epoch": 2.002493074792244, + "grad_norm": 0.7310560345649719, + "learning_rate": 3.7681483332040793e-06, + "loss": 0.5661, + "step": 7229 + }, + { + "epoch": 2.0027700831024933, + "grad_norm": 0.7068434357643127, + "learning_rate": 3.767834381429926e-06, + "loss": 0.6078, + "step": 7230 + }, + { + "epoch": 2.0030470914127423, + "grad_norm": 0.7256509065628052, + "learning_rate": 3.7675204027364105e-06, + "loss": 0.6001, + "step": 7231 + }, + { + "epoch": 2.0033240997229917, + "grad_norm": 0.711618959903717, + "learning_rate": 3.7672063971302e-06, + "loss": 0.6469, + "step": 7232 + }, + { + "epoch": 2.003601108033241, + "grad_norm": 0.7369129657745361, + "learning_rate": 3.766892364617961e-06, + "loss": 0.5762, + "step": 7233 + }, + { + "epoch": 2.0038781163434902, + "grad_norm": 0.7153844237327576, + "learning_rate": 3.766578305206363e-06, + "loss": 0.5663, + "step": 7234 + }, + { + "epoch": 2.0041551246537397, + "grad_norm": 0.7051813006401062, + "learning_rate": 3.7662642189020726e-06, + "loss": 0.583, + "step": 7235 + }, + { + "epoch": 2.0044321329639887, + "grad_norm": 0.7093333601951599, + "learning_rate": 3.7659501057117586e-06, + "loss": 0.6124, + "step": 7236 + }, + { + "epoch": 2.004709141274238, + "grad_norm": 0.717320442199707, + "learning_rate": 3.7656359656420915e-06, + "loss": 0.5999, + "step": 7237 + }, + { + "epoch": 2.0049861495844876, + "grad_norm": 0.7107758522033691, + "learning_rate": 3.7653217986997405e-06, + "loss": 0.592, + "step": 7238 + }, + { + "epoch": 2.0052631578947366, + "grad_norm": 0.6741863489151001, + "learning_rate": 3.765007604891377e-06, + "loss": 0.5523, + "step": 7239 + }, + { + "epoch": 2.005540166204986, + "grad_norm": 0.706916868686676, + "learning_rate": 3.764693384223671e-06, + "loss": 0.5652, + "step": 7240 + }, + { + "epoch": 2.0058171745152356, + "grad_norm": 0.740626335144043, + "learning_rate": 3.7643791367032945e-06, + "loss": 0.6423, + "step": 7241 + }, + { + "epoch": 2.0060941828254846, + "grad_norm": 0.649993896484375, + "learning_rate": 3.764064862336921e-06, + "loss": 0.5362, + "step": 7242 + }, + { + "epoch": 2.006371191135734, + "grad_norm": 0.7229255437850952, + "learning_rate": 3.763750561131222e-06, + "loss": 0.5756, + "step": 7243 + }, + { + "epoch": 2.0066481994459835, + "grad_norm": 0.7251301407814026, + "learning_rate": 3.7634362330928707e-06, + "loss": 0.6221, + "step": 7244 + }, + { + "epoch": 2.0069252077562325, + "grad_norm": 0.7756488919258118, + "learning_rate": 3.7631218782285426e-06, + "loss": 0.6022, + "step": 7245 + }, + { + "epoch": 2.007202216066482, + "grad_norm": 0.7517617344856262, + "learning_rate": 3.7628074965449114e-06, + "loss": 0.5917, + "step": 7246 + }, + { + "epoch": 2.0074792243767314, + "grad_norm": 0.7082700729370117, + "learning_rate": 3.7624930880486513e-06, + "loss": 0.5977, + "step": 7247 + }, + { + "epoch": 2.0077562326869804, + "grad_norm": 0.7056994438171387, + "learning_rate": 3.76217865274644e-06, + "loss": 0.5966, + "step": 7248 + }, + { + "epoch": 2.00803324099723, + "grad_norm": 0.685123085975647, + "learning_rate": 3.7618641906449515e-06, + "loss": 0.6168, + "step": 7249 + }, + { + "epoch": 2.0083102493074794, + "grad_norm": 0.7444612979888916, + "learning_rate": 3.761549701750865e-06, + "loss": 0.604, + "step": 7250 + }, + { + "epoch": 2.0085872576177284, + "grad_norm": 0.7341189384460449, + "learning_rate": 3.761235186070855e-06, + "loss": 0.5992, + "step": 7251 + }, + { + "epoch": 2.008864265927978, + "grad_norm": 0.694669246673584, + "learning_rate": 3.760920643611603e-06, + "loss": 0.5974, + "step": 7252 + }, + { + "epoch": 2.0091412742382273, + "grad_norm": 0.7314677834510803, + "learning_rate": 3.7606060743797844e-06, + "loss": 0.6183, + "step": 7253 + }, + { + "epoch": 2.0094182825484763, + "grad_norm": 0.7707086205482483, + "learning_rate": 3.76029147838208e-06, + "loss": 0.5845, + "step": 7254 + }, + { + "epoch": 2.009695290858726, + "grad_norm": 0.7177945375442505, + "learning_rate": 3.759976855625169e-06, + "loss": 0.6028, + "step": 7255 + }, + { + "epoch": 2.0099722991689752, + "grad_norm": 0.6916733384132385, + "learning_rate": 3.759662206115732e-06, + "loss": 0.6221, + "step": 7256 + }, + { + "epoch": 2.0102493074792243, + "grad_norm": 0.6647084951400757, + "learning_rate": 3.7593475298604496e-06, + "loss": 0.583, + "step": 7257 + }, + { + "epoch": 2.0105263157894737, + "grad_norm": 0.7277607917785645, + "learning_rate": 3.7590328268660025e-06, + "loss": 0.6652, + "step": 7258 + }, + { + "epoch": 2.010803324099723, + "grad_norm": 0.7275409698486328, + "learning_rate": 3.7587180971390748e-06, + "loss": 0.6171, + "step": 7259 + }, + { + "epoch": 2.011080332409972, + "grad_norm": 0.7116976976394653, + "learning_rate": 3.758403340686345e-06, + "loss": 0.5722, + "step": 7260 + }, + { + "epoch": 2.0113573407202217, + "grad_norm": 0.6645132899284363, + "learning_rate": 3.758088557514501e-06, + "loss": 0.5321, + "step": 7261 + }, + { + "epoch": 2.011634349030471, + "grad_norm": 0.680085301399231, + "learning_rate": 3.757773747630223e-06, + "loss": 0.5907, + "step": 7262 + }, + { + "epoch": 2.01191135734072, + "grad_norm": 0.6877182126045227, + "learning_rate": 3.7574589110401965e-06, + "loss": 0.5829, + "step": 7263 + }, + { + "epoch": 2.0121883656509696, + "grad_norm": 0.6982957720756531, + "learning_rate": 3.757144047751106e-06, + "loss": 0.5982, + "step": 7264 + }, + { + "epoch": 2.012465373961219, + "grad_norm": 0.6935186982154846, + "learning_rate": 3.756829157769637e-06, + "loss": 0.6095, + "step": 7265 + }, + { + "epoch": 2.012742382271468, + "grad_norm": 0.708821713924408, + "learning_rate": 3.7565142411024756e-06, + "loss": 0.5746, + "step": 7266 + }, + { + "epoch": 2.0130193905817175, + "grad_norm": 0.7320842146873474, + "learning_rate": 3.7561992977563077e-06, + "loss": 0.6074, + "step": 7267 + }, + { + "epoch": 2.0132963988919665, + "grad_norm": 0.6891639232635498, + "learning_rate": 3.755884327737821e-06, + "loss": 0.6069, + "step": 7268 + }, + { + "epoch": 2.013573407202216, + "grad_norm": 0.6643139123916626, + "learning_rate": 3.7555693310537026e-06, + "loss": 0.6329, + "step": 7269 + }, + { + "epoch": 2.0138504155124655, + "grad_norm": 0.6847847700119019, + "learning_rate": 3.7552543077106414e-06, + "loss": 0.6138, + "step": 7270 + }, + { + "epoch": 2.0141274238227145, + "grad_norm": 0.7067223787307739, + "learning_rate": 3.7549392577153255e-06, + "loss": 0.6273, + "step": 7271 + }, + { + "epoch": 2.014404432132964, + "grad_norm": 0.7176226377487183, + "learning_rate": 3.7546241810744444e-06, + "loss": 0.5945, + "step": 7272 + }, + { + "epoch": 2.0146814404432134, + "grad_norm": 0.6766518950462341, + "learning_rate": 3.7543090777946876e-06, + "loss": 0.5691, + "step": 7273 + }, + { + "epoch": 2.0149584487534624, + "grad_norm": 0.7263760566711426, + "learning_rate": 3.7539939478827457e-06, + "loss": 0.6014, + "step": 7274 + }, + { + "epoch": 2.015235457063712, + "grad_norm": 0.6988743543624878, + "learning_rate": 3.753678791345311e-06, + "loss": 0.5582, + "step": 7275 + }, + { + "epoch": 2.0155124653739613, + "grad_norm": 0.7526625990867615, + "learning_rate": 3.7533636081890735e-06, + "loss": 0.6624, + "step": 7276 + }, + { + "epoch": 2.0157894736842104, + "grad_norm": 0.709251344203949, + "learning_rate": 3.7530483984207254e-06, + "loss": 0.5901, + "step": 7277 + }, + { + "epoch": 2.01606648199446, + "grad_norm": 0.6591882705688477, + "learning_rate": 3.752733162046961e-06, + "loss": 0.5809, + "step": 7278 + }, + { + "epoch": 2.0163434903047093, + "grad_norm": 0.6839492321014404, + "learning_rate": 3.7524178990744712e-06, + "loss": 0.613, + "step": 7279 + }, + { + "epoch": 2.0166204986149583, + "grad_norm": 0.7151807546615601, + "learning_rate": 3.752102609509952e-06, + "loss": 0.6245, + "step": 7280 + }, + { + "epoch": 2.0168975069252078, + "grad_norm": 0.7240042686462402, + "learning_rate": 3.7517872933600963e-06, + "loss": 0.6042, + "step": 7281 + }, + { + "epoch": 2.017174515235457, + "grad_norm": 0.7010844945907593, + "learning_rate": 3.7514719506315995e-06, + "loss": 0.6244, + "step": 7282 + }, + { + "epoch": 2.0174515235457062, + "grad_norm": 0.7382633686065674, + "learning_rate": 3.751156581331158e-06, + "loss": 0.5639, + "step": 7283 + }, + { + "epoch": 2.0177285318559557, + "grad_norm": 0.684970498085022, + "learning_rate": 3.750841185465467e-06, + "loss": 0.5623, + "step": 7284 + }, + { + "epoch": 2.018005540166205, + "grad_norm": 0.7206228971481323, + "learning_rate": 3.750525763041223e-06, + "loss": 0.5525, + "step": 7285 + }, + { + "epoch": 2.018282548476454, + "grad_norm": 0.6992058753967285, + "learning_rate": 3.750210314065123e-06, + "loss": 0.5933, + "step": 7286 + }, + { + "epoch": 2.0185595567867036, + "grad_norm": 0.6930119395256042, + "learning_rate": 3.749894838543866e-06, + "loss": 0.5794, + "step": 7287 + }, + { + "epoch": 2.018836565096953, + "grad_norm": 0.6776397228240967, + "learning_rate": 3.7495793364841497e-06, + "loss": 0.5888, + "step": 7288 + }, + { + "epoch": 2.019113573407202, + "grad_norm": 0.6972619891166687, + "learning_rate": 3.7492638078926736e-06, + "loss": 0.6004, + "step": 7289 + }, + { + "epoch": 2.0193905817174516, + "grad_norm": 0.6977049708366394, + "learning_rate": 3.7489482527761358e-06, + "loss": 0.6138, + "step": 7290 + }, + { + "epoch": 2.019667590027701, + "grad_norm": 0.7075570225715637, + "learning_rate": 3.7486326711412373e-06, + "loss": 0.5772, + "step": 7291 + }, + { + "epoch": 2.01994459833795, + "grad_norm": 0.7177691459655762, + "learning_rate": 3.7483170629946784e-06, + "loss": 0.654, + "step": 7292 + }, + { + "epoch": 2.0202216066481995, + "grad_norm": 0.7144123911857605, + "learning_rate": 3.74800142834316e-06, + "loss": 0.5372, + "step": 7293 + }, + { + "epoch": 2.020498614958449, + "grad_norm": 0.7332642078399658, + "learning_rate": 3.747685767193385e-06, + "loss": 0.6061, + "step": 7294 + }, + { + "epoch": 2.020775623268698, + "grad_norm": 0.7368162870407104, + "learning_rate": 3.7473700795520553e-06, + "loss": 0.588, + "step": 7295 + }, + { + "epoch": 2.0210526315789474, + "grad_norm": 0.6972573399543762, + "learning_rate": 3.747054365425872e-06, + "loss": 0.6108, + "step": 7296 + }, + { + "epoch": 2.021329639889197, + "grad_norm": 0.7078856229782104, + "learning_rate": 3.746738624821541e-06, + "loss": 0.5558, + "step": 7297 + }, + { + "epoch": 2.021606648199446, + "grad_norm": 0.7175845503807068, + "learning_rate": 3.746422857745765e-06, + "loss": 0.5894, + "step": 7298 + }, + { + "epoch": 2.0218836565096954, + "grad_norm": 0.6819410920143127, + "learning_rate": 3.7461070642052477e-06, + "loss": 0.5635, + "step": 7299 + }, + { + "epoch": 2.0221606648199444, + "grad_norm": 0.6979035139083862, + "learning_rate": 3.745791244206697e-06, + "loss": 0.5714, + "step": 7300 + }, + { + "epoch": 2.022437673130194, + "grad_norm": 0.7359611988067627, + "learning_rate": 3.7454753977568154e-06, + "loss": 0.6479, + "step": 7301 + }, + { + "epoch": 2.0227146814404433, + "grad_norm": 0.724439799785614, + "learning_rate": 3.745159524862311e-06, + "loss": 0.5861, + "step": 7302 + }, + { + "epoch": 2.0229916897506923, + "grad_norm": 0.7226695418357849, + "learning_rate": 3.74484362552989e-06, + "loss": 0.5672, + "step": 7303 + }, + { + "epoch": 2.023268698060942, + "grad_norm": 0.7167572379112244, + "learning_rate": 3.7445276997662604e-06, + "loss": 0.6204, + "step": 7304 + }, + { + "epoch": 2.0235457063711912, + "grad_norm": 0.7843630313873291, + "learning_rate": 3.744211747578129e-06, + "loss": 0.5762, + "step": 7305 + }, + { + "epoch": 2.0238227146814403, + "grad_norm": 0.7135272026062012, + "learning_rate": 3.7438957689722043e-06, + "loss": 0.561, + "step": 7306 + }, + { + "epoch": 2.0240997229916897, + "grad_norm": 0.711733877658844, + "learning_rate": 3.743579763955197e-06, + "loss": 0.6362, + "step": 7307 + }, + { + "epoch": 2.024376731301939, + "grad_norm": 0.7069997191429138, + "learning_rate": 3.743263732533815e-06, + "loss": 0.5785, + "step": 7308 + }, + { + "epoch": 2.024653739612188, + "grad_norm": 0.6933404803276062, + "learning_rate": 3.7429476747147685e-06, + "loss": 0.5499, + "step": 7309 + }, + { + "epoch": 2.0249307479224377, + "grad_norm": 0.7288474440574646, + "learning_rate": 3.7426315905047694e-06, + "loss": 0.6466, + "step": 7310 + }, + { + "epoch": 2.025207756232687, + "grad_norm": 0.7310758829116821, + "learning_rate": 3.7423154799105283e-06, + "loss": 0.6243, + "step": 7311 + }, + { + "epoch": 2.025484764542936, + "grad_norm": 0.6922908425331116, + "learning_rate": 3.7419993429387563e-06, + "loss": 0.5676, + "step": 7312 + }, + { + "epoch": 2.0257617728531856, + "grad_norm": 0.6958523988723755, + "learning_rate": 3.741683179596168e-06, + "loss": 0.6155, + "step": 7313 + }, + { + "epoch": 2.026038781163435, + "grad_norm": 0.7387740612030029, + "learning_rate": 3.7413669898894734e-06, + "loss": 0.6175, + "step": 7314 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.7025557160377502, + "learning_rate": 3.7410507738253876e-06, + "loss": 0.6047, + "step": 7315 + }, + { + "epoch": 2.0265927977839335, + "grad_norm": 0.6605740189552307, + "learning_rate": 3.740734531410626e-06, + "loss": 0.5722, + "step": 7316 + }, + { + "epoch": 2.026869806094183, + "grad_norm": 0.6882765889167786, + "learning_rate": 3.7404182626519005e-06, + "loss": 0.6053, + "step": 7317 + }, + { + "epoch": 2.027146814404432, + "grad_norm": 0.6996808648109436, + "learning_rate": 3.7401019675559277e-06, + "loss": 0.5809, + "step": 7318 + }, + { + "epoch": 2.0274238227146815, + "grad_norm": 0.7477633357048035, + "learning_rate": 3.7397856461294237e-06, + "loss": 0.6255, + "step": 7319 + }, + { + "epoch": 2.027700831024931, + "grad_norm": 0.6916236281394958, + "learning_rate": 3.7394692983791037e-06, + "loss": 0.587, + "step": 7320 + }, + { + "epoch": 2.02797783933518, + "grad_norm": 0.7061457633972168, + "learning_rate": 3.7391529243116857e-06, + "loss": 0.5835, + "step": 7321 + }, + { + "epoch": 2.0282548476454294, + "grad_norm": 0.7239747047424316, + "learning_rate": 3.7388365239338866e-06, + "loss": 0.5874, + "step": 7322 + }, + { + "epoch": 2.028531855955679, + "grad_norm": 0.7012773752212524, + "learning_rate": 3.738520097252425e-06, + "loss": 0.6072, + "step": 7323 + }, + { + "epoch": 2.028808864265928, + "grad_norm": 0.6934051513671875, + "learning_rate": 3.7382036442740183e-06, + "loss": 0.6076, + "step": 7324 + }, + { + "epoch": 2.0290858725761773, + "grad_norm": 0.6960504651069641, + "learning_rate": 3.737887165005386e-06, + "loss": 0.5879, + "step": 7325 + }, + { + "epoch": 2.029362880886427, + "grad_norm": 0.777347981929779, + "learning_rate": 3.737570659453249e-06, + "loss": 0.594, + "step": 7326 + }, + { + "epoch": 2.029639889196676, + "grad_norm": 0.7199022173881531, + "learning_rate": 3.737254127624325e-06, + "loss": 0.5921, + "step": 7327 + }, + { + "epoch": 2.0299168975069253, + "grad_norm": 0.7547945976257324, + "learning_rate": 3.7369375695253374e-06, + "loss": 0.5838, + "step": 7328 + }, + { + "epoch": 2.0301939058171747, + "grad_norm": 0.754098117351532, + "learning_rate": 3.7366209851630057e-06, + "loss": 0.5948, + "step": 7329 + }, + { + "epoch": 2.0304709141274238, + "grad_norm": 0.7216700911521912, + "learning_rate": 3.736304374544053e-06, + "loss": 0.5521, + "step": 7330 + }, + { + "epoch": 2.030747922437673, + "grad_norm": 0.7701409459114075, + "learning_rate": 3.7359877376752006e-06, + "loss": 0.5908, + "step": 7331 + }, + { + "epoch": 2.0310249307479222, + "grad_norm": 0.7693707942962646, + "learning_rate": 3.7356710745631735e-06, + "loss": 0.5769, + "step": 7332 + }, + { + "epoch": 2.0313019390581717, + "grad_norm": 0.7225122451782227, + "learning_rate": 3.7353543852146926e-06, + "loss": 0.6152, + "step": 7333 + }, + { + "epoch": 2.031578947368421, + "grad_norm": 0.7436615824699402, + "learning_rate": 3.7350376696364843e-06, + "loss": 0.619, + "step": 7334 + }, + { + "epoch": 2.03185595567867, + "grad_norm": 0.735950231552124, + "learning_rate": 3.734720927835272e-06, + "loss": 0.6075, + "step": 7335 + }, + { + "epoch": 2.0321329639889196, + "grad_norm": 0.7073866724967957, + "learning_rate": 3.7344041598177812e-06, + "loss": 0.6224, + "step": 7336 + }, + { + "epoch": 2.032409972299169, + "grad_norm": 0.6516287326812744, + "learning_rate": 3.734087365590737e-06, + "loss": 0.5226, + "step": 7337 + }, + { + "epoch": 2.032686980609418, + "grad_norm": 0.7546796202659607, + "learning_rate": 3.7337705451608676e-06, + "loss": 0.6169, + "step": 7338 + }, + { + "epoch": 2.0329639889196676, + "grad_norm": 0.7578698396682739, + "learning_rate": 3.7334536985348986e-06, + "loss": 0.587, + "step": 7339 + }, + { + "epoch": 2.033240997229917, + "grad_norm": 0.6956138014793396, + "learning_rate": 3.733136825719557e-06, + "loss": 0.6144, + "step": 7340 + }, + { + "epoch": 2.033518005540166, + "grad_norm": 0.716972291469574, + "learning_rate": 3.732819926721572e-06, + "loss": 0.5942, + "step": 7341 + }, + { + "epoch": 2.0337950138504155, + "grad_norm": 0.7351236939430237, + "learning_rate": 3.732503001547672e-06, + "loss": 0.5588, + "step": 7342 + }, + { + "epoch": 2.034072022160665, + "grad_norm": 0.6928535103797913, + "learning_rate": 3.732186050204585e-06, + "loss": 0.613, + "step": 7343 + }, + { + "epoch": 2.034349030470914, + "grad_norm": 0.7150267362594604, + "learning_rate": 3.731869072699042e-06, + "loss": 0.6011, + "step": 7344 + }, + { + "epoch": 2.0346260387811634, + "grad_norm": 0.7005348205566406, + "learning_rate": 3.731552069037774e-06, + "loss": 0.5847, + "step": 7345 + }, + { + "epoch": 2.034903047091413, + "grad_norm": 0.6965834498405457, + "learning_rate": 3.731235039227508e-06, + "loss": 0.5559, + "step": 7346 + }, + { + "epoch": 2.035180055401662, + "grad_norm": 0.7243964076042175, + "learning_rate": 3.7309179832749798e-06, + "loss": 0.6314, + "step": 7347 + }, + { + "epoch": 2.0354570637119114, + "grad_norm": 0.7027679681777954, + "learning_rate": 3.7306009011869187e-06, + "loss": 0.5905, + "step": 7348 + }, + { + "epoch": 2.035734072022161, + "grad_norm": 0.7306333780288696, + "learning_rate": 3.730283792970058e-06, + "loss": 0.5806, + "step": 7349 + }, + { + "epoch": 2.03601108033241, + "grad_norm": 0.7483391761779785, + "learning_rate": 3.72996665863113e-06, + "loss": 0.6087, + "step": 7350 + }, + { + "epoch": 2.0362880886426593, + "grad_norm": 0.7386797070503235, + "learning_rate": 3.7296494981768694e-06, + "loss": 0.6034, + "step": 7351 + }, + { + "epoch": 2.0365650969529088, + "grad_norm": 0.7524106502532959, + "learning_rate": 3.72933231161401e-06, + "loss": 0.5895, + "step": 7352 + }, + { + "epoch": 2.036842105263158, + "grad_norm": 0.7452194690704346, + "learning_rate": 3.7290150989492856e-06, + "loss": 0.5701, + "step": 7353 + }, + { + "epoch": 2.0371191135734072, + "grad_norm": 0.724443793296814, + "learning_rate": 3.728697860189433e-06, + "loss": 0.6122, + "step": 7354 + }, + { + "epoch": 2.0373961218836567, + "grad_norm": 0.737069845199585, + "learning_rate": 3.7283805953411863e-06, + "loss": 0.5866, + "step": 7355 + }, + { + "epoch": 2.0376731301939057, + "grad_norm": 0.7455108761787415, + "learning_rate": 3.7280633044112826e-06, + "loss": 0.6072, + "step": 7356 + }, + { + "epoch": 2.037950138504155, + "grad_norm": 0.7549130916595459, + "learning_rate": 3.7277459874064593e-06, + "loss": 0.5652, + "step": 7357 + }, + { + "epoch": 2.0382271468144046, + "grad_norm": 0.7499793767929077, + "learning_rate": 3.727428644333453e-06, + "loss": 0.5898, + "step": 7358 + }, + { + "epoch": 2.0385041551246537, + "grad_norm": 0.7160870432853699, + "learning_rate": 3.7271112751990017e-06, + "loss": 0.5952, + "step": 7359 + }, + { + "epoch": 2.038781163434903, + "grad_norm": 0.7001029253005981, + "learning_rate": 3.7267938800098454e-06, + "loss": 0.5834, + "step": 7360 + }, + { + "epoch": 2.039058171745152, + "grad_norm": 0.7471390962600708, + "learning_rate": 3.7264764587727217e-06, + "loss": 0.6395, + "step": 7361 + }, + { + "epoch": 2.0393351800554016, + "grad_norm": 0.7129092812538147, + "learning_rate": 3.72615901149437e-06, + "loss": 0.6213, + "step": 7362 + }, + { + "epoch": 2.039612188365651, + "grad_norm": 0.7496341466903687, + "learning_rate": 3.7258415381815315e-06, + "loss": 0.6129, + "step": 7363 + }, + { + "epoch": 2.0398891966759, + "grad_norm": 0.7098085284233093, + "learning_rate": 3.7255240388409473e-06, + "loss": 0.5844, + "step": 7364 + }, + { + "epoch": 2.0401662049861495, + "grad_norm": 0.7387219071388245, + "learning_rate": 3.725206513479358e-06, + "loss": 0.6032, + "step": 7365 + }, + { + "epoch": 2.040443213296399, + "grad_norm": 0.712770402431488, + "learning_rate": 3.724888962103505e-06, + "loss": 0.5806, + "step": 7366 + }, + { + "epoch": 2.040720221606648, + "grad_norm": 0.7392038106918335, + "learning_rate": 3.724571384720132e-06, + "loss": 0.6242, + "step": 7367 + }, + { + "epoch": 2.0409972299168975, + "grad_norm": 0.6714391112327576, + "learning_rate": 3.7242537813359803e-06, + "loss": 0.6208, + "step": 7368 + }, + { + "epoch": 2.041274238227147, + "grad_norm": 0.6978964805603027, + "learning_rate": 3.7239361519577956e-06, + "loss": 0.5974, + "step": 7369 + }, + { + "epoch": 2.041551246537396, + "grad_norm": 0.7669302821159363, + "learning_rate": 3.72361849659232e-06, + "loss": 0.6248, + "step": 7370 + }, + { + "epoch": 2.0418282548476454, + "grad_norm": 0.6692224740982056, + "learning_rate": 3.723300815246299e-06, + "loss": 0.586, + "step": 7371 + }, + { + "epoch": 2.042105263157895, + "grad_norm": 0.6983182430267334, + "learning_rate": 3.722983107926478e-06, + "loss": 0.5746, + "step": 7372 + }, + { + "epoch": 2.042382271468144, + "grad_norm": 0.6932432651519775, + "learning_rate": 3.7226653746396025e-06, + "loss": 0.5787, + "step": 7373 + }, + { + "epoch": 2.0426592797783933, + "grad_norm": 0.7647563815116882, + "learning_rate": 3.722347615392419e-06, + "loss": 0.6347, + "step": 7374 + }, + { + "epoch": 2.042936288088643, + "grad_norm": 0.7473055720329285, + "learning_rate": 3.7220298301916735e-06, + "loss": 0.6685, + "step": 7375 + }, + { + "epoch": 2.043213296398892, + "grad_norm": 0.7167715430259705, + "learning_rate": 3.7217120190441134e-06, + "loss": 0.6559, + "step": 7376 + }, + { + "epoch": 2.0434903047091413, + "grad_norm": 0.7501154541969299, + "learning_rate": 3.721394181956489e-06, + "loss": 0.5932, + "step": 7377 + }, + { + "epoch": 2.0437673130193907, + "grad_norm": 0.729622483253479, + "learning_rate": 3.7210763189355455e-06, + "loss": 0.5582, + "step": 7378 + }, + { + "epoch": 2.0440443213296398, + "grad_norm": 0.7165427803993225, + "learning_rate": 3.7207584299880337e-06, + "loss": 0.6243, + "step": 7379 + }, + { + "epoch": 2.044321329639889, + "grad_norm": 0.6978292465209961, + "learning_rate": 3.7204405151207036e-06, + "loss": 0.6019, + "step": 7380 + }, + { + "epoch": 2.0445983379501387, + "grad_norm": 0.7220966219902039, + "learning_rate": 3.720122574340304e-06, + "loss": 0.6171, + "step": 7381 + }, + { + "epoch": 2.0448753462603877, + "grad_norm": 0.7531928420066833, + "learning_rate": 3.7198046076535865e-06, + "loss": 0.5751, + "step": 7382 + }, + { + "epoch": 2.045152354570637, + "grad_norm": 0.6982653737068176, + "learning_rate": 3.7194866150673027e-06, + "loss": 0.5655, + "step": 7383 + }, + { + "epoch": 2.0454293628808866, + "grad_norm": 0.6789435148239136, + "learning_rate": 3.7191685965882033e-06, + "loss": 0.5926, + "step": 7384 + }, + { + "epoch": 2.0457063711911356, + "grad_norm": 0.7709596753120422, + "learning_rate": 3.718850552223041e-06, + "loss": 0.6178, + "step": 7385 + }, + { + "epoch": 2.045983379501385, + "grad_norm": 0.71182781457901, + "learning_rate": 3.71853248197857e-06, + "loss": 0.6004, + "step": 7386 + }, + { + "epoch": 2.0462603878116346, + "grad_norm": 0.6998679637908936, + "learning_rate": 3.7182143858615416e-06, + "loss": 0.5959, + "step": 7387 + }, + { + "epoch": 2.0465373961218836, + "grad_norm": 0.721081554889679, + "learning_rate": 3.717896263878711e-06, + "loss": 0.5927, + "step": 7388 + }, + { + "epoch": 2.046814404432133, + "grad_norm": 0.7610851526260376, + "learning_rate": 3.7175781160368324e-06, + "loss": 0.6316, + "step": 7389 + }, + { + "epoch": 2.0470914127423825, + "grad_norm": 0.7433952689170837, + "learning_rate": 3.7172599423426614e-06, + "loss": 0.6325, + "step": 7390 + }, + { + "epoch": 2.0473684210526315, + "grad_norm": 0.7426512837409973, + "learning_rate": 3.7169417428029525e-06, + "loss": 0.6334, + "step": 7391 + }, + { + "epoch": 2.047645429362881, + "grad_norm": 0.7442209124565125, + "learning_rate": 3.7166235174244637e-06, + "loss": 0.5711, + "step": 7392 + }, + { + "epoch": 2.04792243767313, + "grad_norm": 0.7405110597610474, + "learning_rate": 3.7163052662139497e-06, + "loss": 0.5779, + "step": 7393 + }, + { + "epoch": 2.0481994459833794, + "grad_norm": 0.7343849539756775, + "learning_rate": 3.7159869891781698e-06, + "loss": 0.6144, + "step": 7394 + }, + { + "epoch": 2.048476454293629, + "grad_norm": 0.6877952814102173, + "learning_rate": 3.71566868632388e-06, + "loss": 0.5983, + "step": 7395 + }, + { + "epoch": 2.048753462603878, + "grad_norm": 0.7006337642669678, + "learning_rate": 3.71535035765784e-06, + "loss": 0.6, + "step": 7396 + }, + { + "epoch": 2.0490304709141274, + "grad_norm": 0.7534791827201843, + "learning_rate": 3.715032003186808e-06, + "loss": 0.6124, + "step": 7397 + }, + { + "epoch": 2.049307479224377, + "grad_norm": 0.7348729968070984, + "learning_rate": 3.7147136229175436e-06, + "loss": 0.6252, + "step": 7398 + }, + { + "epoch": 2.049584487534626, + "grad_norm": 0.6931565999984741, + "learning_rate": 3.7143952168568077e-06, + "loss": 0.5888, + "step": 7399 + }, + { + "epoch": 2.0498614958448753, + "grad_norm": 0.7033185362815857, + "learning_rate": 3.7140767850113592e-06, + "loss": 0.5964, + "step": 7400 + }, + { + "epoch": 2.0501385041551248, + "grad_norm": 0.7246971130371094, + "learning_rate": 3.7137583273879606e-06, + "loss": 0.6151, + "step": 7401 + }, + { + "epoch": 2.050415512465374, + "grad_norm": 0.7070119380950928, + "learning_rate": 3.713439843993373e-06, + "loss": 0.5908, + "step": 7402 + }, + { + "epoch": 2.0506925207756233, + "grad_norm": 0.7307522296905518, + "learning_rate": 3.7131213348343585e-06, + "loss": 0.624, + "step": 7403 + }, + { + "epoch": 2.0509695290858727, + "grad_norm": 0.7054348587989807, + "learning_rate": 3.71280279991768e-06, + "loss": 0.5466, + "step": 7404 + }, + { + "epoch": 2.0512465373961217, + "grad_norm": 0.7453206181526184, + "learning_rate": 3.7124842392501014e-06, + "loss": 0.6172, + "step": 7405 + }, + { + "epoch": 2.051523545706371, + "grad_norm": 0.7176256775856018, + "learning_rate": 3.7121656528383866e-06, + "loss": 0.6353, + "step": 7406 + }, + { + "epoch": 2.0518005540166206, + "grad_norm": 0.7283518314361572, + "learning_rate": 3.7118470406892986e-06, + "loss": 0.6067, + "step": 7407 + }, + { + "epoch": 2.0520775623268697, + "grad_norm": 0.7188484072685242, + "learning_rate": 3.7115284028096033e-06, + "loss": 0.5842, + "step": 7408 + }, + { + "epoch": 2.052354570637119, + "grad_norm": 0.7040556073188782, + "learning_rate": 3.7112097392060658e-06, + "loss": 0.6007, + "step": 7409 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.700164794921875, + "learning_rate": 3.7108910498854526e-06, + "loss": 0.5744, + "step": 7410 + }, + { + "epoch": 2.0529085872576176, + "grad_norm": 0.7129940986633301, + "learning_rate": 3.7105723348545308e-06, + "loss": 0.5953, + "step": 7411 + }, + { + "epoch": 2.053185595567867, + "grad_norm": 0.7359551191329956, + "learning_rate": 3.710253594120067e-06, + "loss": 0.5892, + "step": 7412 + }, + { + "epoch": 2.0534626038781165, + "grad_norm": 0.7358061671257019, + "learning_rate": 3.709934827688827e-06, + "loss": 0.5844, + "step": 7413 + }, + { + "epoch": 2.0537396121883655, + "grad_norm": 0.6994933485984802, + "learning_rate": 3.7096160355675827e-06, + "loss": 0.5767, + "step": 7414 + }, + { + "epoch": 2.054016620498615, + "grad_norm": 0.734293520450592, + "learning_rate": 3.7092972177630998e-06, + "loss": 0.6198, + "step": 7415 + }, + { + "epoch": 2.0542936288088645, + "grad_norm": 0.7511576414108276, + "learning_rate": 3.7089783742821493e-06, + "loss": 0.6362, + "step": 7416 + }, + { + "epoch": 2.0545706371191135, + "grad_norm": 0.7923803925514221, + "learning_rate": 3.7086595051315e-06, + "loss": 0.5829, + "step": 7417 + }, + { + "epoch": 2.054847645429363, + "grad_norm": 0.7466977834701538, + "learning_rate": 3.7083406103179235e-06, + "loss": 0.621, + "step": 7418 + }, + { + "epoch": 2.0551246537396124, + "grad_norm": 0.7417007088661194, + "learning_rate": 3.7080216898481897e-06, + "loss": 0.6103, + "step": 7419 + }, + { + "epoch": 2.0554016620498614, + "grad_norm": 0.69034743309021, + "learning_rate": 3.70770274372907e-06, + "loss": 0.6059, + "step": 7420 + }, + { + "epoch": 2.055678670360111, + "grad_norm": 0.6910015344619751, + "learning_rate": 3.707383771967338e-06, + "loss": 0.5914, + "step": 7421 + }, + { + "epoch": 2.05595567867036, + "grad_norm": 0.7586956024169922, + "learning_rate": 3.7070647745697647e-06, + "loss": 0.6004, + "step": 7422 + }, + { + "epoch": 2.0562326869806093, + "grad_norm": 0.7114026546478271, + "learning_rate": 3.706745751543123e-06, + "loss": 0.6295, + "step": 7423 + }, + { + "epoch": 2.056509695290859, + "grad_norm": 0.7049426436424255, + "learning_rate": 3.706426702894188e-06, + "loss": 0.6064, + "step": 7424 + }, + { + "epoch": 2.056786703601108, + "grad_norm": 0.7011547684669495, + "learning_rate": 3.7061076286297335e-06, + "loss": 0.5655, + "step": 7425 + }, + { + "epoch": 2.0570637119113573, + "grad_norm": 0.7338706254959106, + "learning_rate": 3.705788528756533e-06, + "loss": 0.5801, + "step": 7426 + }, + { + "epoch": 2.0573407202216067, + "grad_norm": 0.7272680401802063, + "learning_rate": 3.7054694032813644e-06, + "loss": 0.5601, + "step": 7427 + }, + { + "epoch": 2.0576177285318558, + "grad_norm": 0.744717001914978, + "learning_rate": 3.7051502522110006e-06, + "loss": 0.5771, + "step": 7428 + }, + { + "epoch": 2.057894736842105, + "grad_norm": 0.7375504970550537, + "learning_rate": 3.70483107555222e-06, + "loss": 0.6157, + "step": 7429 + }, + { + "epoch": 2.0581717451523547, + "grad_norm": 0.7337304353713989, + "learning_rate": 3.704511873311798e-06, + "loss": 0.5899, + "step": 7430 + }, + { + "epoch": 2.0584487534626037, + "grad_norm": 0.720333456993103, + "learning_rate": 3.7041926454965137e-06, + "loss": 0.628, + "step": 7431 + }, + { + "epoch": 2.058725761772853, + "grad_norm": 0.6853072643280029, + "learning_rate": 3.7038733921131444e-06, + "loss": 0.5362, + "step": 7432 + }, + { + "epoch": 2.0590027700831026, + "grad_norm": 0.6991992592811584, + "learning_rate": 3.7035541131684684e-06, + "loss": 0.5874, + "step": 7433 + }, + { + "epoch": 2.0592797783933516, + "grad_norm": 0.7000507712364197, + "learning_rate": 3.703234808669265e-06, + "loss": 0.6198, + "step": 7434 + }, + { + "epoch": 2.059556786703601, + "grad_norm": 0.7110688090324402, + "learning_rate": 3.7029154786223144e-06, + "loss": 0.609, + "step": 7435 + }, + { + "epoch": 2.0598337950138506, + "grad_norm": 0.7329934239387512, + "learning_rate": 3.7025961230343954e-06, + "loss": 0.6258, + "step": 7436 + }, + { + "epoch": 2.0601108033240996, + "grad_norm": 0.741564929485321, + "learning_rate": 3.70227674191229e-06, + "loss": 0.6174, + "step": 7437 + }, + { + "epoch": 2.060387811634349, + "grad_norm": 0.7025269865989685, + "learning_rate": 3.701957335262779e-06, + "loss": 0.5482, + "step": 7438 + }, + { + "epoch": 2.0606648199445985, + "grad_norm": 0.7336528301239014, + "learning_rate": 3.701637903092644e-06, + "loss": 0.6085, + "step": 7439 + }, + { + "epoch": 2.0609418282548475, + "grad_norm": 0.6992329955101013, + "learning_rate": 3.7013184454086688e-06, + "loss": 0.5614, + "step": 7440 + }, + { + "epoch": 2.061218836565097, + "grad_norm": 0.7023910284042358, + "learning_rate": 3.700998962217634e-06, + "loss": 0.5856, + "step": 7441 + }, + { + "epoch": 2.0614958448753464, + "grad_norm": 0.727231502532959, + "learning_rate": 3.7006794535263246e-06, + "loss": 0.6333, + "step": 7442 + }, + { + "epoch": 2.0617728531855954, + "grad_norm": 0.7130199670791626, + "learning_rate": 3.7003599193415247e-06, + "loss": 0.5509, + "step": 7443 + }, + { + "epoch": 2.062049861495845, + "grad_norm": 0.6830217242240906, + "learning_rate": 3.700040359670018e-06, + "loss": 0.5795, + "step": 7444 + }, + { + "epoch": 2.0623268698060944, + "grad_norm": 0.7464885115623474, + "learning_rate": 3.6997207745185892e-06, + "loss": 0.63, + "step": 7445 + }, + { + "epoch": 2.0626038781163434, + "grad_norm": 0.749214768409729, + "learning_rate": 3.6994011638940253e-06, + "loss": 0.6179, + "step": 7446 + }, + { + "epoch": 2.062880886426593, + "grad_norm": 0.7185910940170288, + "learning_rate": 3.6990815278031116e-06, + "loss": 0.5941, + "step": 7447 + }, + { + "epoch": 2.0631578947368423, + "grad_norm": 0.7367299795150757, + "learning_rate": 3.698761866252635e-06, + "loss": 0.6337, + "step": 7448 + }, + { + "epoch": 2.0634349030470913, + "grad_norm": 0.7037851810455322, + "learning_rate": 3.6984421792493824e-06, + "loss": 0.5475, + "step": 7449 + }, + { + "epoch": 2.063711911357341, + "grad_norm": 0.7410505414009094, + "learning_rate": 3.6981224668001427e-06, + "loss": 0.5965, + "step": 7450 + }, + { + "epoch": 2.0639889196675902, + "grad_norm": 0.702735185623169, + "learning_rate": 3.6978027289117024e-06, + "loss": 0.561, + "step": 7451 + }, + { + "epoch": 2.0642659279778393, + "grad_norm": 0.7526459693908691, + "learning_rate": 3.6974829655908518e-06, + "loss": 0.6003, + "step": 7452 + }, + { + "epoch": 2.0645429362880887, + "grad_norm": 0.6807567477226257, + "learning_rate": 3.69716317684438e-06, + "loss": 0.5654, + "step": 7453 + }, + { + "epoch": 2.064819944598338, + "grad_norm": 0.7194676995277405, + "learning_rate": 3.6968433626790755e-06, + "loss": 0.6455, + "step": 7454 + }, + { + "epoch": 2.065096952908587, + "grad_norm": 0.7026492953300476, + "learning_rate": 3.6965235231017316e-06, + "loss": 0.6082, + "step": 7455 + }, + { + "epoch": 2.0653739612188367, + "grad_norm": 0.6919288635253906, + "learning_rate": 3.6962036581191367e-06, + "loss": 0.6148, + "step": 7456 + }, + { + "epoch": 2.0656509695290857, + "grad_norm": 0.7587344646453857, + "learning_rate": 3.6958837677380843e-06, + "loss": 0.6202, + "step": 7457 + }, + { + "epoch": 2.065927977839335, + "grad_norm": 0.7068833708763123, + "learning_rate": 3.695563851965364e-06, + "loss": 0.6204, + "step": 7458 + }, + { + "epoch": 2.0662049861495846, + "grad_norm": 0.771920382976532, + "learning_rate": 3.695243910807772e-06, + "loss": 0.6288, + "step": 7459 + }, + { + "epoch": 2.0664819944598336, + "grad_norm": 0.6968458294868469, + "learning_rate": 3.694923944272098e-06, + "loss": 0.5845, + "step": 7460 + }, + { + "epoch": 2.066759002770083, + "grad_norm": 0.6951239705085754, + "learning_rate": 3.694603952365138e-06, + "loss": 0.6345, + "step": 7461 + }, + { + "epoch": 2.0670360110803325, + "grad_norm": 0.7480777502059937, + "learning_rate": 3.694283935093684e-06, + "loss": 0.6369, + "step": 7462 + }, + { + "epoch": 2.0673130193905815, + "grad_norm": 0.7199879288673401, + "learning_rate": 3.6939638924645336e-06, + "loss": 0.6139, + "step": 7463 + }, + { + "epoch": 2.067590027700831, + "grad_norm": 0.6921995878219604, + "learning_rate": 3.6936438244844802e-06, + "loss": 0.5678, + "step": 7464 + }, + { + "epoch": 2.0678670360110805, + "grad_norm": 0.7843019366264343, + "learning_rate": 3.6933237311603205e-06, + "loss": 0.6417, + "step": 7465 + }, + { + "epoch": 2.0681440443213295, + "grad_norm": 0.7345570921897888, + "learning_rate": 3.69300361249885e-06, + "loss": 0.6187, + "step": 7466 + }, + { + "epoch": 2.068421052631579, + "grad_norm": 0.7142504453659058, + "learning_rate": 3.692683468506866e-06, + "loss": 0.6057, + "step": 7467 + }, + { + "epoch": 2.0686980609418284, + "grad_norm": 0.690905749797821, + "learning_rate": 3.6923632991911672e-06, + "loss": 0.6299, + "step": 7468 + }, + { + "epoch": 2.0689750692520774, + "grad_norm": 0.7089608311653137, + "learning_rate": 3.69204310455855e-06, + "loss": 0.6054, + "step": 7469 + }, + { + "epoch": 2.069252077562327, + "grad_norm": 0.757316529750824, + "learning_rate": 3.691722884615814e-06, + "loss": 0.6013, + "step": 7470 + }, + { + "epoch": 2.0695290858725763, + "grad_norm": 0.7615708708763123, + "learning_rate": 3.6914026393697573e-06, + "loss": 0.5864, + "step": 7471 + }, + { + "epoch": 2.0698060941828254, + "grad_norm": 0.7319211959838867, + "learning_rate": 3.6910823688271803e-06, + "loss": 0.5808, + "step": 7472 + }, + { + "epoch": 2.070083102493075, + "grad_norm": 0.7206616401672363, + "learning_rate": 3.690762072994883e-06, + "loss": 0.582, + "step": 7473 + }, + { + "epoch": 2.0703601108033243, + "grad_norm": 0.7332830429077148, + "learning_rate": 3.6904417518796655e-06, + "loss": 0.6, + "step": 7474 + }, + { + "epoch": 2.0706371191135733, + "grad_norm": 0.7284979820251465, + "learning_rate": 3.69012140548833e-06, + "loss": 0.5562, + "step": 7475 + }, + { + "epoch": 2.0709141274238227, + "grad_norm": 0.7407473921775818, + "learning_rate": 3.689801033827678e-06, + "loss": 0.5966, + "step": 7476 + }, + { + "epoch": 2.071191135734072, + "grad_norm": 0.7004116177558899, + "learning_rate": 3.689480636904511e-06, + "loss": 0.6051, + "step": 7477 + }, + { + "epoch": 2.0714681440443212, + "grad_norm": 0.7138201594352722, + "learning_rate": 3.689160214725633e-06, + "loss": 0.5829, + "step": 7478 + }, + { + "epoch": 2.0717451523545707, + "grad_norm": 0.7637184858322144, + "learning_rate": 3.6888397672978465e-06, + "loss": 0.6599, + "step": 7479 + }, + { + "epoch": 2.07202216066482, + "grad_norm": 0.6777431964874268, + "learning_rate": 3.6885192946279557e-06, + "loss": 0.6032, + "step": 7480 + }, + { + "epoch": 2.072299168975069, + "grad_norm": 0.7209843397140503, + "learning_rate": 3.6881987967227656e-06, + "loss": 0.6343, + "step": 7481 + }, + { + "epoch": 2.0725761772853186, + "grad_norm": 0.6750618815422058, + "learning_rate": 3.6878782735890807e-06, + "loss": 0.6436, + "step": 7482 + }, + { + "epoch": 2.072853185595568, + "grad_norm": 0.7013235688209534, + "learning_rate": 3.6875577252337065e-06, + "loss": 0.6335, + "step": 7483 + }, + { + "epoch": 2.073130193905817, + "grad_norm": 0.7054664492607117, + "learning_rate": 3.6872371516634485e-06, + "loss": 0.5663, + "step": 7484 + }, + { + "epoch": 2.0734072022160666, + "grad_norm": 0.7187961339950562, + "learning_rate": 3.686916552885115e-06, + "loss": 0.6126, + "step": 7485 + }, + { + "epoch": 2.0736842105263156, + "grad_norm": 0.6890035271644592, + "learning_rate": 3.686595928905511e-06, + "loss": 0.5854, + "step": 7486 + }, + { + "epoch": 2.073961218836565, + "grad_norm": 0.7117055654525757, + "learning_rate": 3.6862752797314455e-06, + "loss": 0.5917, + "step": 7487 + }, + { + "epoch": 2.0742382271468145, + "grad_norm": 0.7330850958824158, + "learning_rate": 3.6859546053697264e-06, + "loss": 0.6316, + "step": 7488 + }, + { + "epoch": 2.0745152354570635, + "grad_norm": 0.7155464291572571, + "learning_rate": 3.685633905827163e-06, + "loss": 0.6043, + "step": 7489 + }, + { + "epoch": 2.074792243767313, + "grad_norm": 0.752101719379425, + "learning_rate": 3.685313181110563e-06, + "loss": 0.5926, + "step": 7490 + }, + { + "epoch": 2.0750692520775624, + "grad_norm": 0.695989727973938, + "learning_rate": 3.684992431226738e-06, + "loss": 0.5759, + "step": 7491 + }, + { + "epoch": 2.0753462603878114, + "grad_norm": 0.7123996019363403, + "learning_rate": 3.684671656182497e-06, + "loss": 0.6116, + "step": 7492 + }, + { + "epoch": 2.075623268698061, + "grad_norm": 0.7395370006561279, + "learning_rate": 3.684350855984652e-06, + "loss": 0.5756, + "step": 7493 + }, + { + "epoch": 2.0759002770083104, + "grad_norm": 0.7672297954559326, + "learning_rate": 3.6840300306400135e-06, + "loss": 0.58, + "step": 7494 + }, + { + "epoch": 2.0761772853185594, + "grad_norm": 0.7145938277244568, + "learning_rate": 3.683709180155393e-06, + "loss": 0.6158, + "step": 7495 + }, + { + "epoch": 2.076454293628809, + "grad_norm": 0.6776156425476074, + "learning_rate": 3.683388304537605e-06, + "loss": 0.6006, + "step": 7496 + }, + { + "epoch": 2.0767313019390583, + "grad_norm": 0.7504327297210693, + "learning_rate": 3.683067403793461e-06, + "loss": 0.5999, + "step": 7497 + }, + { + "epoch": 2.0770083102493073, + "grad_norm": 0.7060835957527161, + "learning_rate": 3.6827464779297744e-06, + "loss": 0.6208, + "step": 7498 + }, + { + "epoch": 2.077285318559557, + "grad_norm": 0.7075418829917908, + "learning_rate": 3.6824255269533595e-06, + "loss": 0.5903, + "step": 7499 + }, + { + "epoch": 2.0775623268698062, + "grad_norm": 0.7271056771278381, + "learning_rate": 3.682104550871031e-06, + "loss": 0.6038, + "step": 7500 + }, + { + "epoch": 2.0778393351800553, + "grad_norm": 0.7134436964988708, + "learning_rate": 3.6817835496896047e-06, + "loss": 0.5851, + "step": 7501 + }, + { + "epoch": 2.0781163434903047, + "grad_norm": 0.7162055373191833, + "learning_rate": 3.681462523415896e-06, + "loss": 0.6299, + "step": 7502 + }, + { + "epoch": 2.078393351800554, + "grad_norm": 0.6949959397315979, + "learning_rate": 3.6811414720567197e-06, + "loss": 0.5712, + "step": 7503 + }, + { + "epoch": 2.078670360110803, + "grad_norm": 0.6959918141365051, + "learning_rate": 3.6808203956188947e-06, + "loss": 0.6075, + "step": 7504 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.7247087955474854, + "learning_rate": 3.6804992941092367e-06, + "loss": 0.6493, + "step": 7505 + }, + { + "epoch": 2.079224376731302, + "grad_norm": 0.7352406978607178, + "learning_rate": 3.680178167534564e-06, + "loss": 0.6091, + "step": 7506 + }, + { + "epoch": 2.079501385041551, + "grad_norm": 0.7071592211723328, + "learning_rate": 3.6798570159016955e-06, + "loss": 0.5718, + "step": 7507 + }, + { + "epoch": 2.0797783933518006, + "grad_norm": 0.739302933216095, + "learning_rate": 3.6795358392174484e-06, + "loss": 0.5814, + "step": 7508 + }, + { + "epoch": 2.08005540166205, + "grad_norm": 0.7068384289741516, + "learning_rate": 3.679214637488644e-06, + "loss": 0.6191, + "step": 7509 + }, + { + "epoch": 2.080332409972299, + "grad_norm": 0.7154803276062012, + "learning_rate": 3.6788934107221016e-06, + "loss": 0.6188, + "step": 7510 + }, + { + "epoch": 2.0806094182825485, + "grad_norm": 0.7316164374351501, + "learning_rate": 3.6785721589246413e-06, + "loss": 0.5853, + "step": 7511 + }, + { + "epoch": 2.080886426592798, + "grad_norm": 0.7105570435523987, + "learning_rate": 3.678250882103084e-06, + "loss": 0.6135, + "step": 7512 + }, + { + "epoch": 2.081163434903047, + "grad_norm": 0.6962291598320007, + "learning_rate": 3.6779295802642524e-06, + "loss": 0.5665, + "step": 7513 + }, + { + "epoch": 2.0814404432132965, + "grad_norm": 0.7440493702888489, + "learning_rate": 3.6776082534149664e-06, + "loss": 0.6088, + "step": 7514 + }, + { + "epoch": 2.081717451523546, + "grad_norm": 0.7357923984527588, + "learning_rate": 3.677286901562051e-06, + "loss": 0.5822, + "step": 7515 + }, + { + "epoch": 2.081994459833795, + "grad_norm": 0.7124137878417969, + "learning_rate": 3.676965524712328e-06, + "loss": 0.6255, + "step": 7516 + }, + { + "epoch": 2.0822714681440444, + "grad_norm": 0.7291326522827148, + "learning_rate": 3.6766441228726212e-06, + "loss": 0.5945, + "step": 7517 + }, + { + "epoch": 2.0825484764542934, + "grad_norm": 0.7190305590629578, + "learning_rate": 3.6763226960497543e-06, + "loss": 0.5883, + "step": 7518 + }, + { + "epoch": 2.082825484764543, + "grad_norm": 0.7270352244377136, + "learning_rate": 3.676001244250553e-06, + "loss": 0.6422, + "step": 7519 + }, + { + "epoch": 2.0831024930747923, + "grad_norm": 0.7007784843444824, + "learning_rate": 3.675679767481842e-06, + "loss": 0.5702, + "step": 7520 + }, + { + "epoch": 2.0833795013850414, + "grad_norm": 0.7307643890380859, + "learning_rate": 3.6753582657504476e-06, + "loss": 0.5961, + "step": 7521 + }, + { + "epoch": 2.083656509695291, + "grad_norm": 0.7441365122795105, + "learning_rate": 3.6750367390631954e-06, + "loss": 0.6231, + "step": 7522 + }, + { + "epoch": 2.0839335180055403, + "grad_norm": 0.7360666394233704, + "learning_rate": 3.6747151874269125e-06, + "loss": 0.5898, + "step": 7523 + }, + { + "epoch": 2.0842105263157893, + "grad_norm": 0.7180540561676025, + "learning_rate": 3.674393610848426e-06, + "loss": 0.6219, + "step": 7524 + }, + { + "epoch": 2.0844875346260388, + "grad_norm": 0.7242913246154785, + "learning_rate": 3.6740720093345646e-06, + "loss": 0.6056, + "step": 7525 + }, + { + "epoch": 2.084764542936288, + "grad_norm": 0.7217580080032349, + "learning_rate": 3.673750382892156e-06, + "loss": 0.6093, + "step": 7526 + }, + { + "epoch": 2.0850415512465372, + "grad_norm": 0.6987811923027039, + "learning_rate": 3.6734287315280293e-06, + "loss": 0.625, + "step": 7527 + }, + { + "epoch": 2.0853185595567867, + "grad_norm": 0.7222145199775696, + "learning_rate": 3.673107055249014e-06, + "loss": 0.6025, + "step": 7528 + }, + { + "epoch": 2.085595567867036, + "grad_norm": 0.7623053789138794, + "learning_rate": 3.67278535406194e-06, + "loss": 0.6376, + "step": 7529 + }, + { + "epoch": 2.085872576177285, + "grad_norm": 0.7541896104812622, + "learning_rate": 3.672463627973639e-06, + "loss": 0.6107, + "step": 7530 + }, + { + "epoch": 2.0861495844875346, + "grad_norm": 0.7208195328712463, + "learning_rate": 3.672141876990939e-06, + "loss": 0.597, + "step": 7531 + }, + { + "epoch": 2.086426592797784, + "grad_norm": 0.7163652777671814, + "learning_rate": 3.671820101120675e-06, + "loss": 0.6422, + "step": 7532 + }, + { + "epoch": 2.086703601108033, + "grad_norm": 0.7693494558334351, + "learning_rate": 3.671498300369678e-06, + "loss": 0.6053, + "step": 7533 + }, + { + "epoch": 2.0869806094182826, + "grad_norm": 0.6839724779129028, + "learning_rate": 3.6711764747447797e-06, + "loss": 0.5143, + "step": 7534 + }, + { + "epoch": 2.087257617728532, + "grad_norm": 0.7062249183654785, + "learning_rate": 3.6708546242528135e-06, + "loss": 0.5237, + "step": 7535 + }, + { + "epoch": 2.087534626038781, + "grad_norm": 0.7552582025527954, + "learning_rate": 3.670532748900615e-06, + "loss": 0.5947, + "step": 7536 + }, + { + "epoch": 2.0878116343490305, + "grad_norm": 0.738476037979126, + "learning_rate": 3.670210848695016e-06, + "loss": 0.6166, + "step": 7537 + }, + { + "epoch": 2.08808864265928, + "grad_norm": 0.7047578692436218, + "learning_rate": 3.6698889236428526e-06, + "loss": 0.6175, + "step": 7538 + }, + { + "epoch": 2.088365650969529, + "grad_norm": 0.7480363845825195, + "learning_rate": 3.66956697375096e-06, + "loss": 0.6328, + "step": 7539 + }, + { + "epoch": 2.0886426592797784, + "grad_norm": 0.7898052930831909, + "learning_rate": 3.6692449990261736e-06, + "loss": 0.6133, + "step": 7540 + }, + { + "epoch": 2.088919667590028, + "grad_norm": 0.7435026168823242, + "learning_rate": 3.6689229994753293e-06, + "loss": 0.5961, + "step": 7541 + }, + { + "epoch": 2.089196675900277, + "grad_norm": 0.716643750667572, + "learning_rate": 3.668600975105266e-06, + "loss": 0.5999, + "step": 7542 + }, + { + "epoch": 2.0894736842105264, + "grad_norm": 0.7211320400238037, + "learning_rate": 3.668278925922819e-06, + "loss": 0.6316, + "step": 7543 + }, + { + "epoch": 2.089750692520776, + "grad_norm": 0.7620773911476135, + "learning_rate": 3.6679568519348264e-06, + "loss": 0.5709, + "step": 7544 + }, + { + "epoch": 2.090027700831025, + "grad_norm": 0.7366681694984436, + "learning_rate": 3.6676347531481283e-06, + "loss": 0.6344, + "step": 7545 + }, + { + "epoch": 2.0903047091412743, + "grad_norm": 0.7111081480979919, + "learning_rate": 3.6673126295695615e-06, + "loss": 0.5959, + "step": 7546 + }, + { + "epoch": 2.0905817174515233, + "grad_norm": 0.74237060546875, + "learning_rate": 3.6669904812059674e-06, + "loss": 0.6095, + "step": 7547 + }, + { + "epoch": 2.090858725761773, + "grad_norm": 0.7371329665184021, + "learning_rate": 3.6666683080641846e-06, + "loss": 0.5726, + "step": 7548 + }, + { + "epoch": 2.0911357340720222, + "grad_norm": 0.7270698547363281, + "learning_rate": 3.666346110151055e-06, + "loss": 0.6043, + "step": 7549 + }, + { + "epoch": 2.0914127423822713, + "grad_norm": 0.738433301448822, + "learning_rate": 3.666023887473418e-06, + "loss": 0.6094, + "step": 7550 + }, + { + "epoch": 2.0916897506925207, + "grad_norm": 0.6856642961502075, + "learning_rate": 3.665701640038117e-06, + "loss": 0.5672, + "step": 7551 + }, + { + "epoch": 2.09196675900277, + "grad_norm": 0.741007924079895, + "learning_rate": 3.6653793678519926e-06, + "loss": 0.6287, + "step": 7552 + }, + { + "epoch": 2.092243767313019, + "grad_norm": 0.703421413898468, + "learning_rate": 3.6650570709218884e-06, + "loss": 0.6057, + "step": 7553 + }, + { + "epoch": 2.0925207756232687, + "grad_norm": 0.7339208722114563, + "learning_rate": 3.6647347492546477e-06, + "loss": 0.6702, + "step": 7554 + }, + { + "epoch": 2.092797783933518, + "grad_norm": 0.7148478627204895, + "learning_rate": 3.664412402857114e-06, + "loss": 0.6141, + "step": 7555 + }, + { + "epoch": 2.093074792243767, + "grad_norm": 0.7352780699729919, + "learning_rate": 3.6640900317361306e-06, + "loss": 0.6118, + "step": 7556 + }, + { + "epoch": 2.0933518005540166, + "grad_norm": 0.7099614143371582, + "learning_rate": 3.6637676358985435e-06, + "loss": 0.6144, + "step": 7557 + }, + { + "epoch": 2.093628808864266, + "grad_norm": 0.7019182443618774, + "learning_rate": 3.663445215351198e-06, + "loss": 0.6303, + "step": 7558 + }, + { + "epoch": 2.093905817174515, + "grad_norm": 0.6563045382499695, + "learning_rate": 3.6631227701009387e-06, + "loss": 0.5652, + "step": 7559 + }, + { + "epoch": 2.0941828254847645, + "grad_norm": 0.7421273589134216, + "learning_rate": 3.662800300154613e-06, + "loss": 0.608, + "step": 7560 + }, + { + "epoch": 2.094459833795014, + "grad_norm": 0.71537846326828, + "learning_rate": 3.662477805519068e-06, + "loss": 0.6085, + "step": 7561 + }, + { + "epoch": 2.094736842105263, + "grad_norm": 0.7306783199310303, + "learning_rate": 3.66215528620115e-06, + "loss": 0.5915, + "step": 7562 + }, + { + "epoch": 2.0950138504155125, + "grad_norm": 0.7352632284164429, + "learning_rate": 3.661832742207707e-06, + "loss": 0.6423, + "step": 7563 + }, + { + "epoch": 2.095290858725762, + "grad_norm": 0.7389764189720154, + "learning_rate": 3.6615101735455883e-06, + "loss": 0.6159, + "step": 7564 + }, + { + "epoch": 2.095567867036011, + "grad_norm": 0.7337745428085327, + "learning_rate": 3.6611875802216427e-06, + "loss": 0.5833, + "step": 7565 + }, + { + "epoch": 2.0958448753462604, + "grad_norm": 0.6928842663764954, + "learning_rate": 3.660864962242719e-06, + "loss": 0.5738, + "step": 7566 + }, + { + "epoch": 2.09612188365651, + "grad_norm": 0.6791226863861084, + "learning_rate": 3.6605423196156687e-06, + "loss": 0.5377, + "step": 7567 + }, + { + "epoch": 2.096398891966759, + "grad_norm": 0.7218356132507324, + "learning_rate": 3.66021965234734e-06, + "loss": 0.5785, + "step": 7568 + }, + { + "epoch": 2.0966759002770083, + "grad_norm": 0.700420618057251, + "learning_rate": 3.6598969604445854e-06, + "loss": 0.585, + "step": 7569 + }, + { + "epoch": 2.096952908587258, + "grad_norm": 0.7712434530258179, + "learning_rate": 3.659574243914257e-06, + "loss": 0.6434, + "step": 7570 + }, + { + "epoch": 2.097229916897507, + "grad_norm": 0.7493064999580383, + "learning_rate": 3.659251502763206e-06, + "loss": 0.6556, + "step": 7571 + }, + { + "epoch": 2.0975069252077563, + "grad_norm": 0.7280117273330688, + "learning_rate": 3.658928736998284e-06, + "loss": 0.596, + "step": 7572 + }, + { + "epoch": 2.0977839335180057, + "grad_norm": 0.7482536435127258, + "learning_rate": 3.6586059466263463e-06, + "loss": 0.5978, + "step": 7573 + }, + { + "epoch": 2.0980609418282548, + "grad_norm": 0.690613329410553, + "learning_rate": 3.6582831316542454e-06, + "loss": 0.6279, + "step": 7574 + }, + { + "epoch": 2.098337950138504, + "grad_norm": 0.6842516660690308, + "learning_rate": 3.657960292088836e-06, + "loss": 0.5736, + "step": 7575 + }, + { + "epoch": 2.0986149584487537, + "grad_norm": 0.7524868845939636, + "learning_rate": 3.657637427936972e-06, + "loss": 0.671, + "step": 7576 + }, + { + "epoch": 2.0988919667590027, + "grad_norm": 0.708136796951294, + "learning_rate": 3.6573145392055097e-06, + "loss": 0.6255, + "step": 7577 + }, + { + "epoch": 2.099168975069252, + "grad_norm": 0.7181639075279236, + "learning_rate": 3.656991625901304e-06, + "loss": 0.6044, + "step": 7578 + }, + { + "epoch": 2.0994459833795016, + "grad_norm": 0.7263873815536499, + "learning_rate": 3.6566686880312106e-06, + "loss": 0.6201, + "step": 7579 + }, + { + "epoch": 2.0997229916897506, + "grad_norm": 0.7212371826171875, + "learning_rate": 3.656345725602089e-06, + "loss": 0.591, + "step": 7580 + }, + { + "epoch": 2.1, + "grad_norm": 0.7355701923370361, + "learning_rate": 3.656022738620793e-06, + "loss": 0.6131, + "step": 7581 + }, + { + "epoch": 2.100277008310249, + "grad_norm": 0.6883821487426758, + "learning_rate": 3.655699727094183e-06, + "loss": 0.5995, + "step": 7582 + }, + { + "epoch": 2.1005540166204986, + "grad_norm": 0.716107189655304, + "learning_rate": 3.655376691029116e-06, + "loss": 0.6251, + "step": 7583 + }, + { + "epoch": 2.100831024930748, + "grad_norm": 0.7313331961631775, + "learning_rate": 3.655053630432452e-06, + "loss": 0.6354, + "step": 7584 + }, + { + "epoch": 2.101108033240997, + "grad_norm": 0.7299684882164001, + "learning_rate": 3.654730545311049e-06, + "loss": 0.6014, + "step": 7585 + }, + { + "epoch": 2.1013850415512465, + "grad_norm": 0.7288849949836731, + "learning_rate": 3.6544074356717675e-06, + "loss": 0.5749, + "step": 7586 + }, + { + "epoch": 2.101662049861496, + "grad_norm": 0.724434494972229, + "learning_rate": 3.654084301521469e-06, + "loss": 0.575, + "step": 7587 + }, + { + "epoch": 2.101939058171745, + "grad_norm": 0.7240490913391113, + "learning_rate": 3.6537611428670127e-06, + "loss": 0.6028, + "step": 7588 + }, + { + "epoch": 2.1022160664819944, + "grad_norm": 0.7254226803779602, + "learning_rate": 3.6534379597152615e-06, + "loss": 0.641, + "step": 7589 + }, + { + "epoch": 2.102493074792244, + "grad_norm": 0.7376781702041626, + "learning_rate": 3.653114752073077e-06, + "loss": 0.6108, + "step": 7590 + }, + { + "epoch": 2.102770083102493, + "grad_norm": 0.7038312554359436, + "learning_rate": 3.6527915199473206e-06, + "loss": 0.609, + "step": 7591 + }, + { + "epoch": 2.1030470914127424, + "grad_norm": 0.7228493094444275, + "learning_rate": 3.652468263344857e-06, + "loss": 0.6033, + "step": 7592 + }, + { + "epoch": 2.103324099722992, + "grad_norm": 0.7285172343254089, + "learning_rate": 3.65214498227255e-06, + "loss": 0.6323, + "step": 7593 + }, + { + "epoch": 2.103601108033241, + "grad_norm": 0.7170820236206055, + "learning_rate": 3.651821676737261e-06, + "loss": 0.5395, + "step": 7594 + }, + { + "epoch": 2.1038781163434903, + "grad_norm": 0.7217197418212891, + "learning_rate": 3.6514983467458577e-06, + "loss": 0.6448, + "step": 7595 + }, + { + "epoch": 2.1041551246537398, + "grad_norm": 0.7080284953117371, + "learning_rate": 3.651174992305203e-06, + "loss": 0.6125, + "step": 7596 + }, + { + "epoch": 2.104432132963989, + "grad_norm": 0.7334601879119873, + "learning_rate": 3.650851613422164e-06, + "loss": 0.5814, + "step": 7597 + }, + { + "epoch": 2.1047091412742382, + "grad_norm": 0.7425627708435059, + "learning_rate": 3.650528210103606e-06, + "loss": 0.5994, + "step": 7598 + }, + { + "epoch": 2.1049861495844877, + "grad_norm": 0.720252275466919, + "learning_rate": 3.6502047823563967e-06, + "loss": 0.5901, + "step": 7599 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.6992184519767761, + "learning_rate": 3.6498813301874016e-06, + "loss": 0.5599, + "step": 7600 + }, + { + "epoch": 2.105540166204986, + "grad_norm": 0.7782900333404541, + "learning_rate": 3.64955785360349e-06, + "loss": 0.6099, + "step": 7601 + }, + { + "epoch": 2.1058171745152356, + "grad_norm": 0.7176836729049683, + "learning_rate": 3.6492343526115292e-06, + "loss": 0.5997, + "step": 7602 + }, + { + "epoch": 2.1060941828254847, + "grad_norm": 0.6862257719039917, + "learning_rate": 3.6489108272183883e-06, + "loss": 0.5779, + "step": 7603 + }, + { + "epoch": 2.106371191135734, + "grad_norm": 0.7442038059234619, + "learning_rate": 3.648587277430936e-06, + "loss": 0.6, + "step": 7604 + }, + { + "epoch": 2.1066481994459836, + "grad_norm": 0.7054752111434937, + "learning_rate": 3.6482637032560437e-06, + "loss": 0.533, + "step": 7605 + }, + { + "epoch": 2.1069252077562326, + "grad_norm": 0.7280328869819641, + "learning_rate": 3.64794010470058e-06, + "loss": 0.666, + "step": 7606 + }, + { + "epoch": 2.107202216066482, + "grad_norm": 0.8324413299560547, + "learning_rate": 3.6476164817714166e-06, + "loss": 0.6287, + "step": 7607 + }, + { + "epoch": 2.107479224376731, + "grad_norm": 0.725425660610199, + "learning_rate": 3.647292834475424e-06, + "loss": 0.6133, + "step": 7608 + }, + { + "epoch": 2.1077562326869805, + "grad_norm": 0.6995035409927368, + "learning_rate": 3.6469691628194753e-06, + "loss": 0.6095, + "step": 7609 + }, + { + "epoch": 2.10803324099723, + "grad_norm": 0.6647850871086121, + "learning_rate": 3.6466454668104423e-06, + "loss": 0.5523, + "step": 7610 + }, + { + "epoch": 2.108310249307479, + "grad_norm": 0.7180917263031006, + "learning_rate": 3.6463217464551977e-06, + "loss": 0.5811, + "step": 7611 + }, + { + "epoch": 2.1085872576177285, + "grad_norm": 0.7054855823516846, + "learning_rate": 3.645998001760615e-06, + "loss": 0.5845, + "step": 7612 + }, + { + "epoch": 2.108864265927978, + "grad_norm": 0.7367815375328064, + "learning_rate": 3.645674232733568e-06, + "loss": 0.6051, + "step": 7613 + }, + { + "epoch": 2.109141274238227, + "grad_norm": 0.6971177458763123, + "learning_rate": 3.6453504393809312e-06, + "loss": 0.5849, + "step": 7614 + }, + { + "epoch": 2.1094182825484764, + "grad_norm": 0.6860442161560059, + "learning_rate": 3.6450266217095804e-06, + "loss": 0.545, + "step": 7615 + }, + { + "epoch": 2.109695290858726, + "grad_norm": 0.7291378974914551, + "learning_rate": 3.64470277972639e-06, + "loss": 0.5841, + "step": 7616 + }, + { + "epoch": 2.109972299168975, + "grad_norm": 0.686470627784729, + "learning_rate": 3.6443789134382356e-06, + "loss": 0.5594, + "step": 7617 + }, + { + "epoch": 2.1102493074792243, + "grad_norm": 0.7135543823242188, + "learning_rate": 3.6440550228519954e-06, + "loss": 0.5797, + "step": 7618 + }, + { + "epoch": 2.110526315789474, + "grad_norm": 0.7279857397079468, + "learning_rate": 3.643731107974545e-06, + "loss": 0.5673, + "step": 7619 + }, + { + "epoch": 2.110803324099723, + "grad_norm": 0.6739892363548279, + "learning_rate": 3.6434071688127627e-06, + "loss": 0.5728, + "step": 7620 + }, + { + "epoch": 2.1110803324099723, + "grad_norm": 0.7875750660896301, + "learning_rate": 3.643083205373526e-06, + "loss": 0.611, + "step": 7621 + }, + { + "epoch": 2.1113573407202217, + "grad_norm": 0.7165523767471313, + "learning_rate": 3.6427592176637133e-06, + "loss": 0.5847, + "step": 7622 + }, + { + "epoch": 2.1116343490304708, + "grad_norm": 0.7262118458747864, + "learning_rate": 3.6424352056902046e-06, + "loss": 0.6097, + "step": 7623 + }, + { + "epoch": 2.11191135734072, + "grad_norm": 0.6838838458061218, + "learning_rate": 3.642111169459879e-06, + "loss": 0.6121, + "step": 7624 + }, + { + "epoch": 2.1121883656509697, + "grad_norm": 0.7182348966598511, + "learning_rate": 3.641787108979617e-06, + "loss": 0.5979, + "step": 7625 + }, + { + "epoch": 2.1124653739612187, + "grad_norm": 0.7114434242248535, + "learning_rate": 3.641463024256298e-06, + "loss": 0.6148, + "step": 7626 + }, + { + "epoch": 2.112742382271468, + "grad_norm": 0.7483696937561035, + "learning_rate": 3.6411389152968046e-06, + "loss": 0.6394, + "step": 7627 + }, + { + "epoch": 2.1130193905817176, + "grad_norm": 0.7226950526237488, + "learning_rate": 3.6408147821080183e-06, + "loss": 0.6132, + "step": 7628 + }, + { + "epoch": 2.1132963988919666, + "grad_norm": 0.7192668914794922, + "learning_rate": 3.64049062469682e-06, + "loss": 0.6113, + "step": 7629 + }, + { + "epoch": 2.113573407202216, + "grad_norm": 0.6802382469177246, + "learning_rate": 3.6401664430700935e-06, + "loss": 0.5291, + "step": 7630 + }, + { + "epoch": 2.1138504155124656, + "grad_norm": 0.7091238498687744, + "learning_rate": 3.6398422372347227e-06, + "loss": 0.6269, + "step": 7631 + }, + { + "epoch": 2.1141274238227146, + "grad_norm": 0.7575234770774841, + "learning_rate": 3.639518007197589e-06, + "loss": 0.5931, + "step": 7632 + }, + { + "epoch": 2.114404432132964, + "grad_norm": 0.7375851273536682, + "learning_rate": 3.6391937529655786e-06, + "loss": 0.6294, + "step": 7633 + }, + { + "epoch": 2.1146814404432135, + "grad_norm": 0.7400649785995483, + "learning_rate": 3.6388694745455756e-06, + "loss": 0.623, + "step": 7634 + }, + { + "epoch": 2.1149584487534625, + "grad_norm": 0.7256798148155212, + "learning_rate": 3.6385451719444655e-06, + "loss": 0.5864, + "step": 7635 + }, + { + "epoch": 2.115235457063712, + "grad_norm": 0.7284665107727051, + "learning_rate": 3.6382208451691333e-06, + "loss": 0.5948, + "step": 7636 + }, + { + "epoch": 2.1155124653739614, + "grad_norm": 0.7431206107139587, + "learning_rate": 3.6378964942264662e-06, + "loss": 0.6077, + "step": 7637 + }, + { + "epoch": 2.1157894736842104, + "grad_norm": 0.727976381778717, + "learning_rate": 3.6375721191233508e-06, + "loss": 0.6108, + "step": 7638 + }, + { + "epoch": 2.11606648199446, + "grad_norm": 0.7988527417182922, + "learning_rate": 3.637247719866674e-06, + "loss": 0.6067, + "step": 7639 + }, + { + "epoch": 2.1163434903047094, + "grad_norm": 0.7804294228553772, + "learning_rate": 3.6369232964633237e-06, + "loss": 0.5959, + "step": 7640 + }, + { + "epoch": 2.1166204986149584, + "grad_norm": 0.7260686159133911, + "learning_rate": 3.6365988489201887e-06, + "loss": 0.6191, + "step": 7641 + }, + { + "epoch": 2.116897506925208, + "grad_norm": 0.712604820728302, + "learning_rate": 3.6362743772441576e-06, + "loss": 0.6136, + "step": 7642 + }, + { + "epoch": 2.117174515235457, + "grad_norm": 0.7114692330360413, + "learning_rate": 3.63594988144212e-06, + "loss": 0.5637, + "step": 7643 + }, + { + "epoch": 2.1174515235457063, + "grad_norm": 0.751819372177124, + "learning_rate": 3.635625361520966e-06, + "loss": 0.5996, + "step": 7644 + }, + { + "epoch": 2.1177285318559558, + "grad_norm": 0.7611382603645325, + "learning_rate": 3.6353008174875844e-06, + "loss": 0.6446, + "step": 7645 + }, + { + "epoch": 2.118005540166205, + "grad_norm": 0.7532554268836975, + "learning_rate": 3.634976249348867e-06, + "loss": 0.6, + "step": 7646 + }, + { + "epoch": 2.1182825484764543, + "grad_norm": 0.7395018935203552, + "learning_rate": 3.6346516571117063e-06, + "loss": 0.599, + "step": 7647 + }, + { + "epoch": 2.1185595567867037, + "grad_norm": 0.7286550998687744, + "learning_rate": 3.6343270407829934e-06, + "loss": 0.6025, + "step": 7648 + }, + { + "epoch": 2.1188365650969527, + "grad_norm": 0.7371105551719666, + "learning_rate": 3.63400240036962e-06, + "loss": 0.6017, + "step": 7649 + }, + { + "epoch": 2.119113573407202, + "grad_norm": 0.7373232841491699, + "learning_rate": 3.63367773587848e-06, + "loss": 0.587, + "step": 7650 + }, + { + "epoch": 2.1193905817174516, + "grad_norm": 0.7241517901420593, + "learning_rate": 3.633353047316467e-06, + "loss": 0.61, + "step": 7651 + }, + { + "epoch": 2.1196675900277007, + "grad_norm": 0.7286142110824585, + "learning_rate": 3.633028334690473e-06, + "loss": 0.6041, + "step": 7652 + }, + { + "epoch": 2.11994459833795, + "grad_norm": 0.7213249206542969, + "learning_rate": 3.632703598007396e-06, + "loss": 0.6292, + "step": 7653 + }, + { + "epoch": 2.1202216066481996, + "grad_norm": 0.7063266634941101, + "learning_rate": 3.632378837274128e-06, + "loss": 0.5628, + "step": 7654 + }, + { + "epoch": 2.1204986149584486, + "grad_norm": 0.7615188956260681, + "learning_rate": 3.6320540524975655e-06, + "loss": 0.6539, + "step": 7655 + }, + { + "epoch": 2.120775623268698, + "grad_norm": 0.7243799567222595, + "learning_rate": 3.6317292436846053e-06, + "loss": 0.598, + "step": 7656 + }, + { + "epoch": 2.1210526315789475, + "grad_norm": 0.716153085231781, + "learning_rate": 3.631404410842142e-06, + "loss": 0.6016, + "step": 7657 + }, + { + "epoch": 2.1213296398891965, + "grad_norm": 0.7011458277702332, + "learning_rate": 3.6310795539770738e-06, + "loss": 0.5904, + "step": 7658 + }, + { + "epoch": 2.121606648199446, + "grad_norm": 0.7134310603141785, + "learning_rate": 3.630754673096299e-06, + "loss": 0.6095, + "step": 7659 + }, + { + "epoch": 2.1218836565096955, + "grad_norm": 0.6938473582267761, + "learning_rate": 3.6304297682067146e-06, + "loss": 0.5916, + "step": 7660 + }, + { + "epoch": 2.1221606648199445, + "grad_norm": 0.7490888833999634, + "learning_rate": 3.6301048393152196e-06, + "loss": 0.6128, + "step": 7661 + }, + { + "epoch": 2.122437673130194, + "grad_norm": 0.6940358877182007, + "learning_rate": 3.6297798864287122e-06, + "loss": 0.5638, + "step": 7662 + }, + { + "epoch": 2.1227146814404434, + "grad_norm": 0.7214860916137695, + "learning_rate": 3.6294549095540933e-06, + "loss": 0.636, + "step": 7663 + }, + { + "epoch": 2.1229916897506924, + "grad_norm": 0.7091872096061707, + "learning_rate": 3.6291299086982624e-06, + "loss": 0.612, + "step": 7664 + }, + { + "epoch": 2.123268698060942, + "grad_norm": 0.7438629269599915, + "learning_rate": 3.628804883868119e-06, + "loss": 0.6278, + "step": 7665 + }, + { + "epoch": 2.1235457063711913, + "grad_norm": 0.7003153562545776, + "learning_rate": 3.628479835070567e-06, + "loss": 0.567, + "step": 7666 + }, + { + "epoch": 2.1238227146814403, + "grad_norm": 0.7519192695617676, + "learning_rate": 3.6281547623125046e-06, + "loss": 0.6302, + "step": 7667 + }, + { + "epoch": 2.12409972299169, + "grad_norm": 0.753014087677002, + "learning_rate": 3.6278296656008366e-06, + "loss": 0.6041, + "step": 7668 + }, + { + "epoch": 2.1243767313019393, + "grad_norm": 0.7036569118499756, + "learning_rate": 3.6275045449424643e-06, + "loss": 0.5714, + "step": 7669 + }, + { + "epoch": 2.1246537396121883, + "grad_norm": 0.805932343006134, + "learning_rate": 3.6271794003442918e-06, + "loss": 0.6174, + "step": 7670 + }, + { + "epoch": 2.1249307479224377, + "grad_norm": 0.7357967495918274, + "learning_rate": 3.626854231813221e-06, + "loss": 0.6006, + "step": 7671 + }, + { + "epoch": 2.1252077562326868, + "grad_norm": 0.7303361296653748, + "learning_rate": 3.6265290393561586e-06, + "loss": 0.5773, + "step": 7672 + }, + { + "epoch": 2.1254847645429362, + "grad_norm": 0.7263323664665222, + "learning_rate": 3.6262038229800067e-06, + "loss": 0.6136, + "step": 7673 + }, + { + "epoch": 2.1257617728531857, + "grad_norm": 0.7003388404846191, + "learning_rate": 3.625878582691672e-06, + "loss": 0.6143, + "step": 7674 + }, + { + "epoch": 2.1260387811634347, + "grad_norm": 0.7387146949768066, + "learning_rate": 3.6255533184980595e-06, + "loss": 0.548, + "step": 7675 + }, + { + "epoch": 2.126315789473684, + "grad_norm": 0.7310213446617126, + "learning_rate": 3.6252280304060768e-06, + "loss": 0.6148, + "step": 7676 + }, + { + "epoch": 2.1265927977839336, + "grad_norm": 0.7344486713409424, + "learning_rate": 3.624902718422628e-06, + "loss": 0.6146, + "step": 7677 + }, + { + "epoch": 2.1268698060941826, + "grad_norm": 0.7056968808174133, + "learning_rate": 3.6245773825546227e-06, + "loss": 0.57, + "step": 7678 + }, + { + "epoch": 2.127146814404432, + "grad_norm": 0.7251293659210205, + "learning_rate": 3.6242520228089673e-06, + "loss": 0.5487, + "step": 7679 + }, + { + "epoch": 2.1274238227146816, + "grad_norm": 0.706130862236023, + "learning_rate": 3.6239266391925703e-06, + "loss": 0.5979, + "step": 7680 + }, + { + "epoch": 2.1277008310249306, + "grad_norm": 0.6742924451828003, + "learning_rate": 3.623601231712341e-06, + "loss": 0.5492, + "step": 7681 + }, + { + "epoch": 2.12797783933518, + "grad_norm": 0.7342013716697693, + "learning_rate": 3.623275800375188e-06, + "loss": 0.6248, + "step": 7682 + }, + { + "epoch": 2.1282548476454295, + "grad_norm": 0.7537729144096375, + "learning_rate": 3.6229503451880215e-06, + "loss": 0.5866, + "step": 7683 + }, + { + "epoch": 2.1285318559556785, + "grad_norm": 0.7483304142951965, + "learning_rate": 3.622624866157751e-06, + "loss": 0.6452, + "step": 7684 + }, + { + "epoch": 2.128808864265928, + "grad_norm": 0.7003677487373352, + "learning_rate": 3.6222993632912882e-06, + "loss": 0.6525, + "step": 7685 + }, + { + "epoch": 2.1290858725761774, + "grad_norm": 0.7310715317726135, + "learning_rate": 3.6219738365955435e-06, + "loss": 0.5519, + "step": 7686 + }, + { + "epoch": 2.1293628808864264, + "grad_norm": 0.7531241178512573, + "learning_rate": 3.621648286077429e-06, + "loss": 0.555, + "step": 7687 + }, + { + "epoch": 2.129639889196676, + "grad_norm": 0.7238945960998535, + "learning_rate": 3.621322711743858e-06, + "loss": 0.5715, + "step": 7688 + }, + { + "epoch": 2.1299168975069254, + "grad_norm": 0.7214088439941406, + "learning_rate": 3.6209971136017413e-06, + "loss": 0.5603, + "step": 7689 + }, + { + "epoch": 2.1301939058171744, + "grad_norm": 0.7460694313049316, + "learning_rate": 3.6206714916579925e-06, + "loss": 0.5563, + "step": 7690 + }, + { + "epoch": 2.130470914127424, + "grad_norm": 0.7241212725639343, + "learning_rate": 3.620345845919527e-06, + "loss": 0.5997, + "step": 7691 + }, + { + "epoch": 2.1307479224376733, + "grad_norm": 0.7266137003898621, + "learning_rate": 3.6200201763932583e-06, + "loss": 0.5617, + "step": 7692 + }, + { + "epoch": 2.1310249307479223, + "grad_norm": 0.7282004952430725, + "learning_rate": 3.6196944830861003e-06, + "loss": 0.5975, + "step": 7693 + }, + { + "epoch": 2.131301939058172, + "grad_norm": 0.7729506492614746, + "learning_rate": 3.61936876600497e-06, + "loss": 0.6256, + "step": 7694 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.721755862236023, + "learning_rate": 3.6190430251567817e-06, + "loss": 0.6396, + "step": 7695 + }, + { + "epoch": 2.1318559556786703, + "grad_norm": 0.7221630811691284, + "learning_rate": 3.6187172605484523e-06, + "loss": 0.6348, + "step": 7696 + }, + { + "epoch": 2.1321329639889197, + "grad_norm": 0.6905040144920349, + "learning_rate": 3.6183914721868986e-06, + "loss": 0.5733, + "step": 7697 + }, + { + "epoch": 2.132409972299169, + "grad_norm": 0.7066609859466553, + "learning_rate": 3.618065660079038e-06, + "loss": 0.5561, + "step": 7698 + }, + { + "epoch": 2.132686980609418, + "grad_norm": 0.7088309526443481, + "learning_rate": 3.6177398242317878e-06, + "loss": 0.5973, + "step": 7699 + }, + { + "epoch": 2.1329639889196677, + "grad_norm": 0.7220036387443542, + "learning_rate": 3.617413964652067e-06, + "loss": 0.5885, + "step": 7700 + }, + { + "epoch": 2.133240997229917, + "grad_norm": 0.7076956629753113, + "learning_rate": 3.6170880813467947e-06, + "loss": 0.5444, + "step": 7701 + }, + { + "epoch": 2.133518005540166, + "grad_norm": 0.7317376136779785, + "learning_rate": 3.6167621743228886e-06, + "loss": 0.6535, + "step": 7702 + }, + { + "epoch": 2.1337950138504156, + "grad_norm": 0.6807712316513062, + "learning_rate": 3.61643624358727e-06, + "loss": 0.5867, + "step": 7703 + }, + { + "epoch": 2.134072022160665, + "grad_norm": 0.7171390056610107, + "learning_rate": 3.6161102891468598e-06, + "loss": 0.5741, + "step": 7704 + }, + { + "epoch": 2.134349030470914, + "grad_norm": 0.7733952403068542, + "learning_rate": 3.6157843110085768e-06, + "loss": 0.5826, + "step": 7705 + }, + { + "epoch": 2.1346260387811635, + "grad_norm": 0.7412691712379456, + "learning_rate": 3.6154583091793434e-06, + "loss": 0.6361, + "step": 7706 + }, + { + "epoch": 2.1349030470914125, + "grad_norm": 0.6959484219551086, + "learning_rate": 3.615132283666083e-06, + "loss": 0.6095, + "step": 7707 + }, + { + "epoch": 2.135180055401662, + "grad_norm": 0.742755651473999, + "learning_rate": 3.6148062344757145e-06, + "loss": 0.6281, + "step": 7708 + }, + { + "epoch": 2.1354570637119115, + "grad_norm": 0.767560601234436, + "learning_rate": 3.6144801616151642e-06, + "loss": 0.6221, + "step": 7709 + }, + { + "epoch": 2.1357340720221605, + "grad_norm": 0.7019870281219482, + "learning_rate": 3.614154065091353e-06, + "loss": 0.5758, + "step": 7710 + }, + { + "epoch": 2.13601108033241, + "grad_norm": 0.7854132056236267, + "learning_rate": 3.613827944911207e-06, + "loss": 0.5995, + "step": 7711 + }, + { + "epoch": 2.1362880886426594, + "grad_norm": 0.7392581105232239, + "learning_rate": 3.613501801081648e-06, + "loss": 0.6336, + "step": 7712 + }, + { + "epoch": 2.1365650969529084, + "grad_norm": 0.7203420400619507, + "learning_rate": 3.613175633609603e-06, + "loss": 0.5868, + "step": 7713 + }, + { + "epoch": 2.136842105263158, + "grad_norm": 0.7442747950553894, + "learning_rate": 3.6128494425019966e-06, + "loss": 0.608, + "step": 7714 + }, + { + "epoch": 2.1371191135734073, + "grad_norm": 0.7600914239883423, + "learning_rate": 3.612523227765754e-06, + "loss": 0.6087, + "step": 7715 + }, + { + "epoch": 2.1373961218836564, + "grad_norm": 0.6972166895866394, + "learning_rate": 3.612196989407802e-06, + "loss": 0.6376, + "step": 7716 + }, + { + "epoch": 2.137673130193906, + "grad_norm": 0.7419806122779846, + "learning_rate": 3.6118707274350695e-06, + "loss": 0.6143, + "step": 7717 + }, + { + "epoch": 2.1379501385041553, + "grad_norm": 0.690531849861145, + "learning_rate": 3.6115444418544803e-06, + "loss": 0.5944, + "step": 7718 + }, + { + "epoch": 2.1382271468144043, + "grad_norm": 0.7530912160873413, + "learning_rate": 3.6112181326729647e-06, + "loss": 0.6317, + "step": 7719 + }, + { + "epoch": 2.1385041551246537, + "grad_norm": 0.7327772378921509, + "learning_rate": 3.61089179989745e-06, + "loss": 0.6371, + "step": 7720 + }, + { + "epoch": 2.138781163434903, + "grad_norm": 0.7340015172958374, + "learning_rate": 3.610565443534865e-06, + "loss": 0.5727, + "step": 7721 + }, + { + "epoch": 2.1390581717451522, + "grad_norm": 0.7161919474601746, + "learning_rate": 3.610239063592141e-06, + "loss": 0.5914, + "step": 7722 + }, + { + "epoch": 2.1393351800554017, + "grad_norm": 0.721517026424408, + "learning_rate": 3.6099126600762056e-06, + "loss": 0.6026, + "step": 7723 + }, + { + "epoch": 2.139612188365651, + "grad_norm": 0.7476277351379395, + "learning_rate": 3.60958623299399e-06, + "loss": 0.6114, + "step": 7724 + }, + { + "epoch": 2.1398891966759, + "grad_norm": 0.7581577897071838, + "learning_rate": 3.609259782352425e-06, + "loss": 0.5867, + "step": 7725 + }, + { + "epoch": 2.1401662049861496, + "grad_norm": 0.7624351978302002, + "learning_rate": 3.6089333081584434e-06, + "loss": 0.5996, + "step": 7726 + }, + { + "epoch": 2.140443213296399, + "grad_norm": 0.7194927334785461, + "learning_rate": 3.608606810418974e-06, + "loss": 0.5811, + "step": 7727 + }, + { + "epoch": 2.140720221606648, + "grad_norm": 0.752798318862915, + "learning_rate": 3.608280289140952e-06, + "loss": 0.6557, + "step": 7728 + }, + { + "epoch": 2.1409972299168976, + "grad_norm": 0.7455686330795288, + "learning_rate": 3.6079537443313084e-06, + "loss": 0.6307, + "step": 7729 + }, + { + "epoch": 2.141274238227147, + "grad_norm": 0.7468121647834778, + "learning_rate": 3.6076271759969785e-06, + "loss": 0.6192, + "step": 7730 + }, + { + "epoch": 2.141551246537396, + "grad_norm": 0.7016478180885315, + "learning_rate": 3.6073005841448936e-06, + "loss": 0.5576, + "step": 7731 + }, + { + "epoch": 2.1418282548476455, + "grad_norm": 0.6816582083702087, + "learning_rate": 3.6069739687819905e-06, + "loss": 0.5899, + "step": 7732 + }, + { + "epoch": 2.1421052631578945, + "grad_norm": 0.6748509407043457, + "learning_rate": 3.6066473299152025e-06, + "loss": 0.6112, + "step": 7733 + }, + { + "epoch": 2.142382271468144, + "grad_norm": 0.7195847034454346, + "learning_rate": 3.606320667551466e-06, + "loss": 0.5903, + "step": 7734 + }, + { + "epoch": 2.1426592797783934, + "grad_norm": 0.7213793396949768, + "learning_rate": 3.6059939816977167e-06, + "loss": 0.6142, + "step": 7735 + }, + { + "epoch": 2.1429362880886424, + "grad_norm": 0.7996098399162292, + "learning_rate": 3.605667272360891e-06, + "loss": 0.6043, + "step": 7736 + }, + { + "epoch": 2.143213296398892, + "grad_norm": 0.7544435858726501, + "learning_rate": 3.605340539547925e-06, + "loss": 0.6007, + "step": 7737 + }, + { + "epoch": 2.1434903047091414, + "grad_norm": 0.7508959770202637, + "learning_rate": 3.6050137832657565e-06, + "loss": 0.6343, + "step": 7738 + }, + { + "epoch": 2.1437673130193904, + "grad_norm": 0.7130975127220154, + "learning_rate": 3.6046870035213246e-06, + "loss": 0.6139, + "step": 7739 + }, + { + "epoch": 2.14404432132964, + "grad_norm": 0.7473478317260742, + "learning_rate": 3.6043602003215657e-06, + "loss": 0.6321, + "step": 7740 + }, + { + "epoch": 2.1443213296398893, + "grad_norm": 0.7028003334999084, + "learning_rate": 3.6040333736734195e-06, + "loss": 0.5854, + "step": 7741 + }, + { + "epoch": 2.1445983379501383, + "grad_norm": 0.7346315383911133, + "learning_rate": 3.6037065235838255e-06, + "loss": 0.596, + "step": 7742 + }, + { + "epoch": 2.144875346260388, + "grad_norm": 0.6991076469421387, + "learning_rate": 3.6033796500597236e-06, + "loss": 0.564, + "step": 7743 + }, + { + "epoch": 2.1451523545706372, + "grad_norm": 0.7503802180290222, + "learning_rate": 3.6030527531080533e-06, + "loss": 0.6233, + "step": 7744 + }, + { + "epoch": 2.1454293628808863, + "grad_norm": 0.718914270401001, + "learning_rate": 3.6027258327357573e-06, + "loss": 0.6037, + "step": 7745 + }, + { + "epoch": 2.1457063711911357, + "grad_norm": 0.7305037975311279, + "learning_rate": 3.6023988889497753e-06, + "loss": 0.5828, + "step": 7746 + }, + { + "epoch": 2.145983379501385, + "grad_norm": 0.7057591080665588, + "learning_rate": 3.602071921757049e-06, + "loss": 0.6097, + "step": 7747 + }, + { + "epoch": 2.146260387811634, + "grad_norm": 0.6981334090232849, + "learning_rate": 3.601744931164522e-06, + "loss": 0.5359, + "step": 7748 + }, + { + "epoch": 2.1465373961218837, + "grad_norm": 0.7055646777153015, + "learning_rate": 3.6014179171791365e-06, + "loss": 0.5552, + "step": 7749 + }, + { + "epoch": 2.146814404432133, + "grad_norm": 0.7171012163162231, + "learning_rate": 3.601090879807836e-06, + "loss": 0.619, + "step": 7750 + }, + { + "epoch": 2.147091412742382, + "grad_norm": 0.7078838348388672, + "learning_rate": 3.6007638190575644e-06, + "loss": 0.5707, + "step": 7751 + }, + { + "epoch": 2.1473684210526316, + "grad_norm": 0.731663167476654, + "learning_rate": 3.6004367349352658e-06, + "loss": 0.6335, + "step": 7752 + }, + { + "epoch": 2.147645429362881, + "grad_norm": 0.6805864572525024, + "learning_rate": 3.6001096274478847e-06, + "loss": 0.5926, + "step": 7753 + }, + { + "epoch": 2.14792243767313, + "grad_norm": 0.7225302457809448, + "learning_rate": 3.599782496602367e-06, + "loss": 0.6214, + "step": 7754 + }, + { + "epoch": 2.1481994459833795, + "grad_norm": 0.7431020736694336, + "learning_rate": 3.599455342405659e-06, + "loss": 0.598, + "step": 7755 + }, + { + "epoch": 2.148476454293629, + "grad_norm": 0.7166742086410522, + "learning_rate": 3.599128164864706e-06, + "loss": 0.6303, + "step": 7756 + }, + { + "epoch": 2.148753462603878, + "grad_norm": 0.7254209518432617, + "learning_rate": 3.598800963986454e-06, + "loss": 0.556, + "step": 7757 + }, + { + "epoch": 2.1490304709141275, + "grad_norm": 0.7554706931114197, + "learning_rate": 3.5984737397778537e-06, + "loss": 0.608, + "step": 7758 + }, + { + "epoch": 2.149307479224377, + "grad_norm": 0.7507380247116089, + "learning_rate": 3.5981464922458488e-06, + "loss": 0.5717, + "step": 7759 + }, + { + "epoch": 2.149584487534626, + "grad_norm": 0.7267554402351379, + "learning_rate": 3.5978192213973904e-06, + "loss": 0.6567, + "step": 7760 + }, + { + "epoch": 2.1498614958448754, + "grad_norm": 0.692969560623169, + "learning_rate": 3.5974919272394265e-06, + "loss": 0.6093, + "step": 7761 + }, + { + "epoch": 2.150138504155125, + "grad_norm": 0.7091807126998901, + "learning_rate": 3.597164609778906e-06, + "loss": 0.5593, + "step": 7762 + }, + { + "epoch": 2.150415512465374, + "grad_norm": 0.7118282914161682, + "learning_rate": 3.596837269022779e-06, + "loss": 0.6712, + "step": 7763 + }, + { + "epoch": 2.1506925207756233, + "grad_norm": 0.7038248777389526, + "learning_rate": 3.5965099049779954e-06, + "loss": 0.5974, + "step": 7764 + }, + { + "epoch": 2.150969529085873, + "grad_norm": 0.7268661260604858, + "learning_rate": 3.596182517651507e-06, + "loss": 0.6084, + "step": 7765 + }, + { + "epoch": 2.151246537396122, + "grad_norm": 0.7075693607330322, + "learning_rate": 3.595855107050264e-06, + "loss": 0.6143, + "step": 7766 + }, + { + "epoch": 2.1515235457063713, + "grad_norm": 0.7023481130599976, + "learning_rate": 3.595527673181219e-06, + "loss": 0.5946, + "step": 7767 + }, + { + "epoch": 2.1518005540166203, + "grad_norm": 0.7299481630325317, + "learning_rate": 3.5952002160513233e-06, + "loss": 0.6691, + "step": 7768 + }, + { + "epoch": 2.1520775623268698, + "grad_norm": 0.697287380695343, + "learning_rate": 3.5948727356675305e-06, + "loss": 0.609, + "step": 7769 + }, + { + "epoch": 2.152354570637119, + "grad_norm": 0.7602940201759338, + "learning_rate": 3.594545232036793e-06, + "loss": 0.643, + "step": 7770 + }, + { + "epoch": 2.1526315789473682, + "grad_norm": 0.6859704852104187, + "learning_rate": 3.5942177051660663e-06, + "loss": 0.6061, + "step": 7771 + }, + { + "epoch": 2.1529085872576177, + "grad_norm": 0.7591689825057983, + "learning_rate": 3.5938901550623022e-06, + "loss": 0.629, + "step": 7772 + }, + { + "epoch": 2.153185595567867, + "grad_norm": 0.7486093044281006, + "learning_rate": 3.5935625817324572e-06, + "loss": 0.6165, + "step": 7773 + }, + { + "epoch": 2.153462603878116, + "grad_norm": 0.7580902576446533, + "learning_rate": 3.593234985183486e-06, + "loss": 0.6549, + "step": 7774 + }, + { + "epoch": 2.1537396121883656, + "grad_norm": 0.717231273651123, + "learning_rate": 3.592907365422344e-06, + "loss": 0.5984, + "step": 7775 + }, + { + "epoch": 2.154016620498615, + "grad_norm": 0.7410253286361694, + "learning_rate": 3.592579722455987e-06, + "loss": 0.5982, + "step": 7776 + }, + { + "epoch": 2.154293628808864, + "grad_norm": 0.7209257483482361, + "learning_rate": 3.592252056291373e-06, + "loss": 0.5881, + "step": 7777 + }, + { + "epoch": 2.1545706371191136, + "grad_norm": 0.7834051251411438, + "learning_rate": 3.5919243669354585e-06, + "loss": 0.6453, + "step": 7778 + }, + { + "epoch": 2.154847645429363, + "grad_norm": 0.7775634527206421, + "learning_rate": 3.5915966543952006e-06, + "loss": 0.6243, + "step": 7779 + }, + { + "epoch": 2.155124653739612, + "grad_norm": 0.7410075068473816, + "learning_rate": 3.5912689186775594e-06, + "loss": 0.6137, + "step": 7780 + }, + { + "epoch": 2.1554016620498615, + "grad_norm": 0.7614990472793579, + "learning_rate": 3.5909411597894916e-06, + "loss": 0.5606, + "step": 7781 + }, + { + "epoch": 2.155678670360111, + "grad_norm": 0.7304420471191406, + "learning_rate": 3.5906133777379572e-06, + "loss": 0.5241, + "step": 7782 + }, + { + "epoch": 2.15595567867036, + "grad_norm": 0.7032648921012878, + "learning_rate": 3.5902855725299156e-06, + "loss": 0.625, + "step": 7783 + }, + { + "epoch": 2.1562326869806094, + "grad_norm": 0.7103816866874695, + "learning_rate": 3.589957744172327e-06, + "loss": 0.5811, + "step": 7784 + }, + { + "epoch": 2.156509695290859, + "grad_norm": 0.7315899133682251, + "learning_rate": 3.589629892672152e-06, + "loss": 0.6132, + "step": 7785 + }, + { + "epoch": 2.156786703601108, + "grad_norm": 0.7375378608703613, + "learning_rate": 3.5893020180363513e-06, + "loss": 0.6179, + "step": 7786 + }, + { + "epoch": 2.1570637119113574, + "grad_norm": 0.7213611602783203, + "learning_rate": 3.5889741202718874e-06, + "loss": 0.571, + "step": 7787 + }, + { + "epoch": 2.157340720221607, + "grad_norm": 0.7085563540458679, + "learning_rate": 3.5886461993857225e-06, + "loss": 0.6045, + "step": 7788 + }, + { + "epoch": 2.157617728531856, + "grad_norm": 0.7195646166801453, + "learning_rate": 3.588318255384817e-06, + "loss": 0.5944, + "step": 7789 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.7459489107131958, + "learning_rate": 3.5879902882761375e-06, + "loss": 0.5727, + "step": 7790 + }, + { + "epoch": 2.1581717451523548, + "grad_norm": 0.7280673980712891, + "learning_rate": 3.587662298066645e-06, + "loss": 0.5796, + "step": 7791 + }, + { + "epoch": 2.158448753462604, + "grad_norm": 0.7652393579483032, + "learning_rate": 3.587334284763304e-06, + "loss": 0.5919, + "step": 7792 + }, + { + "epoch": 2.1587257617728532, + "grad_norm": 0.6975575685501099, + "learning_rate": 3.5870062483730807e-06, + "loss": 0.5909, + "step": 7793 + }, + { + "epoch": 2.1590027700831023, + "grad_norm": 0.7558895945549011, + "learning_rate": 3.586678188902937e-06, + "loss": 0.5739, + "step": 7794 + }, + { + "epoch": 2.1592797783933517, + "grad_norm": 0.6866469383239746, + "learning_rate": 3.5863501063598418e-06, + "loss": 0.554, + "step": 7795 + }, + { + "epoch": 2.159556786703601, + "grad_norm": 0.7125341892242432, + "learning_rate": 3.586022000750759e-06, + "loss": 0.6129, + "step": 7796 + }, + { + "epoch": 2.15983379501385, + "grad_norm": 0.735459566116333, + "learning_rate": 3.585693872082656e-06, + "loss": 0.5721, + "step": 7797 + }, + { + "epoch": 2.1601108033240997, + "grad_norm": 0.7106569409370422, + "learning_rate": 3.5853657203624985e-06, + "loss": 0.6055, + "step": 7798 + }, + { + "epoch": 2.160387811634349, + "grad_norm": 0.7532612085342407, + "learning_rate": 3.585037545597256e-06, + "loss": 0.5598, + "step": 7799 + }, + { + "epoch": 2.160664819944598, + "grad_norm": 0.7423479557037354, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.6258, + "step": 7800 + }, + { + "epoch": 2.1609418282548476, + "grad_norm": 0.7185900807380676, + "learning_rate": 3.5843811269593854e-06, + "loss": 0.5885, + "step": 7801 + }, + { + "epoch": 2.161218836565097, + "grad_norm": 0.7055425047874451, + "learning_rate": 3.584052883100695e-06, + "loss": 0.6391, + "step": 7802 + }, + { + "epoch": 2.161495844875346, + "grad_norm": 0.7540387511253357, + "learning_rate": 3.5837246162247937e-06, + "loss": 0.6234, + "step": 7803 + }, + { + "epoch": 2.1617728531855955, + "grad_norm": 0.6934828758239746, + "learning_rate": 3.58339632633865e-06, + "loss": 0.5873, + "step": 7804 + }, + { + "epoch": 2.162049861495845, + "grad_norm": 0.7087048292160034, + "learning_rate": 3.583068013449237e-06, + "loss": 0.6268, + "step": 7805 + }, + { + "epoch": 2.162326869806094, + "grad_norm": 0.7429408431053162, + "learning_rate": 3.5827396775635232e-06, + "loss": 0.6136, + "step": 7806 + }, + { + "epoch": 2.1626038781163435, + "grad_norm": 0.7649708986282349, + "learning_rate": 3.582411318688481e-06, + "loss": 0.6156, + "step": 7807 + }, + { + "epoch": 2.162880886426593, + "grad_norm": 0.724372386932373, + "learning_rate": 3.582082936831083e-06, + "loss": 0.5765, + "step": 7808 + }, + { + "epoch": 2.163157894736842, + "grad_norm": 0.7201005220413208, + "learning_rate": 3.581754531998301e-06, + "loss": 0.591, + "step": 7809 + }, + { + "epoch": 2.1634349030470914, + "grad_norm": 0.6978209018707275, + "learning_rate": 3.5814261041971075e-06, + "loss": 0.5794, + "step": 7810 + }, + { + "epoch": 2.163711911357341, + "grad_norm": 0.727821409702301, + "learning_rate": 3.5810976534344756e-06, + "loss": 0.612, + "step": 7811 + }, + { + "epoch": 2.16398891966759, + "grad_norm": 0.6700218319892883, + "learning_rate": 3.580769179717381e-06, + "loss": 0.567, + "step": 7812 + }, + { + "epoch": 2.1642659279778393, + "grad_norm": 0.6753674745559692, + "learning_rate": 3.5804406830527955e-06, + "loss": 0.5484, + "step": 7813 + }, + { + "epoch": 2.164542936288089, + "grad_norm": 0.6982012391090393, + "learning_rate": 3.5801121634476955e-06, + "loss": 0.5868, + "step": 7814 + }, + { + "epoch": 2.164819944598338, + "grad_norm": 0.7340361475944519, + "learning_rate": 3.5797836209090564e-06, + "loss": 0.5949, + "step": 7815 + }, + { + "epoch": 2.1650969529085873, + "grad_norm": 0.7365831732749939, + "learning_rate": 3.5794550554438532e-06, + "loss": 0.5934, + "step": 7816 + }, + { + "epoch": 2.1653739612188367, + "grad_norm": 0.7187327146530151, + "learning_rate": 3.579126467059062e-06, + "loss": 0.5934, + "step": 7817 + }, + { + "epoch": 2.1656509695290858, + "grad_norm": 0.7340055108070374, + "learning_rate": 3.578797855761661e-06, + "loss": 0.6309, + "step": 7818 + }, + { + "epoch": 2.165927977839335, + "grad_norm": 0.7367265224456787, + "learning_rate": 3.5784692215586265e-06, + "loss": 0.6118, + "step": 7819 + }, + { + "epoch": 2.1662049861495847, + "grad_norm": 0.7496496438980103, + "learning_rate": 3.5781405644569356e-06, + "loss": 0.5916, + "step": 7820 + }, + { + "epoch": 2.1664819944598337, + "grad_norm": 0.7447252869606018, + "learning_rate": 3.5778118844635678e-06, + "loss": 0.5563, + "step": 7821 + }, + { + "epoch": 2.166759002770083, + "grad_norm": 0.7554126381874084, + "learning_rate": 3.5774831815855017e-06, + "loss": 0.5793, + "step": 7822 + }, + { + "epoch": 2.1670360110803326, + "grad_norm": 0.7430973052978516, + "learning_rate": 3.5771544558297157e-06, + "loss": 0.5782, + "step": 7823 + }, + { + "epoch": 2.1673130193905816, + "grad_norm": 0.7045478820800781, + "learning_rate": 3.5768257072031888e-06, + "loss": 0.618, + "step": 7824 + }, + { + "epoch": 2.167590027700831, + "grad_norm": 0.7048739194869995, + "learning_rate": 3.5764969357129036e-06, + "loss": 0.6288, + "step": 7825 + }, + { + "epoch": 2.1678670360110806, + "grad_norm": 0.7383768558502197, + "learning_rate": 3.5761681413658387e-06, + "loss": 0.5781, + "step": 7826 + }, + { + "epoch": 2.1681440443213296, + "grad_norm": 0.7174773216247559, + "learning_rate": 3.575839324168976e-06, + "loss": 0.6336, + "step": 7827 + }, + { + "epoch": 2.168421052631579, + "grad_norm": 0.713789701461792, + "learning_rate": 3.5755104841292977e-06, + "loss": 0.607, + "step": 7828 + }, + { + "epoch": 2.1686980609418285, + "grad_norm": 0.7295719981193542, + "learning_rate": 3.575181621253785e-06, + "loss": 0.6057, + "step": 7829 + }, + { + "epoch": 2.1689750692520775, + "grad_norm": 0.7278553247451782, + "learning_rate": 3.57485273554942e-06, + "loss": 0.5967, + "step": 7830 + }, + { + "epoch": 2.169252077562327, + "grad_norm": 0.7679740786552429, + "learning_rate": 3.5745238270231875e-06, + "loss": 0.6474, + "step": 7831 + }, + { + "epoch": 2.169529085872576, + "grad_norm": 0.6951683759689331, + "learning_rate": 3.5741948956820693e-06, + "loss": 0.5941, + "step": 7832 + }, + { + "epoch": 2.1698060941828254, + "grad_norm": 0.76551353931427, + "learning_rate": 3.57386594153305e-06, + "loss": 0.5924, + "step": 7833 + }, + { + "epoch": 2.170083102493075, + "grad_norm": 0.7162626385688782, + "learning_rate": 3.5735369645831157e-06, + "loss": 0.6041, + "step": 7834 + }, + { + "epoch": 2.170360110803324, + "grad_norm": 0.6925835013389587, + "learning_rate": 3.573207964839249e-06, + "loss": 0.6008, + "step": 7835 + }, + { + "epoch": 2.1706371191135734, + "grad_norm": 0.7028840184211731, + "learning_rate": 3.5728789423084375e-06, + "loss": 0.6002, + "step": 7836 + }, + { + "epoch": 2.170914127423823, + "grad_norm": 0.7359599471092224, + "learning_rate": 3.5725498969976657e-06, + "loss": 0.5797, + "step": 7837 + }, + { + "epoch": 2.171191135734072, + "grad_norm": 0.724312961101532, + "learning_rate": 3.57222082891392e-06, + "loss": 0.6589, + "step": 7838 + }, + { + "epoch": 2.1714681440443213, + "grad_norm": 0.7298066020011902, + "learning_rate": 3.571891738064189e-06, + "loss": 0.5953, + "step": 7839 + }, + { + "epoch": 2.1717451523545708, + "grad_norm": 0.6635059118270874, + "learning_rate": 3.5715626244554587e-06, + "loss": 0.5261, + "step": 7840 + }, + { + "epoch": 2.17202216066482, + "grad_norm": 0.7057240009307861, + "learning_rate": 3.5712334880947175e-06, + "loss": 0.5618, + "step": 7841 + }, + { + "epoch": 2.1722991689750693, + "grad_norm": 0.819150984287262, + "learning_rate": 3.5709043289889538e-06, + "loss": 0.625, + "step": 7842 + }, + { + "epoch": 2.1725761772853187, + "grad_norm": 0.7225316762924194, + "learning_rate": 3.5705751471451555e-06, + "loss": 0.6125, + "step": 7843 + }, + { + "epoch": 2.1728531855955677, + "grad_norm": 0.7002236843109131, + "learning_rate": 3.5702459425703146e-06, + "loss": 0.5824, + "step": 7844 + }, + { + "epoch": 2.173130193905817, + "grad_norm": 0.6795487999916077, + "learning_rate": 3.5699167152714174e-06, + "loss": 0.56, + "step": 7845 + }, + { + "epoch": 2.1734072022160666, + "grad_norm": 0.6594259142875671, + "learning_rate": 3.5695874652554573e-06, + "loss": 0.5474, + "step": 7846 + }, + { + "epoch": 2.1736842105263157, + "grad_norm": 0.7530267834663391, + "learning_rate": 3.5692581925294234e-06, + "loss": 0.5675, + "step": 7847 + }, + { + "epoch": 2.173961218836565, + "grad_norm": 0.7334387898445129, + "learning_rate": 3.5689288971003073e-06, + "loss": 0.6163, + "step": 7848 + }, + { + "epoch": 2.1742382271468146, + "grad_norm": 0.7736994624137878, + "learning_rate": 3.568599578975101e-06, + "loss": 0.6707, + "step": 7849 + }, + { + "epoch": 2.1745152354570636, + "grad_norm": 0.7506792545318604, + "learning_rate": 3.568270238160797e-06, + "loss": 0.5819, + "step": 7850 + }, + { + "epoch": 2.174792243767313, + "grad_norm": 0.7608615159988403, + "learning_rate": 3.567940874664388e-06, + "loss": 0.5939, + "step": 7851 + }, + { + "epoch": 2.1750692520775625, + "grad_norm": 0.7625887989997864, + "learning_rate": 3.5676114884928666e-06, + "loss": 0.571, + "step": 7852 + }, + { + "epoch": 2.1753462603878115, + "grad_norm": 0.7180339694023132, + "learning_rate": 3.567282079653227e-06, + "loss": 0.5816, + "step": 7853 + }, + { + "epoch": 2.175623268698061, + "grad_norm": 0.723455548286438, + "learning_rate": 3.566952648152463e-06, + "loss": 0.619, + "step": 7854 + }, + { + "epoch": 2.1759002770083105, + "grad_norm": 0.7430705428123474, + "learning_rate": 3.5666231939975702e-06, + "loss": 0.6143, + "step": 7855 + }, + { + "epoch": 2.1761772853185595, + "grad_norm": 0.7321606278419495, + "learning_rate": 3.566293717195543e-06, + "loss": 0.593, + "step": 7856 + }, + { + "epoch": 2.176454293628809, + "grad_norm": 0.7337496280670166, + "learning_rate": 3.5659642177533773e-06, + "loss": 0.5753, + "step": 7857 + }, + { + "epoch": 2.176731301939058, + "grad_norm": 0.7165916562080383, + "learning_rate": 3.5656346956780685e-06, + "loss": 0.6012, + "step": 7858 + }, + { + "epoch": 2.1770083102493074, + "grad_norm": 0.7456101179122925, + "learning_rate": 3.565305150976614e-06, + "loss": 0.6037, + "step": 7859 + }, + { + "epoch": 2.177285318559557, + "grad_norm": 0.7195954322814941, + "learning_rate": 3.5649755836560107e-06, + "loss": 0.6076, + "step": 7860 + }, + { + "epoch": 2.177562326869806, + "grad_norm": 0.7319379448890686, + "learning_rate": 3.5646459937232556e-06, + "loss": 0.5677, + "step": 7861 + }, + { + "epoch": 2.1778393351800553, + "grad_norm": 0.7634261846542358, + "learning_rate": 3.564316381185348e-06, + "loss": 0.5846, + "step": 7862 + }, + { + "epoch": 2.178116343490305, + "grad_norm": 0.7687897086143494, + "learning_rate": 3.563986746049286e-06, + "loss": 0.6219, + "step": 7863 + }, + { + "epoch": 2.178393351800554, + "grad_norm": 0.6961368918418884, + "learning_rate": 3.563657088322068e-06, + "loss": 0.6413, + "step": 7864 + }, + { + "epoch": 2.1786703601108033, + "grad_norm": 0.7764820456504822, + "learning_rate": 3.5633274080106927e-06, + "loss": 0.62, + "step": 7865 + }, + { + "epoch": 2.1789473684210527, + "grad_norm": 0.6963434219360352, + "learning_rate": 3.562997705122162e-06, + "loss": 0.572, + "step": 7866 + }, + { + "epoch": 2.1792243767313018, + "grad_norm": 0.7819361090660095, + "learning_rate": 3.562667979663475e-06, + "loss": 0.6084, + "step": 7867 + }, + { + "epoch": 2.179501385041551, + "grad_norm": 0.6757632493972778, + "learning_rate": 3.5623382316416334e-06, + "loss": 0.5393, + "step": 7868 + }, + { + "epoch": 2.1797783933518007, + "grad_norm": 0.690197765827179, + "learning_rate": 3.5620084610636385e-06, + "loss": 0.5816, + "step": 7869 + }, + { + "epoch": 2.1800554016620497, + "grad_norm": 0.7298179864883423, + "learning_rate": 3.561678667936491e-06, + "loss": 0.6169, + "step": 7870 + }, + { + "epoch": 2.180332409972299, + "grad_norm": 0.6915467381477356, + "learning_rate": 3.5613488522671947e-06, + "loss": 0.6357, + "step": 7871 + }, + { + "epoch": 2.1806094182825486, + "grad_norm": 0.7635214328765869, + "learning_rate": 3.561019014062752e-06, + "loss": 0.6146, + "step": 7872 + }, + { + "epoch": 2.1808864265927976, + "grad_norm": 0.737816572189331, + "learning_rate": 3.560689153330166e-06, + "loss": 0.6044, + "step": 7873 + }, + { + "epoch": 2.181163434903047, + "grad_norm": 0.6996750831604004, + "learning_rate": 3.5603592700764406e-06, + "loss": 0.5811, + "step": 7874 + }, + { + "epoch": 2.1814404432132966, + "grad_norm": 0.7186986804008484, + "learning_rate": 3.560029364308579e-06, + "loss": 0.6217, + "step": 7875 + }, + { + "epoch": 2.1817174515235456, + "grad_norm": 0.740057110786438, + "learning_rate": 3.5596994360335883e-06, + "loss": 0.5861, + "step": 7876 + }, + { + "epoch": 2.181994459833795, + "grad_norm": 0.7343260049819946, + "learning_rate": 3.559369485258472e-06, + "loss": 0.5851, + "step": 7877 + }, + { + "epoch": 2.1822714681440445, + "grad_norm": 0.7151075005531311, + "learning_rate": 3.559039511990236e-06, + "loss": 0.563, + "step": 7878 + }, + { + "epoch": 2.1825484764542935, + "grad_norm": 0.7891567945480347, + "learning_rate": 3.5587095162358874e-06, + "loss": 0.6416, + "step": 7879 + }, + { + "epoch": 2.182825484764543, + "grad_norm": 0.7027353644371033, + "learning_rate": 3.558379498002431e-06, + "loss": 0.6098, + "step": 7880 + }, + { + "epoch": 2.1831024930747924, + "grad_norm": 0.7250735759735107, + "learning_rate": 3.558049457296876e-06, + "loss": 0.5759, + "step": 7881 + }, + { + "epoch": 2.1833795013850414, + "grad_norm": 0.6830952763557434, + "learning_rate": 3.5577193941262288e-06, + "loss": 0.5817, + "step": 7882 + }, + { + "epoch": 2.183656509695291, + "grad_norm": 0.7518918514251709, + "learning_rate": 3.5573893084974974e-06, + "loss": 0.5948, + "step": 7883 + }, + { + "epoch": 2.1839335180055404, + "grad_norm": 0.6851316094398499, + "learning_rate": 3.5570592004176906e-06, + "loss": 0.6175, + "step": 7884 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.7158825397491455, + "learning_rate": 3.5567290698938185e-06, + "loss": 0.6003, + "step": 7885 + }, + { + "epoch": 2.184487534626039, + "grad_norm": 0.7178775072097778, + "learning_rate": 3.556398916932889e-06, + "loss": 0.5882, + "step": 7886 + }, + { + "epoch": 2.1847645429362883, + "grad_norm": 0.7214950323104858, + "learning_rate": 3.5560687415419128e-06, + "loss": 0.6367, + "step": 7887 + }, + { + "epoch": 2.1850415512465373, + "grad_norm": 0.73500657081604, + "learning_rate": 3.5557385437279e-06, + "loss": 0.5479, + "step": 7888 + }, + { + "epoch": 2.1853185595567868, + "grad_norm": 0.7137157320976257, + "learning_rate": 3.555408323497862e-06, + "loss": 0.5845, + "step": 7889 + }, + { + "epoch": 2.1855955678670362, + "grad_norm": 0.65468829870224, + "learning_rate": 3.5550780808588103e-06, + "loss": 0.5309, + "step": 7890 + }, + { + "epoch": 2.1858725761772853, + "grad_norm": 0.71401047706604, + "learning_rate": 3.554747815817756e-06, + "loss": 0.5621, + "step": 7891 + }, + { + "epoch": 2.1861495844875347, + "grad_norm": 0.753894567489624, + "learning_rate": 3.5544175283817127e-06, + "loss": 0.6106, + "step": 7892 + }, + { + "epoch": 2.1864265927977837, + "grad_norm": 0.7078069448471069, + "learning_rate": 3.5540872185576924e-06, + "loss": 0.577, + "step": 7893 + }, + { + "epoch": 2.186703601108033, + "grad_norm": 0.7022084593772888, + "learning_rate": 3.553756886352708e-06, + "loss": 0.5592, + "step": 7894 + }, + { + "epoch": 2.1869806094182827, + "grad_norm": 0.7479102611541748, + "learning_rate": 3.5534265317737755e-06, + "loss": 0.6316, + "step": 7895 + }, + { + "epoch": 2.1872576177285317, + "grad_norm": 0.7682097554206848, + "learning_rate": 3.5530961548279062e-06, + "loss": 0.6092, + "step": 7896 + }, + { + "epoch": 2.187534626038781, + "grad_norm": 0.7278096675872803, + "learning_rate": 3.5527657555221163e-06, + "loss": 0.5541, + "step": 7897 + }, + { + "epoch": 2.1878116343490306, + "grad_norm": 0.7436990141868591, + "learning_rate": 3.552435333863422e-06, + "loss": 0.5878, + "step": 7898 + }, + { + "epoch": 2.1880886426592796, + "grad_norm": 0.7513325810432434, + "learning_rate": 3.552104889858837e-06, + "loss": 0.6403, + "step": 7899 + }, + { + "epoch": 2.188365650969529, + "grad_norm": 0.7407289147377014, + "learning_rate": 3.5517744235153786e-06, + "loss": 0.5961, + "step": 7900 + }, + { + "epoch": 2.1886426592797785, + "grad_norm": 0.7015832662582397, + "learning_rate": 3.551443934840063e-06, + "loss": 0.6063, + "step": 7901 + }, + { + "epoch": 2.1889196675900275, + "grad_norm": 0.7211142778396606, + "learning_rate": 3.551113423839908e-06, + "loss": 0.5823, + "step": 7902 + }, + { + "epoch": 2.189196675900277, + "grad_norm": 0.7457058429718018, + "learning_rate": 3.5507828905219298e-06, + "loss": 0.6282, + "step": 7903 + }, + { + "epoch": 2.1894736842105265, + "grad_norm": 0.7473145723342896, + "learning_rate": 3.5504523348931486e-06, + "loss": 0.5861, + "step": 7904 + }, + { + "epoch": 2.1897506925207755, + "grad_norm": 0.7545790672302246, + "learning_rate": 3.550121756960581e-06, + "loss": 0.5869, + "step": 7905 + }, + { + "epoch": 2.190027700831025, + "grad_norm": 0.6933561563491821, + "learning_rate": 3.549791156731247e-06, + "loss": 0.5913, + "step": 7906 + }, + { + "epoch": 2.1903047091412744, + "grad_norm": 0.7041760087013245, + "learning_rate": 3.5494605342121663e-06, + "loss": 0.5895, + "step": 7907 + }, + { + "epoch": 2.1905817174515234, + "grad_norm": 0.7071885466575623, + "learning_rate": 3.549129889410358e-06, + "loss": 0.581, + "step": 7908 + }, + { + "epoch": 2.190858725761773, + "grad_norm": 0.7134894728660583, + "learning_rate": 3.5487992223328434e-06, + "loss": 0.6123, + "step": 7909 + }, + { + "epoch": 2.1911357340720223, + "grad_norm": 0.7097256779670715, + "learning_rate": 3.5484685329866424e-06, + "loss": 0.6277, + "step": 7910 + }, + { + "epoch": 2.1914127423822714, + "grad_norm": 0.7199460864067078, + "learning_rate": 3.5481378213787777e-06, + "loss": 0.6123, + "step": 7911 + }, + { + "epoch": 2.191689750692521, + "grad_norm": 0.724058985710144, + "learning_rate": 3.5478070875162697e-06, + "loss": 0.6141, + "step": 7912 + }, + { + "epoch": 2.1919667590027703, + "grad_norm": 0.7432721853256226, + "learning_rate": 3.547476331406141e-06, + "loss": 0.5756, + "step": 7913 + }, + { + "epoch": 2.1922437673130193, + "grad_norm": 0.7083652019500732, + "learning_rate": 3.5471455530554157e-06, + "loss": 0.5764, + "step": 7914 + }, + { + "epoch": 2.1925207756232687, + "grad_norm": 0.7218777537345886, + "learning_rate": 3.546814752471116e-06, + "loss": 0.6246, + "step": 7915 + }, + { + "epoch": 2.192797783933518, + "grad_norm": 0.7310037016868591, + "learning_rate": 3.5464839296602655e-06, + "loss": 0.6346, + "step": 7916 + }, + { + "epoch": 2.1930747922437672, + "grad_norm": 0.7280022501945496, + "learning_rate": 3.5461530846298894e-06, + "loss": 0.6128, + "step": 7917 + }, + { + "epoch": 2.1933518005540167, + "grad_norm": 0.7664794325828552, + "learning_rate": 3.545822217387011e-06, + "loss": 0.6016, + "step": 7918 + }, + { + "epoch": 2.1936288088642657, + "grad_norm": 0.7563583254814148, + "learning_rate": 3.545491327938656e-06, + "loss": 0.6326, + "step": 7919 + }, + { + "epoch": 2.193905817174515, + "grad_norm": 0.7406309843063354, + "learning_rate": 3.545160416291852e-06, + "loss": 0.612, + "step": 7920 + }, + { + "epoch": 2.1941828254847646, + "grad_norm": 0.7212445735931396, + "learning_rate": 3.5448294824536212e-06, + "loss": 0.6482, + "step": 7921 + }, + { + "epoch": 2.1944598337950136, + "grad_norm": 0.7566403150558472, + "learning_rate": 3.5444985264309933e-06, + "loss": 0.6009, + "step": 7922 + }, + { + "epoch": 2.194736842105263, + "grad_norm": 0.6928979158401489, + "learning_rate": 3.5441675482309947e-06, + "loss": 0.5822, + "step": 7923 + }, + { + "epoch": 2.1950138504155126, + "grad_norm": 0.7452521324157715, + "learning_rate": 3.543836547860652e-06, + "loss": 0.6114, + "step": 7924 + }, + { + "epoch": 2.1952908587257616, + "grad_norm": 0.7450066804885864, + "learning_rate": 3.543505525326993e-06, + "loss": 0.6043, + "step": 7925 + }, + { + "epoch": 2.195567867036011, + "grad_norm": 0.7499051690101624, + "learning_rate": 3.5431744806370475e-06, + "loss": 0.6264, + "step": 7926 + }, + { + "epoch": 2.1958448753462605, + "grad_norm": 0.7499408721923828, + "learning_rate": 3.542843413797844e-06, + "loss": 0.6628, + "step": 7927 + }, + { + "epoch": 2.1961218836565095, + "grad_norm": 0.7410886287689209, + "learning_rate": 3.5425123248164116e-06, + "loss": 0.6215, + "step": 7928 + }, + { + "epoch": 2.196398891966759, + "grad_norm": 0.6964012980461121, + "learning_rate": 3.54218121369978e-06, + "loss": 0.6098, + "step": 7929 + }, + { + "epoch": 2.1966759002770084, + "grad_norm": 0.7218595743179321, + "learning_rate": 3.5418500804549805e-06, + "loss": 0.6061, + "step": 7930 + }, + { + "epoch": 2.1969529085872574, + "grad_norm": 0.750812292098999, + "learning_rate": 3.5415189250890425e-06, + "loss": 0.6302, + "step": 7931 + }, + { + "epoch": 2.197229916897507, + "grad_norm": 0.7358490228652954, + "learning_rate": 3.541187747608998e-06, + "loss": 0.6109, + "step": 7932 + }, + { + "epoch": 2.1975069252077564, + "grad_norm": 0.7339686155319214, + "learning_rate": 3.5408565480218795e-06, + "loss": 0.6392, + "step": 7933 + }, + { + "epoch": 2.1977839335180054, + "grad_norm": 0.7436063885688782, + "learning_rate": 3.5405253263347173e-06, + "loss": 0.619, + "step": 7934 + }, + { + "epoch": 2.198060941828255, + "grad_norm": 0.8030242919921875, + "learning_rate": 3.5401940825545456e-06, + "loss": 0.6468, + "step": 7935 + }, + { + "epoch": 2.1983379501385043, + "grad_norm": 0.7203686833381653, + "learning_rate": 3.5398628166883976e-06, + "loss": 0.6196, + "step": 7936 + }, + { + "epoch": 2.1986149584487533, + "grad_norm": 0.7480223178863525, + "learning_rate": 3.5395315287433063e-06, + "loss": 0.6097, + "step": 7937 + }, + { + "epoch": 2.198891966759003, + "grad_norm": 0.6992270946502686, + "learning_rate": 3.5392002187263054e-06, + "loss": 0.5731, + "step": 7938 + }, + { + "epoch": 2.1991689750692522, + "grad_norm": 0.7402212023735046, + "learning_rate": 3.538868886644431e-06, + "loss": 0.5997, + "step": 7939 + }, + { + "epoch": 2.1994459833795013, + "grad_norm": 0.6975589990615845, + "learning_rate": 3.5385375325047167e-06, + "loss": 0.5753, + "step": 7940 + }, + { + "epoch": 2.1997229916897507, + "grad_norm": 0.6758665442466736, + "learning_rate": 3.5382061563141986e-06, + "loss": 0.6344, + "step": 7941 + }, + { + "epoch": 2.2, + "grad_norm": 0.7077716588973999, + "learning_rate": 3.5378747580799122e-06, + "loss": 0.692, + "step": 7942 + }, + { + "epoch": 2.200277008310249, + "grad_norm": 0.74322110414505, + "learning_rate": 3.537543337808894e-06, + "loss": 0.5733, + "step": 7943 + }, + { + "epoch": 2.2005540166204987, + "grad_norm": 0.7803390622138977, + "learning_rate": 3.5372118955081815e-06, + "loss": 0.6034, + "step": 7944 + }, + { + "epoch": 2.200831024930748, + "grad_norm": 0.7345275282859802, + "learning_rate": 3.5368804311848114e-06, + "loss": 0.6302, + "step": 7945 + }, + { + "epoch": 2.201108033240997, + "grad_norm": 0.7228006720542908, + "learning_rate": 3.536548944845822e-06, + "loss": 0.5631, + "step": 7946 + }, + { + "epoch": 2.2013850415512466, + "grad_norm": 0.7092156410217285, + "learning_rate": 3.5362174364982517e-06, + "loss": 0.5946, + "step": 7947 + }, + { + "epoch": 2.201662049861496, + "grad_norm": 0.7122862339019775, + "learning_rate": 3.535885906149139e-06, + "loss": 0.5755, + "step": 7948 + }, + { + "epoch": 2.201939058171745, + "grad_norm": 0.737549364566803, + "learning_rate": 3.5355543538055232e-06, + "loss": 0.6109, + "step": 7949 + }, + { + "epoch": 2.2022160664819945, + "grad_norm": 0.7759289741516113, + "learning_rate": 3.5352227794744437e-06, + "loss": 0.6329, + "step": 7950 + }, + { + "epoch": 2.202493074792244, + "grad_norm": 0.7205516695976257, + "learning_rate": 3.5348911831629405e-06, + "loss": 0.5825, + "step": 7951 + }, + { + "epoch": 2.202770083102493, + "grad_norm": 0.7226518988609314, + "learning_rate": 3.5345595648780562e-06, + "loss": 0.5624, + "step": 7952 + }, + { + "epoch": 2.2030470914127425, + "grad_norm": 0.7641230821609497, + "learning_rate": 3.5342279246268286e-06, + "loss": 0.6224, + "step": 7953 + }, + { + "epoch": 2.203324099722992, + "grad_norm": 0.7313389182090759, + "learning_rate": 3.533896262416302e-06, + "loss": 0.6215, + "step": 7954 + }, + { + "epoch": 2.203601108033241, + "grad_norm": 0.7629680633544922, + "learning_rate": 3.5335645782535174e-06, + "loss": 0.6266, + "step": 7955 + }, + { + "epoch": 2.2038781163434904, + "grad_norm": 0.7637547850608826, + "learning_rate": 3.5332328721455174e-06, + "loss": 0.6103, + "step": 7956 + }, + { + "epoch": 2.2041551246537394, + "grad_norm": 0.750389575958252, + "learning_rate": 3.5329011440993443e-06, + "loss": 0.5681, + "step": 7957 + }, + { + "epoch": 2.204432132963989, + "grad_norm": 0.7237061262130737, + "learning_rate": 3.532569394122043e-06, + "loss": 0.5848, + "step": 7958 + }, + { + "epoch": 2.2047091412742383, + "grad_norm": 0.7263107895851135, + "learning_rate": 3.5322376222206556e-06, + "loss": 0.6069, + "step": 7959 + }, + { + "epoch": 2.2049861495844874, + "grad_norm": 0.7003798484802246, + "learning_rate": 3.531905828402228e-06, + "loss": 0.6112, + "step": 7960 + }, + { + "epoch": 2.205263157894737, + "grad_norm": 0.7772372364997864, + "learning_rate": 3.5315740126738053e-06, + "loss": 0.593, + "step": 7961 + }, + { + "epoch": 2.2055401662049863, + "grad_norm": 0.7660946846008301, + "learning_rate": 3.5312421750424304e-06, + "loss": 0.5922, + "step": 7962 + }, + { + "epoch": 2.2058171745152353, + "grad_norm": 0.7145115733146667, + "learning_rate": 3.5309103155151512e-06, + "loss": 0.6184, + "step": 7963 + }, + { + "epoch": 2.2060941828254848, + "grad_norm": 0.7356007099151611, + "learning_rate": 3.530578434099014e-06, + "loss": 0.5603, + "step": 7964 + }, + { + "epoch": 2.206371191135734, + "grad_norm": 0.6965721249580383, + "learning_rate": 3.530246530801064e-06, + "loss": 0.5912, + "step": 7965 + }, + { + "epoch": 2.2066481994459832, + "grad_norm": 0.7097307443618774, + "learning_rate": 3.5299146056283496e-06, + "loss": 0.6278, + "step": 7966 + }, + { + "epoch": 2.2069252077562327, + "grad_norm": 0.7481800317764282, + "learning_rate": 3.529582658587918e-06, + "loss": 0.6229, + "step": 7967 + }, + { + "epoch": 2.207202216066482, + "grad_norm": 0.7750024795532227, + "learning_rate": 3.529250689686817e-06, + "loss": 0.5928, + "step": 7968 + }, + { + "epoch": 2.207479224376731, + "grad_norm": 0.6997442841529846, + "learning_rate": 3.5289186989320954e-06, + "loss": 0.5417, + "step": 7969 + }, + { + "epoch": 2.2077562326869806, + "grad_norm": 0.712722897529602, + "learning_rate": 3.5285866863308025e-06, + "loss": 0.595, + "step": 7970 + }, + { + "epoch": 2.20803324099723, + "grad_norm": 0.76949542760849, + "learning_rate": 3.5282546518899878e-06, + "loss": 0.6026, + "step": 7971 + }, + { + "epoch": 2.208310249307479, + "grad_norm": 0.8288016319274902, + "learning_rate": 3.5279225956167006e-06, + "loss": 0.5849, + "step": 7972 + }, + { + "epoch": 2.2085872576177286, + "grad_norm": 0.7565370202064514, + "learning_rate": 3.527590517517992e-06, + "loss": 0.5859, + "step": 7973 + }, + { + "epoch": 2.208864265927978, + "grad_norm": 0.7121152877807617, + "learning_rate": 3.527258417600912e-06, + "loss": 0.5988, + "step": 7974 + }, + { + "epoch": 2.209141274238227, + "grad_norm": 0.7364125847816467, + "learning_rate": 3.5269262958725124e-06, + "loss": 0.6228, + "step": 7975 + }, + { + "epoch": 2.2094182825484765, + "grad_norm": 0.7307944297790527, + "learning_rate": 3.5265941523398455e-06, + "loss": 0.6114, + "step": 7976 + }, + { + "epoch": 2.209695290858726, + "grad_norm": 0.7665871381759644, + "learning_rate": 3.5262619870099635e-06, + "loss": 0.6285, + "step": 7977 + }, + { + "epoch": 2.209972299168975, + "grad_norm": 0.7392860054969788, + "learning_rate": 3.5259297998899184e-06, + "loss": 0.5715, + "step": 7978 + }, + { + "epoch": 2.2102493074792244, + "grad_norm": 0.7718192338943481, + "learning_rate": 3.525597590986763e-06, + "loss": 0.616, + "step": 7979 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.7106596231460571, + "learning_rate": 3.5252653603075522e-06, + "loss": 0.6153, + "step": 7980 + }, + { + "epoch": 2.210803324099723, + "grad_norm": 0.7141216993331909, + "learning_rate": 3.5249331078593397e-06, + "loss": 0.6069, + "step": 7981 + }, + { + "epoch": 2.2110803324099724, + "grad_norm": 0.7057957649230957, + "learning_rate": 3.52460083364918e-06, + "loss": 0.5425, + "step": 7982 + }, + { + "epoch": 2.2113573407202214, + "grad_norm": 0.7577681541442871, + "learning_rate": 3.5242685376841273e-06, + "loss": 0.621, + "step": 7983 + }, + { + "epoch": 2.211634349030471, + "grad_norm": 0.72541743516922, + "learning_rate": 3.523936219971239e-06, + "loss": 0.5972, + "step": 7984 + }, + { + "epoch": 2.2119113573407203, + "grad_norm": 0.7325984239578247, + "learning_rate": 3.5236038805175686e-06, + "loss": 0.6067, + "step": 7985 + }, + { + "epoch": 2.2121883656509693, + "grad_norm": 0.752688467502594, + "learning_rate": 3.5232715193301746e-06, + "loss": 0.6298, + "step": 7986 + }, + { + "epoch": 2.212465373961219, + "grad_norm": 0.6812304258346558, + "learning_rate": 3.5229391364161137e-06, + "loss": 0.5075, + "step": 7987 + }, + { + "epoch": 2.2127423822714682, + "grad_norm": 0.7274405360221863, + "learning_rate": 3.5226067317824415e-06, + "loss": 0.5995, + "step": 7988 + }, + { + "epoch": 2.2130193905817173, + "grad_norm": 0.7464661598205566, + "learning_rate": 3.5222743054362173e-06, + "loss": 0.6198, + "step": 7989 + }, + { + "epoch": 2.2132963988919667, + "grad_norm": 0.761583149433136, + "learning_rate": 3.521941857384499e-06, + "loss": 0.56, + "step": 7990 + }, + { + "epoch": 2.213573407202216, + "grad_norm": 0.7192556262016296, + "learning_rate": 3.5216093876343452e-06, + "loss": 0.6106, + "step": 7991 + }, + { + "epoch": 2.213850415512465, + "grad_norm": 0.7213461995124817, + "learning_rate": 3.5212768961928153e-06, + "loss": 0.5608, + "step": 7992 + }, + { + "epoch": 2.2141274238227147, + "grad_norm": 0.7017572522163391, + "learning_rate": 3.52094438306697e-06, + "loss": 0.5734, + "step": 7993 + }, + { + "epoch": 2.214404432132964, + "grad_norm": 0.7463569641113281, + "learning_rate": 3.5206118482638664e-06, + "loss": 0.6096, + "step": 7994 + }, + { + "epoch": 2.214681440443213, + "grad_norm": 0.754392683506012, + "learning_rate": 3.5202792917905676e-06, + "loss": 0.608, + "step": 7995 + }, + { + "epoch": 2.2149584487534626, + "grad_norm": 0.7312411069869995, + "learning_rate": 3.5199467136541342e-06, + "loss": 0.5546, + "step": 7996 + }, + { + "epoch": 2.215235457063712, + "grad_norm": 0.7295164465904236, + "learning_rate": 3.519614113861627e-06, + "loss": 0.5974, + "step": 7997 + }, + { + "epoch": 2.215512465373961, + "grad_norm": 0.7471133470535278, + "learning_rate": 3.519281492420108e-06, + "loss": 0.6558, + "step": 7998 + }, + { + "epoch": 2.2157894736842105, + "grad_norm": 0.7575737237930298, + "learning_rate": 3.518948849336641e-06, + "loss": 0.6012, + "step": 7999 + }, + { + "epoch": 2.21606648199446, + "grad_norm": 0.6775148510932922, + "learning_rate": 3.5186161846182874e-06, + "loss": 0.5475, + "step": 8000 + }, + { + "epoch": 2.216343490304709, + "grad_norm": 0.7134628891944885, + "learning_rate": 3.518283498272111e-06, + "loss": 0.5482, + "step": 8001 + }, + { + "epoch": 2.2166204986149585, + "grad_norm": 0.7109372019767761, + "learning_rate": 3.5179507903051746e-06, + "loss": 0.6066, + "step": 8002 + }, + { + "epoch": 2.216897506925208, + "grad_norm": 0.7727257013320923, + "learning_rate": 3.5176180607245447e-06, + "loss": 0.6076, + "step": 8003 + }, + { + "epoch": 2.217174515235457, + "grad_norm": 0.7703292965888977, + "learning_rate": 3.5172853095372837e-06, + "loss": 0.6273, + "step": 8004 + }, + { + "epoch": 2.2174515235457064, + "grad_norm": 0.7161400318145752, + "learning_rate": 3.5169525367504574e-06, + "loss": 0.6014, + "step": 8005 + }, + { + "epoch": 2.217728531855956, + "grad_norm": 0.8021954298019409, + "learning_rate": 3.5166197423711336e-06, + "loss": 0.6468, + "step": 8006 + }, + { + "epoch": 2.218005540166205, + "grad_norm": 0.725152313709259, + "learning_rate": 3.516286926406375e-06, + "loss": 0.6367, + "step": 8007 + }, + { + "epoch": 2.2182825484764543, + "grad_norm": 0.7513183951377869, + "learning_rate": 3.51595408886325e-06, + "loss": 0.6373, + "step": 8008 + }, + { + "epoch": 2.218559556786704, + "grad_norm": 0.719607412815094, + "learning_rate": 3.515621229748825e-06, + "loss": 0.6255, + "step": 8009 + }, + { + "epoch": 2.218836565096953, + "grad_norm": 0.7222772240638733, + "learning_rate": 3.5152883490701682e-06, + "loss": 0.5828, + "step": 8010 + }, + { + "epoch": 2.2191135734072023, + "grad_norm": 0.7790684700012207, + "learning_rate": 3.5149554468343463e-06, + "loss": 0.5909, + "step": 8011 + }, + { + "epoch": 2.2193905817174517, + "grad_norm": 0.7023915648460388, + "learning_rate": 3.514622523048429e-06, + "loss": 0.5937, + "step": 8012 + }, + { + "epoch": 2.2196675900277008, + "grad_norm": 0.7843884229660034, + "learning_rate": 3.514289577719484e-06, + "loss": 0.6214, + "step": 8013 + }, + { + "epoch": 2.21994459833795, + "grad_norm": 0.7272974252700806, + "learning_rate": 3.5139566108545814e-06, + "loss": 0.6052, + "step": 8014 + }, + { + "epoch": 2.2202216066481997, + "grad_norm": 0.7483271956443787, + "learning_rate": 3.5136236224607906e-06, + "loss": 0.5648, + "step": 8015 + }, + { + "epoch": 2.2204986149584487, + "grad_norm": 0.7253241539001465, + "learning_rate": 3.513290612545181e-06, + "loss": 0.5772, + "step": 8016 + }, + { + "epoch": 2.220775623268698, + "grad_norm": 0.7203364372253418, + "learning_rate": 3.512957581114825e-06, + "loss": 0.5419, + "step": 8017 + }, + { + "epoch": 2.221052631578947, + "grad_norm": 0.7568876147270203, + "learning_rate": 3.5126245281767924e-06, + "loss": 0.645, + "step": 8018 + }, + { + "epoch": 2.2213296398891966, + "grad_norm": 0.7632686495780945, + "learning_rate": 3.5122914537381554e-06, + "loss": 0.606, + "step": 8019 + }, + { + "epoch": 2.221606648199446, + "grad_norm": 0.7100051045417786, + "learning_rate": 3.5119583578059845e-06, + "loss": 0.5833, + "step": 8020 + }, + { + "epoch": 2.221883656509695, + "grad_norm": 0.704388439655304, + "learning_rate": 3.5116252403873546e-06, + "loss": 0.5662, + "step": 8021 + }, + { + "epoch": 2.2221606648199446, + "grad_norm": 0.7191629409790039, + "learning_rate": 3.5112921014893375e-06, + "loss": 0.5969, + "step": 8022 + }, + { + "epoch": 2.222437673130194, + "grad_norm": 0.7262014746665955, + "learning_rate": 3.510958941119006e-06, + "loss": 0.608, + "step": 8023 + }, + { + "epoch": 2.222714681440443, + "grad_norm": 0.7569392323493958, + "learning_rate": 3.510625759283434e-06, + "loss": 0.6573, + "step": 8024 + }, + { + "epoch": 2.2229916897506925, + "grad_norm": 0.7174189686775208, + "learning_rate": 3.5102925559896977e-06, + "loss": 0.5955, + "step": 8025 + }, + { + "epoch": 2.223268698060942, + "grad_norm": 0.7140632271766663, + "learning_rate": 3.5099593312448694e-06, + "loss": 0.5963, + "step": 8026 + }, + { + "epoch": 2.223545706371191, + "grad_norm": 0.739309549331665, + "learning_rate": 3.509626085056026e-06, + "loss": 0.6161, + "step": 8027 + }, + { + "epoch": 2.2238227146814404, + "grad_norm": 0.7275558710098267, + "learning_rate": 3.509292817430242e-06, + "loss": 0.6273, + "step": 8028 + }, + { + "epoch": 2.22409972299169, + "grad_norm": 0.6667734384536743, + "learning_rate": 3.5089595283745943e-06, + "loss": 0.5515, + "step": 8029 + }, + { + "epoch": 2.224376731301939, + "grad_norm": 0.7046604752540588, + "learning_rate": 3.508626217896158e-06, + "loss": 0.6296, + "step": 8030 + }, + { + "epoch": 2.2246537396121884, + "grad_norm": 0.6949466466903687, + "learning_rate": 3.508292886002013e-06, + "loss": 0.59, + "step": 8031 + }, + { + "epoch": 2.224930747922438, + "grad_norm": 0.7113898992538452, + "learning_rate": 3.5079595326992345e-06, + "loss": 0.6243, + "step": 8032 + }, + { + "epoch": 2.225207756232687, + "grad_norm": 0.6967516541481018, + "learning_rate": 3.507626157994901e-06, + "loss": 0.5638, + "step": 8033 + }, + { + "epoch": 2.2254847645429363, + "grad_norm": 0.7007837891578674, + "learning_rate": 3.507292761896091e-06, + "loss": 0.5835, + "step": 8034 + }, + { + "epoch": 2.2257617728531858, + "grad_norm": 0.7281460762023926, + "learning_rate": 3.506959344409884e-06, + "loss": 0.605, + "step": 8035 + }, + { + "epoch": 2.226038781163435, + "grad_norm": 0.6967830061912537, + "learning_rate": 3.5066259055433575e-06, + "loss": 0.6225, + "step": 8036 + }, + { + "epoch": 2.2263157894736842, + "grad_norm": 0.7038334608078003, + "learning_rate": 3.506292445303593e-06, + "loss": 0.5902, + "step": 8037 + }, + { + "epoch": 2.2265927977839337, + "grad_norm": 0.7103922367095947, + "learning_rate": 3.5059589636976704e-06, + "loss": 0.5674, + "step": 8038 + }, + { + "epoch": 2.2268698060941827, + "grad_norm": 0.759024977684021, + "learning_rate": 3.505625460732669e-06, + "loss": 0.5504, + "step": 8039 + }, + { + "epoch": 2.227146814404432, + "grad_norm": 0.6931019425392151, + "learning_rate": 3.5052919364156718e-06, + "loss": 0.5769, + "step": 8040 + }, + { + "epoch": 2.2274238227146816, + "grad_norm": 0.7260349988937378, + "learning_rate": 3.5049583907537593e-06, + "loss": 0.5885, + "step": 8041 + }, + { + "epoch": 2.2277008310249307, + "grad_norm": 0.7508696913719177, + "learning_rate": 3.504624823754014e-06, + "loss": 0.5971, + "step": 8042 + }, + { + "epoch": 2.22797783933518, + "grad_norm": 0.7547364234924316, + "learning_rate": 3.5042912354235183e-06, + "loss": 0.6605, + "step": 8043 + }, + { + "epoch": 2.228254847645429, + "grad_norm": 0.7311235070228577, + "learning_rate": 3.5039576257693543e-06, + "loss": 0.6323, + "step": 8044 + }, + { + "epoch": 2.2285318559556786, + "grad_norm": 0.7337801456451416, + "learning_rate": 3.5036239947986066e-06, + "loss": 0.604, + "step": 8045 + }, + { + "epoch": 2.228808864265928, + "grad_norm": 0.7613157629966736, + "learning_rate": 3.503290342518358e-06, + "loss": 0.6399, + "step": 8046 + }, + { + "epoch": 2.229085872576177, + "grad_norm": 0.7410513162612915, + "learning_rate": 3.502956668935694e-06, + "loss": 0.5878, + "step": 8047 + }, + { + "epoch": 2.2293628808864265, + "grad_norm": 0.7026098370552063, + "learning_rate": 3.5026229740576985e-06, + "loss": 0.5976, + "step": 8048 + }, + { + "epoch": 2.229639889196676, + "grad_norm": 0.7590800523757935, + "learning_rate": 3.502289257891457e-06, + "loss": 0.5861, + "step": 8049 + }, + { + "epoch": 2.229916897506925, + "grad_norm": 0.700961172580719, + "learning_rate": 3.5019555204440546e-06, + "loss": 0.5721, + "step": 8050 + }, + { + "epoch": 2.2301939058171745, + "grad_norm": 0.7313873767852783, + "learning_rate": 3.5016217617225777e-06, + "loss": 0.6313, + "step": 8051 + }, + { + "epoch": 2.230470914127424, + "grad_norm": 0.7244826555252075, + "learning_rate": 3.5012879817341126e-06, + "loss": 0.5764, + "step": 8052 + }, + { + "epoch": 2.230747922437673, + "grad_norm": 0.7611440420150757, + "learning_rate": 3.5009541804857474e-06, + "loss": 0.6375, + "step": 8053 + }, + { + "epoch": 2.2310249307479224, + "grad_norm": 0.7172898650169373, + "learning_rate": 3.5006203579845687e-06, + "loss": 0.5634, + "step": 8054 + }, + { + "epoch": 2.231301939058172, + "grad_norm": 0.7191401720046997, + "learning_rate": 3.500286514237664e-06, + "loss": 0.6085, + "step": 8055 + }, + { + "epoch": 2.231578947368421, + "grad_norm": 0.7275636196136475, + "learning_rate": 3.4999526492521223e-06, + "loss": 0.6298, + "step": 8056 + }, + { + "epoch": 2.2318559556786703, + "grad_norm": 0.7126155495643616, + "learning_rate": 3.499618763035033e-06, + "loss": 0.6263, + "step": 8057 + }, + { + "epoch": 2.23213296398892, + "grad_norm": 0.7015401721000671, + "learning_rate": 3.4992848555934837e-06, + "loss": 0.5872, + "step": 8058 + }, + { + "epoch": 2.232409972299169, + "grad_norm": 0.8176736235618591, + "learning_rate": 3.498950926934565e-06, + "loss": 0.6324, + "step": 8059 + }, + { + "epoch": 2.2326869806094183, + "grad_norm": 0.6858984231948853, + "learning_rate": 3.4986169770653684e-06, + "loss": 0.5711, + "step": 8060 + }, + { + "epoch": 2.2329639889196677, + "grad_norm": 0.7461230158805847, + "learning_rate": 3.498283005992982e-06, + "loss": 0.6037, + "step": 8061 + }, + { + "epoch": 2.2332409972299168, + "grad_norm": 0.7021551728248596, + "learning_rate": 3.497949013724498e-06, + "loss": 0.5787, + "step": 8062 + }, + { + "epoch": 2.233518005540166, + "grad_norm": 0.7063918113708496, + "learning_rate": 3.4976150002670085e-06, + "loss": 0.5894, + "step": 8063 + }, + { + "epoch": 2.2337950138504157, + "grad_norm": 0.7009927034378052, + "learning_rate": 3.4972809656276047e-06, + "loss": 0.6006, + "step": 8064 + }, + { + "epoch": 2.2340720221606647, + "grad_norm": 0.7303755879402161, + "learning_rate": 3.4969469098133784e-06, + "loss": 0.6307, + "step": 8065 + }, + { + "epoch": 2.234349030470914, + "grad_norm": 0.717224657535553, + "learning_rate": 3.4966128328314243e-06, + "loss": 0.6233, + "step": 8066 + }, + { + "epoch": 2.2346260387811636, + "grad_norm": 0.7022543549537659, + "learning_rate": 3.4962787346888348e-06, + "loss": 0.5937, + "step": 8067 + }, + { + "epoch": 2.2349030470914126, + "grad_norm": 0.747253954410553, + "learning_rate": 3.4959446153927035e-06, + "loss": 0.603, + "step": 8068 + }, + { + "epoch": 2.235180055401662, + "grad_norm": 0.6907825469970703, + "learning_rate": 3.495610474950124e-06, + "loss": 0.5969, + "step": 8069 + }, + { + "epoch": 2.2354570637119116, + "grad_norm": 0.6939720511436462, + "learning_rate": 3.4952763133681923e-06, + "loss": 0.6142, + "step": 8070 + }, + { + "epoch": 2.2357340720221606, + "grad_norm": 0.7356911897659302, + "learning_rate": 3.494942130654002e-06, + "loss": 0.5927, + "step": 8071 + }, + { + "epoch": 2.23601108033241, + "grad_norm": 0.7271432280540466, + "learning_rate": 3.4946079268146497e-06, + "loss": 0.5765, + "step": 8072 + }, + { + "epoch": 2.2362880886426595, + "grad_norm": 0.7113369107246399, + "learning_rate": 3.4942737018572316e-06, + "loss": 0.6001, + "step": 8073 + }, + { + "epoch": 2.2365650969529085, + "grad_norm": 0.6899865865707397, + "learning_rate": 3.493939455788843e-06, + "loss": 0.5519, + "step": 8074 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.7123438119888306, + "learning_rate": 3.4936051886165825e-06, + "loss": 0.5608, + "step": 8075 + }, + { + "epoch": 2.2371191135734074, + "grad_norm": 0.7510248422622681, + "learning_rate": 3.4932709003475456e-06, + "loss": 0.6301, + "step": 8076 + }, + { + "epoch": 2.2373961218836564, + "grad_norm": 0.704402506351471, + "learning_rate": 3.4929365909888314e-06, + "loss": 0.622, + "step": 8077 + }, + { + "epoch": 2.237673130193906, + "grad_norm": 0.7204563617706299, + "learning_rate": 3.4926022605475373e-06, + "loss": 0.6035, + "step": 8078 + }, + { + "epoch": 2.237950138504155, + "grad_norm": 0.7577924132347107, + "learning_rate": 3.4922679090307633e-06, + "loss": 0.5979, + "step": 8079 + }, + { + "epoch": 2.2382271468144044, + "grad_norm": 0.7190003991127014, + "learning_rate": 3.4919335364456063e-06, + "loss": 0.6026, + "step": 8080 + }, + { + "epoch": 2.238504155124654, + "grad_norm": 0.7183365821838379, + "learning_rate": 3.491599142799168e-06, + "loss": 0.6029, + "step": 8081 + }, + { + "epoch": 2.238781163434903, + "grad_norm": 0.6923686861991882, + "learning_rate": 3.491264728098548e-06, + "loss": 0.5583, + "step": 8082 + }, + { + "epoch": 2.2390581717451523, + "grad_norm": 0.6836802363395691, + "learning_rate": 3.490930292350846e-06, + "loss": 0.5992, + "step": 8083 + }, + { + "epoch": 2.2393351800554018, + "grad_norm": 0.73431795835495, + "learning_rate": 3.490595835563163e-06, + "loss": 0.5559, + "step": 8084 + }, + { + "epoch": 2.239612188365651, + "grad_norm": 0.7008590698242188, + "learning_rate": 3.4902613577426007e-06, + "loss": 0.5306, + "step": 8085 + }, + { + "epoch": 2.2398891966759003, + "grad_norm": 0.7099395394325256, + "learning_rate": 3.4899268588962613e-06, + "loss": 0.613, + "step": 8086 + }, + { + "epoch": 2.2401662049861497, + "grad_norm": 0.7241997122764587, + "learning_rate": 3.4895923390312465e-06, + "loss": 0.6184, + "step": 8087 + }, + { + "epoch": 2.2404432132963987, + "grad_norm": 0.7413530349731445, + "learning_rate": 3.4892577981546595e-06, + "loss": 0.589, + "step": 8088 + }, + { + "epoch": 2.240720221606648, + "grad_norm": 0.736506462097168, + "learning_rate": 3.4889232362736024e-06, + "loss": 0.5764, + "step": 8089 + }, + { + "epoch": 2.2409972299168976, + "grad_norm": 0.777686595916748, + "learning_rate": 3.48858865339518e-06, + "loss": 0.622, + "step": 8090 + }, + { + "epoch": 2.2412742382271467, + "grad_norm": 0.7260578274726868, + "learning_rate": 3.488254049526496e-06, + "loss": 0.6277, + "step": 8091 + }, + { + "epoch": 2.241551246537396, + "grad_norm": 0.6801887154579163, + "learning_rate": 3.4879194246746552e-06, + "loss": 0.5615, + "step": 8092 + }, + { + "epoch": 2.2418282548476456, + "grad_norm": 0.7225388884544373, + "learning_rate": 3.4875847788467614e-06, + "loss": 0.5972, + "step": 8093 + }, + { + "epoch": 2.2421052631578946, + "grad_norm": 0.7416013479232788, + "learning_rate": 3.4872501120499215e-06, + "loss": 0.6084, + "step": 8094 + }, + { + "epoch": 2.242382271468144, + "grad_norm": 0.7532913684844971, + "learning_rate": 3.48691542429124e-06, + "loss": 0.6234, + "step": 8095 + }, + { + "epoch": 2.2426592797783935, + "grad_norm": 0.701535701751709, + "learning_rate": 3.486580715577824e-06, + "loss": 0.6049, + "step": 8096 + }, + { + "epoch": 2.2429362880886425, + "grad_norm": 0.7444953918457031, + "learning_rate": 3.486245985916779e-06, + "loss": 0.6411, + "step": 8097 + }, + { + "epoch": 2.243213296398892, + "grad_norm": 0.7297351956367493, + "learning_rate": 3.4859112353152146e-06, + "loss": 0.5702, + "step": 8098 + }, + { + "epoch": 2.2434903047091415, + "grad_norm": 0.7168240547180176, + "learning_rate": 3.485576463780236e-06, + "loss": 0.6272, + "step": 8099 + }, + { + "epoch": 2.2437673130193905, + "grad_norm": 0.7194172143936157, + "learning_rate": 3.485241671318953e-06, + "loss": 0.5861, + "step": 8100 + }, + { + "epoch": 2.24404432132964, + "grad_norm": 0.7189883589744568, + "learning_rate": 3.4849068579384728e-06, + "loss": 0.5831, + "step": 8101 + }, + { + "epoch": 2.2443213296398894, + "grad_norm": 0.7073516845703125, + "learning_rate": 3.484572023645904e-06, + "loss": 0.588, + "step": 8102 + }, + { + "epoch": 2.2445983379501384, + "grad_norm": 0.738373875617981, + "learning_rate": 3.4842371684483576e-06, + "loss": 0.6412, + "step": 8103 + }, + { + "epoch": 2.244875346260388, + "grad_norm": 0.7670702934265137, + "learning_rate": 3.483902292352943e-06, + "loss": 0.6184, + "step": 8104 + }, + { + "epoch": 2.245152354570637, + "grad_norm": 0.757293701171875, + "learning_rate": 3.48356739536677e-06, + "loss": 0.6134, + "step": 8105 + }, + { + "epoch": 2.2454293628808863, + "grad_norm": 0.7449289560317993, + "learning_rate": 3.4832324774969482e-06, + "loss": 0.6422, + "step": 8106 + }, + { + "epoch": 2.245706371191136, + "grad_norm": 0.7168107032775879, + "learning_rate": 3.4828975387505914e-06, + "loss": 0.6189, + "step": 8107 + }, + { + "epoch": 2.245983379501385, + "grad_norm": 0.7923652529716492, + "learning_rate": 3.4825625791348093e-06, + "loss": 0.5577, + "step": 8108 + }, + { + "epoch": 2.2462603878116343, + "grad_norm": 0.7246668934822083, + "learning_rate": 3.4822275986567146e-06, + "loss": 0.555, + "step": 8109 + }, + { + "epoch": 2.2465373961218837, + "grad_norm": 0.7271319031715393, + "learning_rate": 3.48189259732342e-06, + "loss": 0.5997, + "step": 8110 + }, + { + "epoch": 2.2468144044321328, + "grad_norm": 0.7270683646202087, + "learning_rate": 3.481557575142038e-06, + "loss": 0.609, + "step": 8111 + }, + { + "epoch": 2.247091412742382, + "grad_norm": 0.7465324401855469, + "learning_rate": 3.481222532119681e-06, + "loss": 0.5592, + "step": 8112 + }, + { + "epoch": 2.2473684210526317, + "grad_norm": 0.7218868732452393, + "learning_rate": 3.480887468263465e-06, + "loss": 0.6101, + "step": 8113 + }, + { + "epoch": 2.2476454293628807, + "grad_norm": 0.7048141360282898, + "learning_rate": 3.480552383580502e-06, + "loss": 0.5884, + "step": 8114 + }, + { + "epoch": 2.24792243767313, + "grad_norm": 0.7366784811019897, + "learning_rate": 3.4802172780779088e-06, + "loss": 0.6199, + "step": 8115 + }, + { + "epoch": 2.2481994459833796, + "grad_norm": 0.7352172136306763, + "learning_rate": 3.479882151762799e-06, + "loss": 0.6353, + "step": 8116 + }, + { + "epoch": 2.2484764542936286, + "grad_norm": 0.7354872226715088, + "learning_rate": 3.4795470046422896e-06, + "loss": 0.6304, + "step": 8117 + }, + { + "epoch": 2.248753462603878, + "grad_norm": 0.6982522010803223, + "learning_rate": 3.479211836723495e-06, + "loss": 0.5632, + "step": 8118 + }, + { + "epoch": 2.2490304709141276, + "grad_norm": 0.7308842539787292, + "learning_rate": 3.478876648013533e-06, + "loss": 0.5846, + "step": 8119 + }, + { + "epoch": 2.2493074792243766, + "grad_norm": 0.7680553793907166, + "learning_rate": 3.4785414385195205e-06, + "loss": 0.6225, + "step": 8120 + }, + { + "epoch": 2.249584487534626, + "grad_norm": 0.7662866711616516, + "learning_rate": 3.478206208248573e-06, + "loss": 0.5734, + "step": 8121 + }, + { + "epoch": 2.2498614958448755, + "grad_norm": 0.7630404233932495, + "learning_rate": 3.47787095720781e-06, + "loss": 0.5667, + "step": 8122 + }, + { + "epoch": 2.2501385041551245, + "grad_norm": 0.7009580731391907, + "learning_rate": 3.4775356854043497e-06, + "loss": 0.5763, + "step": 8123 + }, + { + "epoch": 2.250415512465374, + "grad_norm": 0.7273560762405396, + "learning_rate": 3.4772003928453106e-06, + "loss": 0.58, + "step": 8124 + }, + { + "epoch": 2.2506925207756234, + "grad_norm": 0.7440486550331116, + "learning_rate": 3.476865079537811e-06, + "loss": 0.6165, + "step": 8125 + }, + { + "epoch": 2.2509695290858724, + "grad_norm": 0.712805449962616, + "learning_rate": 3.4765297454889717e-06, + "loss": 0.6104, + "step": 8126 + }, + { + "epoch": 2.251246537396122, + "grad_norm": 0.7313037514686584, + "learning_rate": 3.4761943907059122e-06, + "loss": 0.573, + "step": 8127 + }, + { + "epoch": 2.2515235457063714, + "grad_norm": 0.7176337838172913, + "learning_rate": 3.475859015195753e-06, + "loss": 0.6145, + "step": 8128 + }, + { + "epoch": 2.2518005540166204, + "grad_norm": 0.7878891825675964, + "learning_rate": 3.4755236189656137e-06, + "loss": 0.588, + "step": 8129 + }, + { + "epoch": 2.25207756232687, + "grad_norm": 0.7895088195800781, + "learning_rate": 3.4751882020226174e-06, + "loss": 0.602, + "step": 8130 + }, + { + "epoch": 2.2523545706371193, + "grad_norm": 0.7553879618644714, + "learning_rate": 3.4748527643738856e-06, + "loss": 0.6034, + "step": 8131 + }, + { + "epoch": 2.2526315789473683, + "grad_norm": 0.6995738744735718, + "learning_rate": 3.474517306026539e-06, + "loss": 0.5861, + "step": 8132 + }, + { + "epoch": 2.252908587257618, + "grad_norm": 0.7691447138786316, + "learning_rate": 3.474181826987703e-06, + "loss": 0.5564, + "step": 8133 + }, + { + "epoch": 2.2531855955678672, + "grad_norm": 0.6974400877952576, + "learning_rate": 3.473846327264498e-06, + "loss": 0.6081, + "step": 8134 + }, + { + "epoch": 2.2534626038781163, + "grad_norm": 0.7364979386329651, + "learning_rate": 3.473510806864049e-06, + "loss": 0.6534, + "step": 8135 + }, + { + "epoch": 2.2537396121883657, + "grad_norm": 0.7088637948036194, + "learning_rate": 3.4731752657934793e-06, + "loss": 0.5524, + "step": 8136 + }, + { + "epoch": 2.254016620498615, + "grad_norm": 0.7505431175231934, + "learning_rate": 3.4728397040599137e-06, + "loss": 0.5805, + "step": 8137 + }, + { + "epoch": 2.254293628808864, + "grad_norm": 0.7170282006263733, + "learning_rate": 3.472504121670476e-06, + "loss": 0.5365, + "step": 8138 + }, + { + "epoch": 2.2545706371191137, + "grad_norm": 0.713196873664856, + "learning_rate": 3.472168518632293e-06, + "loss": 0.6709, + "step": 8139 + }, + { + "epoch": 2.254847645429363, + "grad_norm": 0.7076315879821777, + "learning_rate": 3.4718328949524894e-06, + "loss": 0.5865, + "step": 8140 + }, + { + "epoch": 2.255124653739612, + "grad_norm": 0.7079249620437622, + "learning_rate": 3.4714972506381916e-06, + "loss": 0.5976, + "step": 8141 + }, + { + "epoch": 2.2554016620498616, + "grad_norm": 0.7071895003318787, + "learning_rate": 3.4711615856965264e-06, + "loss": 0.5831, + "step": 8142 + }, + { + "epoch": 2.255678670360111, + "grad_norm": 0.7474883198738098, + "learning_rate": 3.4708259001346204e-06, + "loss": 0.6041, + "step": 8143 + }, + { + "epoch": 2.25595567867036, + "grad_norm": 0.7047725319862366, + "learning_rate": 3.470490193959602e-06, + "loss": 0.5933, + "step": 8144 + }, + { + "epoch": 2.2562326869806095, + "grad_norm": 0.7273043990135193, + "learning_rate": 3.470154467178598e-06, + "loss": 0.6108, + "step": 8145 + }, + { + "epoch": 2.2565096952908585, + "grad_norm": 0.7017418146133423, + "learning_rate": 3.4698187197987377e-06, + "loss": 0.5158, + "step": 8146 + }, + { + "epoch": 2.256786703601108, + "grad_norm": 0.7130430936813354, + "learning_rate": 3.4694829518271485e-06, + "loss": 0.6265, + "step": 8147 + }, + { + "epoch": 2.2570637119113575, + "grad_norm": 0.6907879710197449, + "learning_rate": 3.4691471632709604e-06, + "loss": 0.6218, + "step": 8148 + }, + { + "epoch": 2.2573407202216065, + "grad_norm": 0.7310351729393005, + "learning_rate": 3.4688113541373037e-06, + "loss": 0.6313, + "step": 8149 + }, + { + "epoch": 2.257617728531856, + "grad_norm": 0.7051122188568115, + "learning_rate": 3.468475524433308e-06, + "loss": 0.6325, + "step": 8150 + }, + { + "epoch": 2.2578947368421054, + "grad_norm": 0.7112586498260498, + "learning_rate": 3.4681396741661028e-06, + "loss": 0.6221, + "step": 8151 + }, + { + "epoch": 2.2581717451523544, + "grad_norm": 0.7428193092346191, + "learning_rate": 3.467803803342821e-06, + "loss": 0.6333, + "step": 8152 + }, + { + "epoch": 2.258448753462604, + "grad_norm": 0.7509081363677979, + "learning_rate": 3.467467911970592e-06, + "loss": 0.5948, + "step": 8153 + }, + { + "epoch": 2.2587257617728533, + "grad_norm": 0.7133442163467407, + "learning_rate": 3.467132000056549e-06, + "loss": 0.5938, + "step": 8154 + }, + { + "epoch": 2.2590027700831024, + "grad_norm": 0.73454350233078, + "learning_rate": 3.466796067607824e-06, + "loss": 0.6126, + "step": 8155 + }, + { + "epoch": 2.259279778393352, + "grad_norm": 0.7535995244979858, + "learning_rate": 3.4664601146315496e-06, + "loss": 0.5605, + "step": 8156 + }, + { + "epoch": 2.2595567867036013, + "grad_norm": 0.7269196510314941, + "learning_rate": 3.4661241411348586e-06, + "loss": 0.6224, + "step": 8157 + }, + { + "epoch": 2.2598337950138503, + "grad_norm": 0.7485670447349548, + "learning_rate": 3.465788147124885e-06, + "loss": 0.6162, + "step": 8158 + }, + { + "epoch": 2.2601108033240997, + "grad_norm": 0.7506015300750732, + "learning_rate": 3.4654521326087627e-06, + "loss": 0.6221, + "step": 8159 + }, + { + "epoch": 2.260387811634349, + "grad_norm": 0.7276700139045715, + "learning_rate": 3.4651160975936256e-06, + "loss": 0.5293, + "step": 8160 + }, + { + "epoch": 2.2606648199445982, + "grad_norm": 0.7210519909858704, + "learning_rate": 3.4647800420866096e-06, + "loss": 0.5598, + "step": 8161 + }, + { + "epoch": 2.2609418282548477, + "grad_norm": 0.6977429389953613, + "learning_rate": 3.4644439660948497e-06, + "loss": 0.5866, + "step": 8162 + }, + { + "epoch": 2.261218836565097, + "grad_norm": 0.7661787271499634, + "learning_rate": 3.4641078696254814e-06, + "loss": 0.5967, + "step": 8163 + }, + { + "epoch": 2.261495844875346, + "grad_norm": 0.7270045280456543, + "learning_rate": 3.4637717526856408e-06, + "loss": 0.6014, + "step": 8164 + }, + { + "epoch": 2.2617728531855956, + "grad_norm": 0.7071644067764282, + "learning_rate": 3.4634356152824654e-06, + "loss": 0.6252, + "step": 8165 + }, + { + "epoch": 2.2620498614958446, + "grad_norm": 0.7300014495849609, + "learning_rate": 3.4630994574230904e-06, + "loss": 0.5668, + "step": 8166 + }, + { + "epoch": 2.262326869806094, + "grad_norm": 0.8083431124687195, + "learning_rate": 3.462763279114655e-06, + "loss": 0.6159, + "step": 8167 + }, + { + "epoch": 2.2626038781163436, + "grad_norm": 0.7473114132881165, + "learning_rate": 3.4624270803642967e-06, + "loss": 0.5696, + "step": 8168 + }, + { + "epoch": 2.2628808864265926, + "grad_norm": 0.7719151973724365, + "learning_rate": 3.462090861179154e-06, + "loss": 0.5858, + "step": 8169 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.7300695180892944, + "learning_rate": 3.461754621566364e-06, + "loss": 0.573, + "step": 8170 + }, + { + "epoch": 2.2634349030470915, + "grad_norm": 0.7490740418434143, + "learning_rate": 3.4614183615330683e-06, + "loss": 0.5772, + "step": 8171 + }, + { + "epoch": 2.2637119113573405, + "grad_norm": 0.6871481537818909, + "learning_rate": 3.4610820810864053e-06, + "loss": 0.5572, + "step": 8172 + }, + { + "epoch": 2.26398891966759, + "grad_norm": 0.7448837161064148, + "learning_rate": 3.4607457802335153e-06, + "loss": 0.5986, + "step": 8173 + }, + { + "epoch": 2.2642659279778394, + "grad_norm": 0.7582398056983948, + "learning_rate": 3.4604094589815402e-06, + "loss": 0.6259, + "step": 8174 + }, + { + "epoch": 2.2645429362880884, + "grad_norm": 0.7317702770233154, + "learning_rate": 3.4600731173376184e-06, + "loss": 0.5945, + "step": 8175 + }, + { + "epoch": 2.264819944598338, + "grad_norm": 0.6793874502182007, + "learning_rate": 3.4597367553088927e-06, + "loss": 0.5673, + "step": 8176 + }, + { + "epoch": 2.2650969529085874, + "grad_norm": 0.745205283164978, + "learning_rate": 3.459400372902505e-06, + "loss": 0.6109, + "step": 8177 + }, + { + "epoch": 2.2653739612188364, + "grad_norm": 0.7162503004074097, + "learning_rate": 3.459063970125597e-06, + "loss": 0.6033, + "step": 8178 + }, + { + "epoch": 2.265650969529086, + "grad_norm": 0.7062127590179443, + "learning_rate": 3.458727546985312e-06, + "loss": 0.5657, + "step": 8179 + }, + { + "epoch": 2.2659279778393353, + "grad_norm": 0.7209470272064209, + "learning_rate": 3.458391103488793e-06, + "loss": 0.5944, + "step": 8180 + }, + { + "epoch": 2.2662049861495843, + "grad_norm": 0.7374285459518433, + "learning_rate": 3.4580546396431837e-06, + "loss": 0.6177, + "step": 8181 + }, + { + "epoch": 2.266481994459834, + "grad_norm": 0.7219187021255493, + "learning_rate": 3.4577181554556273e-06, + "loss": 0.6365, + "step": 8182 + }, + { + "epoch": 2.2667590027700832, + "grad_norm": 0.7407369613647461, + "learning_rate": 3.457381650933268e-06, + "loss": 0.5901, + "step": 8183 + }, + { + "epoch": 2.2670360110803323, + "grad_norm": 0.7512839436531067, + "learning_rate": 3.4570451260832533e-06, + "loss": 0.5955, + "step": 8184 + }, + { + "epoch": 2.2673130193905817, + "grad_norm": 0.7049588561058044, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.6389, + "step": 8185 + }, + { + "epoch": 2.267590027700831, + "grad_norm": 0.7323600053787231, + "learning_rate": 3.4563720154288306e-06, + "loss": 0.5776, + "step": 8186 + }, + { + "epoch": 2.26786703601108, + "grad_norm": 0.7171778082847595, + "learning_rate": 3.4560354296387167e-06, + "loss": 0.6096, + "step": 8187 + }, + { + "epoch": 2.2681440443213297, + "grad_norm": 0.7620019316673279, + "learning_rate": 3.455698823549528e-06, + "loss": 0.6409, + "step": 8188 + }, + { + "epoch": 2.268421052631579, + "grad_norm": 0.7481417059898376, + "learning_rate": 3.455362197168413e-06, + "loss": 0.6072, + "step": 8189 + }, + { + "epoch": 2.268698060941828, + "grad_norm": 0.7183117270469666, + "learning_rate": 3.4550255505025198e-06, + "loss": 0.5921, + "step": 8190 + }, + { + "epoch": 2.2689750692520776, + "grad_norm": 0.7216724753379822, + "learning_rate": 3.454688883558994e-06, + "loss": 0.5779, + "step": 8191 + }, + { + "epoch": 2.269252077562327, + "grad_norm": 0.7133140563964844, + "learning_rate": 3.454352196344986e-06, + "loss": 0.6287, + "step": 8192 + }, + { + "epoch": 2.269529085872576, + "grad_norm": 0.698267936706543, + "learning_rate": 3.4540154888676435e-06, + "loss": 0.5707, + "step": 8193 + }, + { + "epoch": 2.2698060941828255, + "grad_norm": 0.6946624517440796, + "learning_rate": 3.4536787611341162e-06, + "loss": 0.5806, + "step": 8194 + }, + { + "epoch": 2.270083102493075, + "grad_norm": 0.7460806369781494, + "learning_rate": 3.453342013151553e-06, + "loss": 0.5734, + "step": 8195 + }, + { + "epoch": 2.270360110803324, + "grad_norm": 0.6981709599494934, + "learning_rate": 3.4530052449271044e-06, + "loss": 0.5999, + "step": 8196 + }, + { + "epoch": 2.2706371191135735, + "grad_norm": 0.744199812412262, + "learning_rate": 3.4526684564679212e-06, + "loss": 0.6382, + "step": 8197 + }, + { + "epoch": 2.270914127423823, + "grad_norm": 0.7204859256744385, + "learning_rate": 3.452331647781153e-06, + "loss": 0.599, + "step": 8198 + }, + { + "epoch": 2.271191135734072, + "grad_norm": 0.7273908257484436, + "learning_rate": 3.4519948188739523e-06, + "loss": 0.6032, + "step": 8199 + }, + { + "epoch": 2.2714681440443214, + "grad_norm": 0.7486979961395264, + "learning_rate": 3.4516579697534708e-06, + "loss": 0.6435, + "step": 8200 + }, + { + "epoch": 2.271745152354571, + "grad_norm": 0.7121773362159729, + "learning_rate": 3.4513211004268595e-06, + "loss": 0.5527, + "step": 8201 + }, + { + "epoch": 2.27202216066482, + "grad_norm": 0.736427903175354, + "learning_rate": 3.4509842109012725e-06, + "loss": 0.6071, + "step": 8202 + }, + { + "epoch": 2.2722991689750693, + "grad_norm": 0.7484080791473389, + "learning_rate": 3.450647301183862e-06, + "loss": 0.6488, + "step": 8203 + }, + { + "epoch": 2.272576177285319, + "grad_norm": 0.705027163028717, + "learning_rate": 3.4503103712817815e-06, + "loss": 0.6334, + "step": 8204 + }, + { + "epoch": 2.272853185595568, + "grad_norm": 0.7610204815864563, + "learning_rate": 3.4499734212021847e-06, + "loss": 0.5988, + "step": 8205 + }, + { + "epoch": 2.2731301939058173, + "grad_norm": 0.7779908776283264, + "learning_rate": 3.4496364509522275e-06, + "loss": 0.5966, + "step": 8206 + }, + { + "epoch": 2.2734072022160663, + "grad_norm": 0.7502352595329285, + "learning_rate": 3.449299460539062e-06, + "loss": 0.5684, + "step": 8207 + }, + { + "epoch": 2.2736842105263158, + "grad_norm": 0.8014580011367798, + "learning_rate": 3.4489624499698447e-06, + "loss": 0.5943, + "step": 8208 + }, + { + "epoch": 2.273961218836565, + "grad_norm": 0.7266554832458496, + "learning_rate": 3.448625419251732e-06, + "loss": 0.6069, + "step": 8209 + }, + { + "epoch": 2.2742382271468142, + "grad_norm": 0.764426052570343, + "learning_rate": 3.4482883683918788e-06, + "loss": 0.636, + "step": 8210 + }, + { + "epoch": 2.2745152354570637, + "grad_norm": 0.7291103005409241, + "learning_rate": 3.4479512973974415e-06, + "loss": 0.6315, + "step": 8211 + }, + { + "epoch": 2.274792243767313, + "grad_norm": 0.7153321504592896, + "learning_rate": 3.447614206275578e-06, + "loss": 0.5893, + "step": 8212 + }, + { + "epoch": 2.275069252077562, + "grad_norm": 0.7283423542976379, + "learning_rate": 3.4472770950334443e-06, + "loss": 0.5989, + "step": 8213 + }, + { + "epoch": 2.2753462603878116, + "grad_norm": 0.7300357818603516, + "learning_rate": 3.4469399636781987e-06, + "loss": 0.547, + "step": 8214 + }, + { + "epoch": 2.275623268698061, + "grad_norm": 0.751139223575592, + "learning_rate": 3.4466028122170003e-06, + "loss": 0.579, + "step": 8215 + }, + { + "epoch": 2.27590027700831, + "grad_norm": 0.7210396528244019, + "learning_rate": 3.446265640657006e-06, + "loss": 0.6054, + "step": 8216 + }, + { + "epoch": 2.2761772853185596, + "grad_norm": 0.7154489755630493, + "learning_rate": 3.445928449005376e-06, + "loss": 0.5572, + "step": 8217 + }, + { + "epoch": 2.276454293628809, + "grad_norm": 0.7469511032104492, + "learning_rate": 3.4455912372692696e-06, + "loss": 0.5857, + "step": 8218 + }, + { + "epoch": 2.276731301939058, + "grad_norm": 0.7889577150344849, + "learning_rate": 3.445254005455847e-06, + "loss": 0.5819, + "step": 8219 + }, + { + "epoch": 2.2770083102493075, + "grad_norm": 0.7772889733314514, + "learning_rate": 3.444916753572267e-06, + "loss": 0.5903, + "step": 8220 + }, + { + "epoch": 2.277285318559557, + "grad_norm": 0.7089385390281677, + "learning_rate": 3.444579481625691e-06, + "loss": 0.5651, + "step": 8221 + }, + { + "epoch": 2.277562326869806, + "grad_norm": 0.7505936026573181, + "learning_rate": 3.444242189623281e-06, + "loss": 0.66, + "step": 8222 + }, + { + "epoch": 2.2778393351800554, + "grad_norm": 0.7459689378738403, + "learning_rate": 3.4439048775721983e-06, + "loss": 0.5863, + "step": 8223 + }, + { + "epoch": 2.278116343490305, + "grad_norm": 0.6946021318435669, + "learning_rate": 3.4435675454796035e-06, + "loss": 0.6156, + "step": 8224 + }, + { + "epoch": 2.278393351800554, + "grad_norm": 0.7873076796531677, + "learning_rate": 3.4432301933526614e-06, + "loss": 0.6069, + "step": 8225 + }, + { + "epoch": 2.2786703601108034, + "grad_norm": 0.7463105916976929, + "learning_rate": 3.442892821198533e-06, + "loss": 0.6001, + "step": 8226 + }, + { + "epoch": 2.2789473684210524, + "grad_norm": 0.7991135716438293, + "learning_rate": 3.442555429024382e-06, + "loss": 0.611, + "step": 8227 + }, + { + "epoch": 2.279224376731302, + "grad_norm": 0.7371905446052551, + "learning_rate": 3.4422180168373725e-06, + "loss": 0.655, + "step": 8228 + }, + { + "epoch": 2.2795013850415513, + "grad_norm": 0.7202360033988953, + "learning_rate": 3.4418805846446678e-06, + "loss": 0.6133, + "step": 8229 + }, + { + "epoch": 2.2797783933518003, + "grad_norm": 0.7565407156944275, + "learning_rate": 3.441543132453433e-06, + "loss": 0.622, + "step": 8230 + }, + { + "epoch": 2.28005540166205, + "grad_norm": 0.7521671652793884, + "learning_rate": 3.4412056602708337e-06, + "loss": 0.6133, + "step": 8231 + }, + { + "epoch": 2.2803324099722992, + "grad_norm": 0.709683895111084, + "learning_rate": 3.4408681681040344e-06, + "loss": 0.5736, + "step": 8232 + }, + { + "epoch": 2.2806094182825483, + "grad_norm": 0.7759260535240173, + "learning_rate": 3.440530655960201e-06, + "loss": 0.5931, + "step": 8233 + }, + { + "epoch": 2.2808864265927977, + "grad_norm": 0.722997784614563, + "learning_rate": 3.4401931238464996e-06, + "loss": 0.5617, + "step": 8234 + }, + { + "epoch": 2.281163434903047, + "grad_norm": 0.7048691511154175, + "learning_rate": 3.4398555717700975e-06, + "loss": 0.5994, + "step": 8235 + }, + { + "epoch": 2.281440443213296, + "grad_norm": 0.7193622589111328, + "learning_rate": 3.439517999738162e-06, + "loss": 0.5688, + "step": 8236 + }, + { + "epoch": 2.2817174515235457, + "grad_norm": 0.7562454342842102, + "learning_rate": 3.4391804077578592e-06, + "loss": 0.6113, + "step": 8237 + }, + { + "epoch": 2.281994459833795, + "grad_norm": 0.737359344959259, + "learning_rate": 3.438842795836359e-06, + "loss": 0.5539, + "step": 8238 + }, + { + "epoch": 2.282271468144044, + "grad_norm": 0.7290983200073242, + "learning_rate": 3.4385051639808277e-06, + "loss": 0.645, + "step": 8239 + }, + { + "epoch": 2.2825484764542936, + "grad_norm": 0.7329986095428467, + "learning_rate": 3.438167512198436e-06, + "loss": 0.5951, + "step": 8240 + }, + { + "epoch": 2.282825484764543, + "grad_norm": 0.7410658001899719, + "learning_rate": 3.4378298404963517e-06, + "loss": 0.6091, + "step": 8241 + }, + { + "epoch": 2.283102493074792, + "grad_norm": 0.7463988661766052, + "learning_rate": 3.437492148881745e-06, + "loss": 0.6081, + "step": 8242 + }, + { + "epoch": 2.2833795013850415, + "grad_norm": 0.7464751601219177, + "learning_rate": 3.4371544373617857e-06, + "loss": 0.5989, + "step": 8243 + }, + { + "epoch": 2.283656509695291, + "grad_norm": 0.744561493396759, + "learning_rate": 3.436816705943644e-06, + "loss": 0.5968, + "step": 8244 + }, + { + "epoch": 2.28393351800554, + "grad_norm": 0.7255566716194153, + "learning_rate": 3.436478954634492e-06, + "loss": 0.5382, + "step": 8245 + }, + { + "epoch": 2.2842105263157895, + "grad_norm": 0.749146580696106, + "learning_rate": 3.4361411834415e-06, + "loss": 0.6063, + "step": 8246 + }, + { + "epoch": 2.284487534626039, + "grad_norm": 0.7449796795845032, + "learning_rate": 3.4358033923718405e-06, + "loss": 0.6034, + "step": 8247 + }, + { + "epoch": 2.284764542936288, + "grad_norm": 0.7429720163345337, + "learning_rate": 3.4354655814326847e-06, + "loss": 0.5836, + "step": 8248 + }, + { + "epoch": 2.2850415512465374, + "grad_norm": 0.6745700836181641, + "learning_rate": 3.435127750631206e-06, + "loss": 0.5858, + "step": 8249 + }, + { + "epoch": 2.285318559556787, + "grad_norm": 0.7240549325942993, + "learning_rate": 3.4347898999745766e-06, + "loss": 0.5872, + "step": 8250 + }, + { + "epoch": 2.285595567867036, + "grad_norm": 0.7113307118415833, + "learning_rate": 3.4344520294699714e-06, + "loss": 0.6165, + "step": 8251 + }, + { + "epoch": 2.2858725761772853, + "grad_norm": 0.7669567465782166, + "learning_rate": 3.4341141391245626e-06, + "loss": 0.5966, + "step": 8252 + }, + { + "epoch": 2.286149584487535, + "grad_norm": 0.7530611157417297, + "learning_rate": 3.433776228945525e-06, + "loss": 0.5964, + "step": 8253 + }, + { + "epoch": 2.286426592797784, + "grad_norm": 0.7283420562744141, + "learning_rate": 3.433438298940034e-06, + "loss": 0.6044, + "step": 8254 + }, + { + "epoch": 2.2867036011080333, + "grad_norm": 0.6976724863052368, + "learning_rate": 3.4331003491152635e-06, + "loss": 0.5674, + "step": 8255 + }, + { + "epoch": 2.2869806094182827, + "grad_norm": 0.6952914595603943, + "learning_rate": 3.43276237947839e-06, + "loss": 0.5729, + "step": 8256 + }, + { + "epoch": 2.2872576177285318, + "grad_norm": 0.7647460103034973, + "learning_rate": 3.4324243900365894e-06, + "loss": 0.5718, + "step": 8257 + }, + { + "epoch": 2.287534626038781, + "grad_norm": 0.7178311347961426, + "learning_rate": 3.432086380797037e-06, + "loss": 0.5855, + "step": 8258 + }, + { + "epoch": 2.2878116343490307, + "grad_norm": 0.7695008516311646, + "learning_rate": 3.4317483517669115e-06, + "loss": 0.6255, + "step": 8259 + }, + { + "epoch": 2.2880886426592797, + "grad_norm": 0.7568122744560242, + "learning_rate": 3.431410302953389e-06, + "loss": 0.5432, + "step": 8260 + }, + { + "epoch": 2.288365650969529, + "grad_norm": 0.6739785075187683, + "learning_rate": 3.4310722343636464e-06, + "loss": 0.5778, + "step": 8261 + }, + { + "epoch": 2.2886426592797786, + "grad_norm": 0.7452985644340515, + "learning_rate": 3.4307341460048633e-06, + "loss": 0.6284, + "step": 8262 + }, + { + "epoch": 2.2889196675900276, + "grad_norm": 0.7170669436454773, + "learning_rate": 3.430396037884217e-06, + "loss": 0.6227, + "step": 8263 + }, + { + "epoch": 2.289196675900277, + "grad_norm": 0.7811469435691833, + "learning_rate": 3.4300579100088875e-06, + "loss": 0.5934, + "step": 8264 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.7597870826721191, + "learning_rate": 3.4297197623860525e-06, + "loss": 0.6306, + "step": 8265 + }, + { + "epoch": 2.2897506925207756, + "grad_norm": 0.7523090839385986, + "learning_rate": 3.4293815950228936e-06, + "loss": 0.5915, + "step": 8266 + }, + { + "epoch": 2.290027700831025, + "grad_norm": 0.7411076426506042, + "learning_rate": 3.4290434079265894e-06, + "loss": 0.6061, + "step": 8267 + }, + { + "epoch": 2.2903047091412745, + "grad_norm": 0.7374351620674133, + "learning_rate": 3.4287052011043214e-06, + "loss": 0.5945, + "step": 8268 + }, + { + "epoch": 2.2905817174515235, + "grad_norm": 0.7511986494064331, + "learning_rate": 3.4283669745632705e-06, + "loss": 0.5952, + "step": 8269 + }, + { + "epoch": 2.290858725761773, + "grad_norm": 0.7723880410194397, + "learning_rate": 3.4280287283106172e-06, + "loss": 0.6109, + "step": 8270 + }, + { + "epoch": 2.291135734072022, + "grad_norm": 0.694105327129364, + "learning_rate": 3.427690462353545e-06, + "loss": 0.6328, + "step": 8271 + }, + { + "epoch": 2.2914127423822714, + "grad_norm": 0.7096155285835266, + "learning_rate": 3.4273521766992348e-06, + "loss": 0.589, + "step": 8272 + }, + { + "epoch": 2.291689750692521, + "grad_norm": 0.7510867714881897, + "learning_rate": 3.42701387135487e-06, + "loss": 0.5721, + "step": 8273 + }, + { + "epoch": 2.29196675900277, + "grad_norm": 0.7638363838195801, + "learning_rate": 3.4266755463276325e-06, + "loss": 0.6358, + "step": 8274 + }, + { + "epoch": 2.2922437673130194, + "grad_norm": 0.7557123899459839, + "learning_rate": 3.4263372016247074e-06, + "loss": 0.6244, + "step": 8275 + }, + { + "epoch": 2.292520775623269, + "grad_norm": 0.725536048412323, + "learning_rate": 3.4259988372532783e-06, + "loss": 0.6014, + "step": 8276 + }, + { + "epoch": 2.292797783933518, + "grad_norm": 0.7189280390739441, + "learning_rate": 3.4256604532205285e-06, + "loss": 0.5749, + "step": 8277 + }, + { + "epoch": 2.2930747922437673, + "grad_norm": 0.745522677898407, + "learning_rate": 3.425322049533643e-06, + "loss": 0.6433, + "step": 8278 + }, + { + "epoch": 2.2933518005540168, + "grad_norm": 0.7054515480995178, + "learning_rate": 3.4249836261998086e-06, + "loss": 0.5802, + "step": 8279 + }, + { + "epoch": 2.293628808864266, + "grad_norm": 0.75422602891922, + "learning_rate": 3.4246451832262085e-06, + "loss": 0.595, + "step": 8280 + }, + { + "epoch": 2.2939058171745152, + "grad_norm": 0.6765782237052917, + "learning_rate": 3.424306720620031e-06, + "loss": 0.5849, + "step": 8281 + }, + { + "epoch": 2.2941828254847647, + "grad_norm": 0.7536196112632751, + "learning_rate": 3.4239682383884603e-06, + "loss": 0.6326, + "step": 8282 + }, + { + "epoch": 2.2944598337950137, + "grad_norm": 0.7260246872901917, + "learning_rate": 3.423629736538685e-06, + "loss": 0.555, + "step": 8283 + }, + { + "epoch": 2.294736842105263, + "grad_norm": 0.7225943207740784, + "learning_rate": 3.4232912150778914e-06, + "loss": 0.6272, + "step": 8284 + }, + { + "epoch": 2.2950138504155126, + "grad_norm": 0.7318356037139893, + "learning_rate": 3.4229526740132672e-06, + "loss": 0.6487, + "step": 8285 + }, + { + "epoch": 2.2952908587257617, + "grad_norm": 0.6813390851020813, + "learning_rate": 3.4226141133520014e-06, + "loss": 0.5994, + "step": 8286 + }, + { + "epoch": 2.295567867036011, + "grad_norm": 0.7114197015762329, + "learning_rate": 3.4222755331012813e-06, + "loss": 0.5947, + "step": 8287 + }, + { + "epoch": 2.29584487534626, + "grad_norm": 0.7054829597473145, + "learning_rate": 3.4219369332682967e-06, + "loss": 0.6042, + "step": 8288 + }, + { + "epoch": 2.2961218836565096, + "grad_norm": 0.7095068693161011, + "learning_rate": 3.421598313860237e-06, + "loss": 0.6166, + "step": 8289 + }, + { + "epoch": 2.296398891966759, + "grad_norm": 0.7199954390525818, + "learning_rate": 3.421259674884291e-06, + "loss": 0.6092, + "step": 8290 + }, + { + "epoch": 2.296675900277008, + "grad_norm": 0.7267475128173828, + "learning_rate": 3.420921016347649e-06, + "loss": 0.6098, + "step": 8291 + }, + { + "epoch": 2.2969529085872575, + "grad_norm": 0.7270237803459167, + "learning_rate": 3.420582338257504e-06, + "loss": 0.6621, + "step": 8292 + }, + { + "epoch": 2.297229916897507, + "grad_norm": 0.7058791518211365, + "learning_rate": 3.420243640621043e-06, + "loss": 0.58, + "step": 8293 + }, + { + "epoch": 2.297506925207756, + "grad_norm": 0.6881886124610901, + "learning_rate": 3.419904923445461e-06, + "loss": 0.5873, + "step": 8294 + }, + { + "epoch": 2.2977839335180055, + "grad_norm": 0.70926433801651, + "learning_rate": 3.419566186737947e-06, + "loss": 0.6075, + "step": 8295 + }, + { + "epoch": 2.298060941828255, + "grad_norm": 0.7379501461982727, + "learning_rate": 3.4192274305056956e-06, + "loss": 0.5708, + "step": 8296 + }, + { + "epoch": 2.298337950138504, + "grad_norm": 0.7627331614494324, + "learning_rate": 3.4188886547558974e-06, + "loss": 0.6277, + "step": 8297 + }, + { + "epoch": 2.2986149584487534, + "grad_norm": 0.7369683384895325, + "learning_rate": 3.418549859495747e-06, + "loss": 0.6135, + "step": 8298 + }, + { + "epoch": 2.298891966759003, + "grad_norm": 0.7661909461021423, + "learning_rate": 3.4182110447324374e-06, + "loss": 0.603, + "step": 8299 + }, + { + "epoch": 2.299168975069252, + "grad_norm": 0.7049823999404907, + "learning_rate": 3.417872210473162e-06, + "loss": 0.6006, + "step": 8300 + }, + { + "epoch": 2.2994459833795013, + "grad_norm": 0.7327120304107666, + "learning_rate": 3.4175333567251163e-06, + "loss": 0.5925, + "step": 8301 + }, + { + "epoch": 2.299722991689751, + "grad_norm": 0.7394639253616333, + "learning_rate": 3.417194483495494e-06, + "loss": 0.6356, + "step": 8302 + }, + { + "epoch": 2.3, + "grad_norm": 0.7379699349403381, + "learning_rate": 3.4168555907914904e-06, + "loss": 0.6067, + "step": 8303 + }, + { + "epoch": 2.3002770083102493, + "grad_norm": 0.7700881958007812, + "learning_rate": 3.4165166786203012e-06, + "loss": 0.6182, + "step": 8304 + }, + { + "epoch": 2.3005540166204987, + "grad_norm": 0.7343998551368713, + "learning_rate": 3.4161777469891227e-06, + "loss": 0.5753, + "step": 8305 + }, + { + "epoch": 2.3008310249307478, + "grad_norm": 0.7086063623428345, + "learning_rate": 3.415838795905151e-06, + "loss": 0.5989, + "step": 8306 + }, + { + "epoch": 2.301108033240997, + "grad_norm": 0.7197765111923218, + "learning_rate": 3.4154998253755823e-06, + "loss": 0.6181, + "step": 8307 + }, + { + "epoch": 2.3013850415512467, + "grad_norm": 0.7595135569572449, + "learning_rate": 3.415160835407615e-06, + "loss": 0.6289, + "step": 8308 + }, + { + "epoch": 2.3016620498614957, + "grad_norm": 0.7638220191001892, + "learning_rate": 3.414821826008446e-06, + "loss": 0.6284, + "step": 8309 + }, + { + "epoch": 2.301939058171745, + "grad_norm": 0.7343319058418274, + "learning_rate": 3.414482797185273e-06, + "loss": 0.5967, + "step": 8310 + }, + { + "epoch": 2.3022160664819946, + "grad_norm": 0.7228246331214905, + "learning_rate": 3.4141437489452955e-06, + "loss": 0.6157, + "step": 8311 + }, + { + "epoch": 2.3024930747922436, + "grad_norm": 0.7245225310325623, + "learning_rate": 3.413804681295711e-06, + "loss": 0.5779, + "step": 8312 + }, + { + "epoch": 2.302770083102493, + "grad_norm": 0.749394416809082, + "learning_rate": 3.4134655942437197e-06, + "loss": 0.6285, + "step": 8313 + }, + { + "epoch": 2.3030470914127426, + "grad_norm": 0.6916159987449646, + "learning_rate": 3.413126487796522e-06, + "loss": 0.6378, + "step": 8314 + }, + { + "epoch": 2.3033240997229916, + "grad_norm": 0.7362995147705078, + "learning_rate": 3.4127873619613154e-06, + "loss": 0.6086, + "step": 8315 + }, + { + "epoch": 2.303601108033241, + "grad_norm": 0.758251965045929, + "learning_rate": 3.412448216745303e-06, + "loss": 0.5945, + "step": 8316 + }, + { + "epoch": 2.3038781163434905, + "grad_norm": 0.7599399089813232, + "learning_rate": 3.4121090521556856e-06, + "loss": 0.636, + "step": 8317 + }, + { + "epoch": 2.3041551246537395, + "grad_norm": 0.7391661405563354, + "learning_rate": 3.411769868199663e-06, + "loss": 0.6042, + "step": 8318 + }, + { + "epoch": 2.304432132963989, + "grad_norm": 0.7565715312957764, + "learning_rate": 3.4114306648844374e-06, + "loss": 0.6037, + "step": 8319 + }, + { + "epoch": 2.3047091412742384, + "grad_norm": 0.7810177206993103, + "learning_rate": 3.411091442217212e-06, + "loss": 0.5512, + "step": 8320 + }, + { + "epoch": 2.3049861495844874, + "grad_norm": 0.7262973785400391, + "learning_rate": 3.410752200205188e-06, + "loss": 0.5859, + "step": 8321 + }, + { + "epoch": 2.305263157894737, + "grad_norm": 0.7462362051010132, + "learning_rate": 3.4104129388555695e-06, + "loss": 0.6192, + "step": 8322 + }, + { + "epoch": 2.3055401662049864, + "grad_norm": 0.7291772365570068, + "learning_rate": 3.4100736581755587e-06, + "loss": 0.5888, + "step": 8323 + }, + { + "epoch": 2.3058171745152354, + "grad_norm": 0.7448986172676086, + "learning_rate": 3.4097343581723607e-06, + "loss": 0.5408, + "step": 8324 + }, + { + "epoch": 2.306094182825485, + "grad_norm": 0.6898923516273499, + "learning_rate": 3.409395038853179e-06, + "loss": 0.542, + "step": 8325 + }, + { + "epoch": 2.3063711911357343, + "grad_norm": 0.7469924092292786, + "learning_rate": 3.409055700225218e-06, + "loss": 0.6154, + "step": 8326 + }, + { + "epoch": 2.3066481994459833, + "grad_norm": 0.7666891813278198, + "learning_rate": 3.408716342295683e-06, + "loss": 0.6176, + "step": 8327 + }, + { + "epoch": 2.3069252077562328, + "grad_norm": 0.7476541996002197, + "learning_rate": 3.408376965071779e-06, + "loss": 0.5989, + "step": 8328 + }, + { + "epoch": 2.3072022160664822, + "grad_norm": 0.7600588798522949, + "learning_rate": 3.408037568560713e-06, + "loss": 0.5709, + "step": 8329 + }, + { + "epoch": 2.3074792243767313, + "grad_norm": 0.7915143370628357, + "learning_rate": 3.4076981527696904e-06, + "loss": 0.5905, + "step": 8330 + }, + { + "epoch": 2.3077562326869807, + "grad_norm": 0.7339158654212952, + "learning_rate": 3.407358717705918e-06, + "loss": 0.6444, + "step": 8331 + }, + { + "epoch": 2.3080332409972297, + "grad_norm": 0.7408339381217957, + "learning_rate": 3.4070192633766025e-06, + "loss": 0.6151, + "step": 8332 + }, + { + "epoch": 2.308310249307479, + "grad_norm": 0.7135486006736755, + "learning_rate": 3.4066797897889525e-06, + "loss": 0.5706, + "step": 8333 + }, + { + "epoch": 2.3085872576177286, + "grad_norm": 0.7659527659416199, + "learning_rate": 3.406340296950175e-06, + "loss": 0.618, + "step": 8334 + }, + { + "epoch": 2.3088642659279777, + "grad_norm": 0.7388715744018555, + "learning_rate": 3.406000784867478e-06, + "loss": 0.6056, + "step": 8335 + }, + { + "epoch": 2.309141274238227, + "grad_norm": 0.6996129155158997, + "learning_rate": 3.4056612535480708e-06, + "loss": 0.5716, + "step": 8336 + }, + { + "epoch": 2.3094182825484766, + "grad_norm": 0.7552042007446289, + "learning_rate": 3.4053217029991626e-06, + "loss": 0.6436, + "step": 8337 + }, + { + "epoch": 2.3096952908587256, + "grad_norm": 0.7340453863143921, + "learning_rate": 3.4049821332279627e-06, + "loss": 0.6117, + "step": 8338 + }, + { + "epoch": 2.309972299168975, + "grad_norm": 0.7310197949409485, + "learning_rate": 3.4046425442416807e-06, + "loss": 0.5781, + "step": 8339 + }, + { + "epoch": 2.3102493074792245, + "grad_norm": 0.7644363641738892, + "learning_rate": 3.4043029360475277e-06, + "loss": 0.568, + "step": 8340 + }, + { + "epoch": 2.3105263157894735, + "grad_norm": 0.7341253161430359, + "learning_rate": 3.4039633086527137e-06, + "loss": 0.6255, + "step": 8341 + }, + { + "epoch": 2.310803324099723, + "grad_norm": 0.7020862698554993, + "learning_rate": 3.4036236620644503e-06, + "loss": 0.5814, + "step": 8342 + }, + { + "epoch": 2.3110803324099725, + "grad_norm": 0.7140727639198303, + "learning_rate": 3.4032839962899488e-06, + "loss": 0.5942, + "step": 8343 + }, + { + "epoch": 2.3113573407202215, + "grad_norm": 0.7498226165771484, + "learning_rate": 3.402944311336421e-06, + "loss": 0.6625, + "step": 8344 + }, + { + "epoch": 2.311634349030471, + "grad_norm": 0.7233909964561462, + "learning_rate": 3.4026046072110803e-06, + "loss": 0.5978, + "step": 8345 + }, + { + "epoch": 2.3119113573407204, + "grad_norm": 0.6893860697746277, + "learning_rate": 3.4022648839211392e-06, + "loss": 0.5497, + "step": 8346 + }, + { + "epoch": 2.3121883656509694, + "grad_norm": 0.7233559489250183, + "learning_rate": 3.4019251414738095e-06, + "loss": 0.5462, + "step": 8347 + }, + { + "epoch": 2.312465373961219, + "grad_norm": 0.7121790051460266, + "learning_rate": 3.401585379876306e-06, + "loss": 0.6133, + "step": 8348 + }, + { + "epoch": 2.3127423822714683, + "grad_norm": 0.7205917835235596, + "learning_rate": 3.401245599135843e-06, + "loss": 0.5382, + "step": 8349 + }, + { + "epoch": 2.3130193905817173, + "grad_norm": 0.7295979261398315, + "learning_rate": 3.400905799259634e-06, + "loss": 0.601, + "step": 8350 + }, + { + "epoch": 2.313296398891967, + "grad_norm": 0.7754128575325012, + "learning_rate": 3.400565980254894e-06, + "loss": 0.6109, + "step": 8351 + }, + { + "epoch": 2.313573407202216, + "grad_norm": 0.7137840390205383, + "learning_rate": 3.4002261421288386e-06, + "loss": 0.5897, + "step": 8352 + }, + { + "epoch": 2.3138504155124653, + "grad_norm": 0.7415006756782532, + "learning_rate": 3.3998862848886837e-06, + "loss": 0.6249, + "step": 8353 + }, + { + "epoch": 2.3141274238227147, + "grad_norm": 0.7502437233924866, + "learning_rate": 3.3995464085416446e-06, + "loss": 0.6291, + "step": 8354 + }, + { + "epoch": 2.3144044321329638, + "grad_norm": 0.7403385043144226, + "learning_rate": 3.3992065130949374e-06, + "loss": 0.6393, + "step": 8355 + }, + { + "epoch": 2.3146814404432132, + "grad_norm": 0.7676597833633423, + "learning_rate": 3.3988665985557806e-06, + "loss": 0.5729, + "step": 8356 + }, + { + "epoch": 2.3149584487534627, + "grad_norm": 0.7246792912483215, + "learning_rate": 3.39852666493139e-06, + "loss": 0.6401, + "step": 8357 + }, + { + "epoch": 2.3152354570637117, + "grad_norm": 0.7207006216049194, + "learning_rate": 3.398186712228984e-06, + "loss": 0.6507, + "step": 8358 + }, + { + "epoch": 2.315512465373961, + "grad_norm": 0.736382007598877, + "learning_rate": 3.3978467404557804e-06, + "loss": 0.622, + "step": 8359 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.7351194620132446, + "learning_rate": 3.3975067496189967e-06, + "loss": 0.568, + "step": 8360 + }, + { + "epoch": 2.3160664819944596, + "grad_norm": 0.7116756439208984, + "learning_rate": 3.397166739725854e-06, + "loss": 0.6029, + "step": 8361 + }, + { + "epoch": 2.316343490304709, + "grad_norm": 0.7074264883995056, + "learning_rate": 3.3968267107835694e-06, + "loss": 0.5407, + "step": 8362 + }, + { + "epoch": 2.3166204986149586, + "grad_norm": 0.7448917627334595, + "learning_rate": 3.396486662799364e-06, + "loss": 0.5873, + "step": 8363 + }, + { + "epoch": 2.3168975069252076, + "grad_norm": 0.7695679068565369, + "learning_rate": 3.3961465957804576e-06, + "loss": 0.6348, + "step": 8364 + }, + { + "epoch": 2.317174515235457, + "grad_norm": 0.7828521728515625, + "learning_rate": 3.3958065097340705e-06, + "loss": 0.6235, + "step": 8365 + }, + { + "epoch": 2.3174515235457065, + "grad_norm": 0.7109050154685974, + "learning_rate": 3.395466404667422e-06, + "loss": 0.6131, + "step": 8366 + }, + { + "epoch": 2.3177285318559555, + "grad_norm": 0.7334344983100891, + "learning_rate": 3.3951262805877366e-06, + "loss": 0.5505, + "step": 8367 + }, + { + "epoch": 2.318005540166205, + "grad_norm": 0.7734163999557495, + "learning_rate": 3.394786137502234e-06, + "loss": 0.5752, + "step": 8368 + }, + { + "epoch": 2.3182825484764544, + "grad_norm": 0.7226410508155823, + "learning_rate": 3.3944459754181358e-06, + "loss": 0.5992, + "step": 8369 + }, + { + "epoch": 2.3185595567867034, + "grad_norm": 0.7585699558258057, + "learning_rate": 3.3941057943426662e-06, + "loss": 0.5907, + "step": 8370 + }, + { + "epoch": 2.318836565096953, + "grad_norm": 0.7304117679595947, + "learning_rate": 3.393765594283048e-06, + "loss": 0.574, + "step": 8371 + }, + { + "epoch": 2.3191135734072024, + "grad_norm": 0.7299443483352661, + "learning_rate": 3.393425375246503e-06, + "loss": 0.5973, + "step": 8372 + }, + { + "epoch": 2.3193905817174514, + "grad_norm": 0.7117907404899597, + "learning_rate": 3.3930851372402553e-06, + "loss": 0.5968, + "step": 8373 + }, + { + "epoch": 2.319667590027701, + "grad_norm": 0.6997157335281372, + "learning_rate": 3.3927448802715294e-06, + "loss": 0.5612, + "step": 8374 + }, + { + "epoch": 2.3199445983379503, + "grad_norm": 0.7486206889152527, + "learning_rate": 3.3924046043475504e-06, + "loss": 0.6317, + "step": 8375 + }, + { + "epoch": 2.3202216066481993, + "grad_norm": 0.7381830811500549, + "learning_rate": 3.392064309475543e-06, + "loss": 0.6116, + "step": 8376 + }, + { + "epoch": 2.320498614958449, + "grad_norm": 0.7718245387077332, + "learning_rate": 3.3917239956627313e-06, + "loss": 0.6098, + "step": 8377 + }, + { + "epoch": 2.3207756232686982, + "grad_norm": 0.7560763359069824, + "learning_rate": 3.391383662916343e-06, + "loss": 0.6764, + "step": 8378 + }, + { + "epoch": 2.3210526315789473, + "grad_norm": 0.7376003861427307, + "learning_rate": 3.3910433112436023e-06, + "loss": 0.6091, + "step": 8379 + }, + { + "epoch": 2.3213296398891967, + "grad_norm": 0.7103112936019897, + "learning_rate": 3.3907029406517373e-06, + "loss": 0.6179, + "step": 8380 + }, + { + "epoch": 2.321606648199446, + "grad_norm": 0.7458259463310242, + "learning_rate": 3.3903625511479745e-06, + "loss": 0.6336, + "step": 8381 + }, + { + "epoch": 2.321883656509695, + "grad_norm": 0.763077437877655, + "learning_rate": 3.39002214273954e-06, + "loss": 0.6306, + "step": 8382 + }, + { + "epoch": 2.3221606648199447, + "grad_norm": 0.7884553670883179, + "learning_rate": 3.3896817154336633e-06, + "loss": 0.6052, + "step": 8383 + }, + { + "epoch": 2.322437673130194, + "grad_norm": 0.7037105560302734, + "learning_rate": 3.3893412692375716e-06, + "loss": 0.5635, + "step": 8384 + }, + { + "epoch": 2.322714681440443, + "grad_norm": 0.7140675783157349, + "learning_rate": 3.389000804158493e-06, + "loss": 0.617, + "step": 8385 + }, + { + "epoch": 2.3229916897506926, + "grad_norm": 0.7219676375389099, + "learning_rate": 3.388660320203657e-06, + "loss": 0.5544, + "step": 8386 + }, + { + "epoch": 2.323268698060942, + "grad_norm": 0.7285830974578857, + "learning_rate": 3.3883198173802945e-06, + "loss": 0.5882, + "step": 8387 + }, + { + "epoch": 2.323545706371191, + "grad_norm": 0.71585613489151, + "learning_rate": 3.3879792956956325e-06, + "loss": 0.6073, + "step": 8388 + }, + { + "epoch": 2.3238227146814405, + "grad_norm": 0.7456919550895691, + "learning_rate": 3.3876387551569027e-06, + "loss": 0.6009, + "step": 8389 + }, + { + "epoch": 2.32409972299169, + "grad_norm": 0.7833950519561768, + "learning_rate": 3.387298195771336e-06, + "loss": 0.5607, + "step": 8390 + }, + { + "epoch": 2.324376731301939, + "grad_norm": 0.7337169051170349, + "learning_rate": 3.386957617546162e-06, + "loss": 0.5897, + "step": 8391 + }, + { + "epoch": 2.3246537396121885, + "grad_norm": 0.7572849988937378, + "learning_rate": 3.386617020488613e-06, + "loss": 0.6423, + "step": 8392 + }, + { + "epoch": 2.3249307479224375, + "grad_norm": 0.7516975402832031, + "learning_rate": 3.38627640460592e-06, + "loss": 0.657, + "step": 8393 + }, + { + "epoch": 2.325207756232687, + "grad_norm": 0.764487624168396, + "learning_rate": 3.3859357699053165e-06, + "loss": 0.6049, + "step": 8394 + }, + { + "epoch": 2.3254847645429364, + "grad_norm": 0.717101514339447, + "learning_rate": 3.3855951163940333e-06, + "loss": 0.5653, + "step": 8395 + }, + { + "epoch": 2.3257617728531854, + "grad_norm": 0.6801470518112183, + "learning_rate": 3.385254444079304e-06, + "loss": 0.5768, + "step": 8396 + }, + { + "epoch": 2.326038781163435, + "grad_norm": 0.7541027665138245, + "learning_rate": 3.3849137529683627e-06, + "loss": 0.587, + "step": 8397 + }, + { + "epoch": 2.3263157894736843, + "grad_norm": 0.7836506962776184, + "learning_rate": 3.384573043068443e-06, + "loss": 0.5845, + "step": 8398 + }, + { + "epoch": 2.3265927977839334, + "grad_norm": 0.6979066729545593, + "learning_rate": 3.384232314386778e-06, + "loss": 0.5293, + "step": 8399 + }, + { + "epoch": 2.326869806094183, + "grad_norm": 0.749910295009613, + "learning_rate": 3.3838915669306038e-06, + "loss": 0.5824, + "step": 8400 + }, + { + "epoch": 2.3271468144044323, + "grad_norm": 0.71835857629776, + "learning_rate": 3.3835508007071534e-06, + "loss": 0.5732, + "step": 8401 + }, + { + "epoch": 2.3274238227146813, + "grad_norm": 0.742140531539917, + "learning_rate": 3.3832100157236635e-06, + "loss": 0.5952, + "step": 8402 + }, + { + "epoch": 2.3277008310249307, + "grad_norm": 0.7104704976081848, + "learning_rate": 3.3828692119873694e-06, + "loss": 0.5742, + "step": 8403 + }, + { + "epoch": 2.32797783933518, + "grad_norm": 0.7173059582710266, + "learning_rate": 3.3825283895055073e-06, + "loss": 0.6068, + "step": 8404 + }, + { + "epoch": 2.3282548476454292, + "grad_norm": 0.7052407264709473, + "learning_rate": 3.382187548285314e-06, + "loss": 0.5935, + "step": 8405 + }, + { + "epoch": 2.3285318559556787, + "grad_norm": 0.7657210826873779, + "learning_rate": 3.381846688334026e-06, + "loss": 0.6105, + "step": 8406 + }, + { + "epoch": 2.328808864265928, + "grad_norm": 0.7314922213554382, + "learning_rate": 3.381505809658881e-06, + "loss": 0.5613, + "step": 8407 + }, + { + "epoch": 2.329085872576177, + "grad_norm": 0.7196123600006104, + "learning_rate": 3.3811649122671166e-06, + "loss": 0.6182, + "step": 8408 + }, + { + "epoch": 2.3293628808864266, + "grad_norm": 0.7536889910697937, + "learning_rate": 3.38082399616597e-06, + "loss": 0.6235, + "step": 8409 + }, + { + "epoch": 2.329639889196676, + "grad_norm": 0.7460631132125854, + "learning_rate": 3.380483061362682e-06, + "loss": 0.6351, + "step": 8410 + }, + { + "epoch": 2.329916897506925, + "grad_norm": 0.7032564878463745, + "learning_rate": 3.380142107864489e-06, + "loss": 0.5941, + "step": 8411 + }, + { + "epoch": 2.3301939058171746, + "grad_norm": 0.7191113233566284, + "learning_rate": 3.379801135678632e-06, + "loss": 0.6226, + "step": 8412 + }, + { + "epoch": 2.3304709141274236, + "grad_norm": 0.7230985760688782, + "learning_rate": 3.3794601448123498e-06, + "loss": 0.601, + "step": 8413 + }, + { + "epoch": 2.330747922437673, + "grad_norm": 0.7542722225189209, + "learning_rate": 3.3791191352728823e-06, + "loss": 0.6325, + "step": 8414 + }, + { + "epoch": 2.3310249307479225, + "grad_norm": 0.7551451325416565, + "learning_rate": 3.3787781070674713e-06, + "loss": 0.6039, + "step": 8415 + }, + { + "epoch": 2.3313019390581715, + "grad_norm": 0.7307731509208679, + "learning_rate": 3.3784370602033572e-06, + "loss": 0.6207, + "step": 8416 + }, + { + "epoch": 2.331578947368421, + "grad_norm": 0.7245236039161682, + "learning_rate": 3.3780959946877806e-06, + "loss": 0.6177, + "step": 8417 + }, + { + "epoch": 2.3318559556786704, + "grad_norm": 0.7293622493743896, + "learning_rate": 3.3777549105279834e-06, + "loss": 0.6165, + "step": 8418 + }, + { + "epoch": 2.3321329639889194, + "grad_norm": 0.7345945239067078, + "learning_rate": 3.3774138077312087e-06, + "loss": 0.5866, + "step": 8419 + }, + { + "epoch": 2.332409972299169, + "grad_norm": 0.7424420118331909, + "learning_rate": 3.3770726863046973e-06, + "loss": 0.6303, + "step": 8420 + }, + { + "epoch": 2.3326869806094184, + "grad_norm": 0.7340677976608276, + "learning_rate": 3.376731546255693e-06, + "loss": 0.6184, + "step": 8421 + }, + { + "epoch": 2.3329639889196674, + "grad_norm": 0.6973327398300171, + "learning_rate": 3.37639038759144e-06, + "loss": 0.5948, + "step": 8422 + }, + { + "epoch": 2.333240997229917, + "grad_norm": 0.7196833491325378, + "learning_rate": 3.3760492103191806e-06, + "loss": 0.5884, + "step": 8423 + }, + { + "epoch": 2.3335180055401663, + "grad_norm": 0.7527026534080505, + "learning_rate": 3.3757080144461583e-06, + "loss": 0.5772, + "step": 8424 + }, + { + "epoch": 2.3337950138504153, + "grad_norm": 0.7155172228813171, + "learning_rate": 3.3753667999796193e-06, + "loss": 0.6154, + "step": 8425 + }, + { + "epoch": 2.334072022160665, + "grad_norm": 0.7029452323913574, + "learning_rate": 3.375025566926808e-06, + "loss": 0.6057, + "step": 8426 + }, + { + "epoch": 2.3343490304709142, + "grad_norm": 0.7369312644004822, + "learning_rate": 3.3746843152949687e-06, + "loss": 0.542, + "step": 8427 + }, + { + "epoch": 2.3346260387811633, + "grad_norm": 0.6887011528015137, + "learning_rate": 3.3743430450913485e-06, + "loss": 0.5605, + "step": 8428 + }, + { + "epoch": 2.3349030470914127, + "grad_norm": 0.7363905310630798, + "learning_rate": 3.374001756323192e-06, + "loss": 0.5791, + "step": 8429 + }, + { + "epoch": 2.335180055401662, + "grad_norm": 0.6778358817100525, + "learning_rate": 3.3736604489977465e-06, + "loss": 0.5713, + "step": 8430 + }, + { + "epoch": 2.335457063711911, + "grad_norm": 0.7384731769561768, + "learning_rate": 3.3733191231222577e-06, + "loss": 0.6085, + "step": 8431 + }, + { + "epoch": 2.3357340720221607, + "grad_norm": 0.7593647241592407, + "learning_rate": 3.3729777787039752e-06, + "loss": 0.6202, + "step": 8432 + }, + { + "epoch": 2.33601108033241, + "grad_norm": 0.7375938296318054, + "learning_rate": 3.372636415750144e-06, + "loss": 0.6186, + "step": 8433 + }, + { + "epoch": 2.336288088642659, + "grad_norm": 0.7485685348510742, + "learning_rate": 3.372295034268014e-06, + "loss": 0.5942, + "step": 8434 + }, + { + "epoch": 2.3365650969529086, + "grad_norm": 0.7516508102416992, + "learning_rate": 3.3719536342648323e-06, + "loss": 0.5925, + "step": 8435 + }, + { + "epoch": 2.336842105263158, + "grad_norm": 0.7607292532920837, + "learning_rate": 3.3716122157478488e-06, + "loss": 0.606, + "step": 8436 + }, + { + "epoch": 2.337119113573407, + "grad_norm": 0.7966906428337097, + "learning_rate": 3.371270778724311e-06, + "loss": 0.6508, + "step": 8437 + }, + { + "epoch": 2.3373961218836565, + "grad_norm": 0.7265411019325256, + "learning_rate": 3.3709293232014705e-06, + "loss": 0.6312, + "step": 8438 + }, + { + "epoch": 2.337673130193906, + "grad_norm": 0.7127956748008728, + "learning_rate": 3.370587849186576e-06, + "loss": 0.5349, + "step": 8439 + }, + { + "epoch": 2.337950138504155, + "grad_norm": 0.7615678310394287, + "learning_rate": 3.370246356686878e-06, + "loss": 0.6082, + "step": 8440 + }, + { + "epoch": 2.3382271468144045, + "grad_norm": 0.7224056124687195, + "learning_rate": 3.3699048457096283e-06, + "loss": 0.5952, + "step": 8441 + }, + { + "epoch": 2.338504155124654, + "grad_norm": 0.7433934807777405, + "learning_rate": 3.3695633162620765e-06, + "loss": 0.604, + "step": 8442 + }, + { + "epoch": 2.338781163434903, + "grad_norm": 0.767772912979126, + "learning_rate": 3.369221768351475e-06, + "loss": 0.6144, + "step": 8443 + }, + { + "epoch": 2.3390581717451524, + "grad_norm": 0.754507839679718, + "learning_rate": 3.3688802019850757e-06, + "loss": 0.6141, + "step": 8444 + }, + { + "epoch": 2.339335180055402, + "grad_norm": 0.7405969500541687, + "learning_rate": 3.368538617170131e-06, + "loss": 0.6131, + "step": 8445 + }, + { + "epoch": 2.339612188365651, + "grad_norm": 0.6835383176803589, + "learning_rate": 3.368197013913893e-06, + "loss": 0.5526, + "step": 8446 + }, + { + "epoch": 2.3398891966759003, + "grad_norm": 0.7214337587356567, + "learning_rate": 3.367855392223616e-06, + "loss": 0.6244, + "step": 8447 + }, + { + "epoch": 2.34016620498615, + "grad_norm": 0.714373767375946, + "learning_rate": 3.367513752106552e-06, + "loss": 0.5685, + "step": 8448 + }, + { + "epoch": 2.340443213296399, + "grad_norm": 0.7374280095100403, + "learning_rate": 3.3671720935699566e-06, + "loss": 0.6242, + "step": 8449 + }, + { + "epoch": 2.3407202216066483, + "grad_norm": 0.8040729761123657, + "learning_rate": 3.3668304166210817e-06, + "loss": 0.5767, + "step": 8450 + }, + { + "epoch": 2.3409972299168977, + "grad_norm": 0.765451967716217, + "learning_rate": 3.366488721267185e-06, + "loss": 0.6042, + "step": 8451 + }, + { + "epoch": 2.3412742382271468, + "grad_norm": 0.751152515411377, + "learning_rate": 3.366147007515519e-06, + "loss": 0.6059, + "step": 8452 + }, + { + "epoch": 2.341551246537396, + "grad_norm": 0.7184852361679077, + "learning_rate": 3.3658052753733406e-06, + "loss": 0.6025, + "step": 8453 + }, + { + "epoch": 2.3418282548476457, + "grad_norm": 0.7253008484840393, + "learning_rate": 3.3654635248479046e-06, + "loss": 0.6071, + "step": 8454 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.7163746953010559, + "learning_rate": 3.365121755946468e-06, + "loss": 0.6088, + "step": 8455 + }, + { + "epoch": 2.342382271468144, + "grad_norm": 0.7567439675331116, + "learning_rate": 3.3647799686762873e-06, + "loss": 0.5979, + "step": 8456 + }, + { + "epoch": 2.342659279778393, + "grad_norm": 0.7153251767158508, + "learning_rate": 3.36443816304462e-06, + "loss": 0.612, + "step": 8457 + }, + { + "epoch": 2.3429362880886426, + "grad_norm": 0.6974598169326782, + "learning_rate": 3.3640963390587223e-06, + "loss": 0.5867, + "step": 8458 + }, + { + "epoch": 2.343213296398892, + "grad_norm": 0.7750906348228455, + "learning_rate": 3.363754496725852e-06, + "loss": 0.5887, + "step": 8459 + }, + { + "epoch": 2.343490304709141, + "grad_norm": 0.69064861536026, + "learning_rate": 3.3634126360532694e-06, + "loss": 0.5824, + "step": 8460 + }, + { + "epoch": 2.3437673130193906, + "grad_norm": 0.7157204151153564, + "learning_rate": 3.3630707570482315e-06, + "loss": 0.5771, + "step": 8461 + }, + { + "epoch": 2.34404432132964, + "grad_norm": 0.7043836712837219, + "learning_rate": 3.362728859717996e-06, + "loss": 0.581, + "step": 8462 + }, + { + "epoch": 2.344321329639889, + "grad_norm": 0.7589282989501953, + "learning_rate": 3.3623869440698254e-06, + "loss": 0.5845, + "step": 8463 + }, + { + "epoch": 2.3445983379501385, + "grad_norm": 0.7066418528556824, + "learning_rate": 3.3620450101109767e-06, + "loss": 0.6094, + "step": 8464 + }, + { + "epoch": 2.344875346260388, + "grad_norm": 0.7088079452514648, + "learning_rate": 3.361703057848711e-06, + "loss": 0.5995, + "step": 8465 + }, + { + "epoch": 2.345152354570637, + "grad_norm": 0.7533881664276123, + "learning_rate": 3.361361087290289e-06, + "loss": 0.591, + "step": 8466 + }, + { + "epoch": 2.3454293628808864, + "grad_norm": 0.7128363251686096, + "learning_rate": 3.3610190984429713e-06, + "loss": 0.5998, + "step": 8467 + }, + { + "epoch": 2.345706371191136, + "grad_norm": 0.7468311190605164, + "learning_rate": 3.360677091314019e-06, + "loss": 0.6157, + "step": 8468 + }, + { + "epoch": 2.345983379501385, + "grad_norm": 0.7234650254249573, + "learning_rate": 3.360335065910695e-06, + "loss": 0.6295, + "step": 8469 + }, + { + "epoch": 2.3462603878116344, + "grad_norm": 0.7158196568489075, + "learning_rate": 3.35999302224026e-06, + "loss": 0.5866, + "step": 8470 + }, + { + "epoch": 2.346537396121884, + "grad_norm": 0.7302791476249695, + "learning_rate": 3.359650960309977e-06, + "loss": 0.5765, + "step": 8471 + }, + { + "epoch": 2.346814404432133, + "grad_norm": 0.7104723453521729, + "learning_rate": 3.3593088801271085e-06, + "loss": 0.5691, + "step": 8472 + }, + { + "epoch": 2.3470914127423823, + "grad_norm": 0.766150176525116, + "learning_rate": 3.358966781698919e-06, + "loss": 0.5465, + "step": 8473 + }, + { + "epoch": 2.3473684210526318, + "grad_norm": 0.7038397192955017, + "learning_rate": 3.3586246650326704e-06, + "loss": 0.588, + "step": 8474 + }, + { + "epoch": 2.347645429362881, + "grad_norm": 0.7408996820449829, + "learning_rate": 3.3582825301356274e-06, + "loss": 0.5938, + "step": 8475 + }, + { + "epoch": 2.3479224376731302, + "grad_norm": 0.7109339237213135, + "learning_rate": 3.3579403770150546e-06, + "loss": 0.5421, + "step": 8476 + }, + { + "epoch": 2.3481994459833793, + "grad_norm": 0.7604177594184875, + "learning_rate": 3.3575982056782167e-06, + "loss": 0.5848, + "step": 8477 + }, + { + "epoch": 2.3484764542936287, + "grad_norm": 0.7184967398643494, + "learning_rate": 3.3572560161323787e-06, + "loss": 0.5767, + "step": 8478 + }, + { + "epoch": 2.348753462603878, + "grad_norm": 0.7625906467437744, + "learning_rate": 3.3569138083848068e-06, + "loss": 0.5969, + "step": 8479 + }, + { + "epoch": 2.349030470914127, + "grad_norm": 0.6968985795974731, + "learning_rate": 3.356571582442766e-06, + "loss": 0.5782, + "step": 8480 + }, + { + "epoch": 2.3493074792243767, + "grad_norm": 0.7008188962936401, + "learning_rate": 3.356229338313524e-06, + "loss": 0.606, + "step": 8481 + }, + { + "epoch": 2.349584487534626, + "grad_norm": 0.7138946056365967, + "learning_rate": 3.355887076004345e-06, + "loss": 0.5827, + "step": 8482 + }, + { + "epoch": 2.349861495844875, + "grad_norm": 0.7512415051460266, + "learning_rate": 3.355544795522498e-06, + "loss": 0.6008, + "step": 8483 + }, + { + "epoch": 2.3501385041551246, + "grad_norm": 0.7460567355155945, + "learning_rate": 3.3552024968752504e-06, + "loss": 0.5968, + "step": 8484 + }, + { + "epoch": 2.350415512465374, + "grad_norm": 0.7085126638412476, + "learning_rate": 3.35486018006987e-06, + "loss": 0.6042, + "step": 8485 + }, + { + "epoch": 2.350692520775623, + "grad_norm": 0.7720244526863098, + "learning_rate": 3.3545178451136253e-06, + "loss": 0.6363, + "step": 8486 + }, + { + "epoch": 2.3509695290858725, + "grad_norm": 0.705949068069458, + "learning_rate": 3.354175492013784e-06, + "loss": 0.6077, + "step": 8487 + }, + { + "epoch": 2.351246537396122, + "grad_norm": 0.7648314833641052, + "learning_rate": 3.353833120777616e-06, + "loss": 0.5903, + "step": 8488 + }, + { + "epoch": 2.351523545706371, + "grad_norm": 0.7231563925743103, + "learning_rate": 3.35349073141239e-06, + "loss": 0.607, + "step": 8489 + }, + { + "epoch": 2.3518005540166205, + "grad_norm": 0.7560357451438904, + "learning_rate": 3.3531483239253765e-06, + "loss": 0.6346, + "step": 8490 + }, + { + "epoch": 2.35207756232687, + "grad_norm": 0.7295835018157959, + "learning_rate": 3.3528058983238447e-06, + "loss": 0.6109, + "step": 8491 + }, + { + "epoch": 2.352354570637119, + "grad_norm": 0.7257558703422546, + "learning_rate": 3.3524634546150673e-06, + "loss": 0.6057, + "step": 8492 + }, + { + "epoch": 2.3526315789473684, + "grad_norm": 0.7374874353408813, + "learning_rate": 3.3521209928063127e-06, + "loss": 0.5959, + "step": 8493 + }, + { + "epoch": 2.352908587257618, + "grad_norm": 0.7184003591537476, + "learning_rate": 3.3517785129048538e-06, + "loss": 0.562, + "step": 8494 + }, + { + "epoch": 2.353185595567867, + "grad_norm": 0.7235756516456604, + "learning_rate": 3.3514360149179616e-06, + "loss": 0.6034, + "step": 8495 + }, + { + "epoch": 2.3534626038781163, + "grad_norm": 0.7779785394668579, + "learning_rate": 3.3510934988529083e-06, + "loss": 0.6005, + "step": 8496 + }, + { + "epoch": 2.353739612188366, + "grad_norm": 0.7211990356445312, + "learning_rate": 3.3507509647169666e-06, + "loss": 0.5829, + "step": 8497 + }, + { + "epoch": 2.354016620498615, + "grad_norm": 0.7217147350311279, + "learning_rate": 3.35040841251741e-06, + "loss": 0.6246, + "step": 8498 + }, + { + "epoch": 2.3542936288088643, + "grad_norm": 0.7280838489532471, + "learning_rate": 3.3500658422615107e-06, + "loss": 0.6213, + "step": 8499 + }, + { + "epoch": 2.3545706371191137, + "grad_norm": 0.7369555234909058, + "learning_rate": 3.3497232539565414e-06, + "loss": 0.6067, + "step": 8500 + }, + { + "epoch": 2.3548476454293628, + "grad_norm": 0.71979159116745, + "learning_rate": 3.3493806476097797e-06, + "loss": 0.6007, + "step": 8501 + }, + { + "epoch": 2.355124653739612, + "grad_norm": 0.7194019556045532, + "learning_rate": 3.349038023228497e-06, + "loss": 0.5802, + "step": 8502 + }, + { + "epoch": 2.3554016620498617, + "grad_norm": 0.7042348980903625, + "learning_rate": 3.3486953808199685e-06, + "loss": 0.6007, + "step": 8503 + }, + { + "epoch": 2.3556786703601107, + "grad_norm": 0.7309350371360779, + "learning_rate": 3.3483527203914694e-06, + "loss": 0.6101, + "step": 8504 + }, + { + "epoch": 2.35595567867036, + "grad_norm": 0.796105146408081, + "learning_rate": 3.3480100419502764e-06, + "loss": 0.6098, + "step": 8505 + }, + { + "epoch": 2.3562326869806096, + "grad_norm": 0.7181329727172852, + "learning_rate": 3.347667345503664e-06, + "loss": 0.5815, + "step": 8506 + }, + { + "epoch": 2.3565096952908586, + "grad_norm": 0.7414490580558777, + "learning_rate": 3.3473246310589097e-06, + "loss": 0.5826, + "step": 8507 + }, + { + "epoch": 2.356786703601108, + "grad_norm": 0.7326338291168213, + "learning_rate": 3.3469818986232893e-06, + "loss": 0.6284, + "step": 8508 + }, + { + "epoch": 2.3570637119113576, + "grad_norm": 0.7372599244117737, + "learning_rate": 3.3466391482040807e-06, + "loss": 0.6585, + "step": 8509 + }, + { + "epoch": 2.3573407202216066, + "grad_norm": 0.7290738224983215, + "learning_rate": 3.34629637980856e-06, + "loss": 0.619, + "step": 8510 + }, + { + "epoch": 2.357617728531856, + "grad_norm": 0.7647110819816589, + "learning_rate": 3.345953593444007e-06, + "loss": 0.6146, + "step": 8511 + }, + { + "epoch": 2.3578947368421055, + "grad_norm": 0.7436761856079102, + "learning_rate": 3.345610789117698e-06, + "loss": 0.6048, + "step": 8512 + }, + { + "epoch": 2.3581717451523545, + "grad_norm": 0.7494994401931763, + "learning_rate": 3.3452679668369126e-06, + "loss": 0.6417, + "step": 8513 + }, + { + "epoch": 2.358448753462604, + "grad_norm": 0.7640016674995422, + "learning_rate": 3.34492512660893e-06, + "loss": 0.606, + "step": 8514 + }, + { + "epoch": 2.3587257617728534, + "grad_norm": 0.6900897026062012, + "learning_rate": 3.3445822684410295e-06, + "loss": 0.6021, + "step": 8515 + }, + { + "epoch": 2.3590027700831024, + "grad_norm": 0.7223372459411621, + "learning_rate": 3.34423939234049e-06, + "loss": 0.6225, + "step": 8516 + }, + { + "epoch": 2.359279778393352, + "grad_norm": 0.6969015598297119, + "learning_rate": 3.3438964983145928e-06, + "loss": 0.6112, + "step": 8517 + }, + { + "epoch": 2.359556786703601, + "grad_norm": 0.747728705406189, + "learning_rate": 3.343553586370618e-06, + "loss": 0.6208, + "step": 8518 + }, + { + "epoch": 2.3598337950138504, + "grad_norm": 0.7828081250190735, + "learning_rate": 3.3432106565158457e-06, + "loss": 0.596, + "step": 8519 + }, + { + "epoch": 2.3601108033241, + "grad_norm": 0.7545477747917175, + "learning_rate": 3.342867708757558e-06, + "loss": 0.6248, + "step": 8520 + }, + { + "epoch": 2.360387811634349, + "grad_norm": 0.7424569725990295, + "learning_rate": 3.3425247431030367e-06, + "loss": 0.596, + "step": 8521 + }, + { + "epoch": 2.3606648199445983, + "grad_norm": 0.7445389032363892, + "learning_rate": 3.3421817595595635e-06, + "loss": 0.6547, + "step": 8522 + }, + { + "epoch": 2.3609418282548478, + "grad_norm": 0.7122578620910645, + "learning_rate": 3.3418387581344204e-06, + "loss": 0.6121, + "step": 8523 + }, + { + "epoch": 2.361218836565097, + "grad_norm": 0.6893017888069153, + "learning_rate": 3.341495738834891e-06, + "loss": 0.5736, + "step": 8524 + }, + { + "epoch": 2.3614958448753463, + "grad_norm": 0.7213153839111328, + "learning_rate": 3.3411527016682575e-06, + "loss": 0.6046, + "step": 8525 + }, + { + "epoch": 2.3617728531855957, + "grad_norm": 0.7515587210655212, + "learning_rate": 3.340809646641805e-06, + "loss": 0.587, + "step": 8526 + }, + { + "epoch": 2.3620498614958447, + "grad_norm": 0.7319797873497009, + "learning_rate": 3.3404665737628165e-06, + "loss": 0.5814, + "step": 8527 + }, + { + "epoch": 2.362326869806094, + "grad_norm": 0.7438617944717407, + "learning_rate": 3.3401234830385753e-06, + "loss": 0.672, + "step": 8528 + }, + { + "epoch": 2.3626038781163436, + "grad_norm": 0.7243683338165283, + "learning_rate": 3.339780374476368e-06, + "loss": 0.5919, + "step": 8529 + }, + { + "epoch": 2.3628808864265927, + "grad_norm": 0.6928403377532959, + "learning_rate": 3.339437248083479e-06, + "loss": 0.5576, + "step": 8530 + }, + { + "epoch": 2.363157894736842, + "grad_norm": 0.7498994469642639, + "learning_rate": 3.339094103867193e-06, + "loss": 0.6324, + "step": 8531 + }, + { + "epoch": 2.3634349030470916, + "grad_norm": 0.7130890488624573, + "learning_rate": 3.3387509418347962e-06, + "loss": 0.589, + "step": 8532 + }, + { + "epoch": 2.3637119113573406, + "grad_norm": 0.7753434777259827, + "learning_rate": 3.338407761993575e-06, + "loss": 0.6037, + "step": 8533 + }, + { + "epoch": 2.36398891966759, + "grad_norm": 0.7679981589317322, + "learning_rate": 3.3380645643508165e-06, + "loss": 0.6238, + "step": 8534 + }, + { + "epoch": 2.3642659279778395, + "grad_norm": 0.7225174903869629, + "learning_rate": 3.3377213489138073e-06, + "loss": 0.5852, + "step": 8535 + }, + { + "epoch": 2.3645429362880885, + "grad_norm": 0.7681257724761963, + "learning_rate": 3.3373781156898333e-06, + "loss": 0.5804, + "step": 8536 + }, + { + "epoch": 2.364819944598338, + "grad_norm": 0.726750373840332, + "learning_rate": 3.337034864686185e-06, + "loss": 0.6111, + "step": 8537 + }, + { + "epoch": 2.365096952908587, + "grad_norm": 0.7270079255104065, + "learning_rate": 3.3366915959101478e-06, + "loss": 0.5785, + "step": 8538 + }, + { + "epoch": 2.3653739612188365, + "grad_norm": 0.7376360893249512, + "learning_rate": 3.3363483093690107e-06, + "loss": 0.6256, + "step": 8539 + }, + { + "epoch": 2.365650969529086, + "grad_norm": 0.7967438101768494, + "learning_rate": 3.3360050050700655e-06, + "loss": 0.6157, + "step": 8540 + }, + { + "epoch": 2.365927977839335, + "grad_norm": 0.7275079488754272, + "learning_rate": 3.3356616830205974e-06, + "loss": 0.6113, + "step": 8541 + }, + { + "epoch": 2.3662049861495844, + "grad_norm": 0.7312076091766357, + "learning_rate": 3.335318343227898e-06, + "loss": 0.5816, + "step": 8542 + }, + { + "epoch": 2.366481994459834, + "grad_norm": 0.7246550917625427, + "learning_rate": 3.3349749856992575e-06, + "loss": 0.574, + "step": 8543 + }, + { + "epoch": 2.366759002770083, + "grad_norm": 0.7144362926483154, + "learning_rate": 3.334631610441965e-06, + "loss": 0.6176, + "step": 8544 + }, + { + "epoch": 2.3670360110803323, + "grad_norm": 0.7541229724884033, + "learning_rate": 3.334288217463312e-06, + "loss": 0.5949, + "step": 8545 + }, + { + "epoch": 2.367313019390582, + "grad_norm": 0.7368307113647461, + "learning_rate": 3.333944806770591e-06, + "loss": 0.6445, + "step": 8546 + }, + { + "epoch": 2.367590027700831, + "grad_norm": 0.7364919781684875, + "learning_rate": 3.333601378371091e-06, + "loss": 0.5953, + "step": 8547 + }, + { + "epoch": 2.3678670360110803, + "grad_norm": 0.72370445728302, + "learning_rate": 3.333257932272105e-06, + "loss": 0.6163, + "step": 8548 + }, + { + "epoch": 2.3681440443213297, + "grad_norm": 0.711737871170044, + "learning_rate": 3.3329144684809257e-06, + "loss": 0.5909, + "step": 8549 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.7610693573951721, + "learning_rate": 3.3325709870048446e-06, + "loss": 0.6281, + "step": 8550 + }, + { + "epoch": 2.368698060941828, + "grad_norm": 0.7502447366714478, + "learning_rate": 3.332227487851155e-06, + "loss": 0.6048, + "step": 8551 + }, + { + "epoch": 2.3689750692520777, + "grad_norm": 0.7391875386238098, + "learning_rate": 3.331883971027151e-06, + "loss": 0.5574, + "step": 8552 + }, + { + "epoch": 2.3692520775623267, + "grad_norm": 0.7459149956703186, + "learning_rate": 3.331540436540126e-06, + "loss": 0.5775, + "step": 8553 + }, + { + "epoch": 2.369529085872576, + "grad_norm": 0.7106911540031433, + "learning_rate": 3.3311968843973738e-06, + "loss": 0.5932, + "step": 8554 + }, + { + "epoch": 2.3698060941828256, + "grad_norm": 0.7712510824203491, + "learning_rate": 3.330853314606189e-06, + "loss": 0.6059, + "step": 8555 + }, + { + "epoch": 2.3700831024930746, + "grad_norm": 0.7939676642417908, + "learning_rate": 3.3305097271738666e-06, + "loss": 0.5826, + "step": 8556 + }, + { + "epoch": 2.370360110803324, + "grad_norm": 0.7157027125358582, + "learning_rate": 3.3301661221077024e-06, + "loss": 0.5834, + "step": 8557 + }, + { + "epoch": 2.3706371191135736, + "grad_norm": 0.757570743560791, + "learning_rate": 3.329822499414991e-06, + "loss": 0.6023, + "step": 8558 + }, + { + "epoch": 2.3709141274238226, + "grad_norm": 0.7631760835647583, + "learning_rate": 3.329478859103029e-06, + "loss": 0.6508, + "step": 8559 + }, + { + "epoch": 2.371191135734072, + "grad_norm": 0.7067531943321228, + "learning_rate": 3.3291352011791116e-06, + "loss": 0.608, + "step": 8560 + }, + { + "epoch": 2.3714681440443215, + "grad_norm": 0.719627857208252, + "learning_rate": 3.3287915256505377e-06, + "loss": 0.6121, + "step": 8561 + }, + { + "epoch": 2.3717451523545705, + "grad_norm": 0.7045274376869202, + "learning_rate": 3.3284478325246027e-06, + "loss": 0.6026, + "step": 8562 + }, + { + "epoch": 2.37202216066482, + "grad_norm": 0.7519272565841675, + "learning_rate": 3.328104121808605e-06, + "loss": 0.5889, + "step": 8563 + }, + { + "epoch": 2.3722991689750694, + "grad_norm": 0.7602554559707642, + "learning_rate": 3.3277603935098413e-06, + "loss": 0.5731, + "step": 8564 + }, + { + "epoch": 2.3725761772853184, + "grad_norm": 0.7468898296356201, + "learning_rate": 3.3274166476356106e-06, + "loss": 0.5784, + "step": 8565 + }, + { + "epoch": 2.372853185595568, + "grad_norm": 0.7231388688087463, + "learning_rate": 3.3270728841932122e-06, + "loss": 0.6044, + "step": 8566 + }, + { + "epoch": 2.3731301939058174, + "grad_norm": 0.742412269115448, + "learning_rate": 3.326729103189944e-06, + "loss": 0.6142, + "step": 8567 + }, + { + "epoch": 2.3734072022160664, + "grad_norm": 0.7619256377220154, + "learning_rate": 3.326385304633105e-06, + "loss": 0.6047, + "step": 8568 + }, + { + "epoch": 2.373684210526316, + "grad_norm": 0.7784067392349243, + "learning_rate": 3.3260414885299967e-06, + "loss": 0.6268, + "step": 8569 + }, + { + "epoch": 2.3739612188365653, + "grad_norm": 0.7004120349884033, + "learning_rate": 3.3256976548879183e-06, + "loss": 0.5829, + "step": 8570 + }, + { + "epoch": 2.3742382271468143, + "grad_norm": 0.7318438291549683, + "learning_rate": 3.3253538037141697e-06, + "loss": 0.5994, + "step": 8571 + }, + { + "epoch": 2.3745152354570638, + "grad_norm": 0.755072832107544, + "learning_rate": 3.3250099350160518e-06, + "loss": 0.6282, + "step": 8572 + }, + { + "epoch": 2.3747922437673132, + "grad_norm": 0.7150866389274597, + "learning_rate": 3.324666048800866e-06, + "loss": 0.6095, + "step": 8573 + }, + { + "epoch": 2.3750692520775623, + "grad_norm": 0.7552684545516968, + "learning_rate": 3.3243221450759143e-06, + "loss": 0.6406, + "step": 8574 + }, + { + "epoch": 2.3753462603878117, + "grad_norm": 0.7518280148506165, + "learning_rate": 3.3239782238484985e-06, + "loss": 0.5867, + "step": 8575 + }, + { + "epoch": 2.375623268698061, + "grad_norm": 0.74309241771698, + "learning_rate": 3.3236342851259208e-06, + "loss": 0.5983, + "step": 8576 + }, + { + "epoch": 2.37590027700831, + "grad_norm": 0.7382872700691223, + "learning_rate": 3.3232903289154835e-06, + "loss": 0.609, + "step": 8577 + }, + { + "epoch": 2.3761772853185597, + "grad_norm": 0.7038781642913818, + "learning_rate": 3.3229463552244905e-06, + "loss": 0.5662, + "step": 8578 + }, + { + "epoch": 2.376454293628809, + "grad_norm": 0.7423188090324402, + "learning_rate": 3.3226023640602446e-06, + "loss": 0.5867, + "step": 8579 + }, + { + "epoch": 2.376731301939058, + "grad_norm": 0.7664845585823059, + "learning_rate": 3.32225835543005e-06, + "loss": 0.5634, + "step": 8580 + }, + { + "epoch": 2.3770083102493076, + "grad_norm": 0.7739835977554321, + "learning_rate": 3.3219143293412104e-06, + "loss": 0.617, + "step": 8581 + }, + { + "epoch": 2.3772853185595566, + "grad_norm": 0.727225661277771, + "learning_rate": 3.3215702858010308e-06, + "loss": 0.6212, + "step": 8582 + }, + { + "epoch": 2.377562326869806, + "grad_norm": 0.7422259449958801, + "learning_rate": 3.321226224816816e-06, + "loss": 0.596, + "step": 8583 + }, + { + "epoch": 2.3778393351800555, + "grad_norm": 0.788818359375, + "learning_rate": 3.320882146395871e-06, + "loss": 0.6313, + "step": 8584 + }, + { + "epoch": 2.3781163434903045, + "grad_norm": 0.7638080716133118, + "learning_rate": 3.320538050545502e-06, + "loss": 0.5853, + "step": 8585 + }, + { + "epoch": 2.378393351800554, + "grad_norm": 0.6938989162445068, + "learning_rate": 3.3201939372730147e-06, + "loss": 0.5933, + "step": 8586 + }, + { + "epoch": 2.3786703601108035, + "grad_norm": 0.7522276639938354, + "learning_rate": 3.3198498065857152e-06, + "loss": 0.6793, + "step": 8587 + }, + { + "epoch": 2.3789473684210525, + "grad_norm": 0.7383003234863281, + "learning_rate": 3.319505658490911e-06, + "loss": 0.5879, + "step": 8588 + }, + { + "epoch": 2.379224376731302, + "grad_norm": 0.7505053281784058, + "learning_rate": 3.319161492995909e-06, + "loss": 0.5962, + "step": 8589 + }, + { + "epoch": 2.3795013850415514, + "grad_norm": 0.7599852085113525, + "learning_rate": 3.318817310108017e-06, + "loss": 0.6705, + "step": 8590 + }, + { + "epoch": 2.3797783933518004, + "grad_norm": 0.7736787796020508, + "learning_rate": 3.3184731098345424e-06, + "loss": 0.6063, + "step": 8591 + }, + { + "epoch": 2.38005540166205, + "grad_norm": 0.7720979452133179, + "learning_rate": 3.3181288921827925e-06, + "loss": 0.6008, + "step": 8592 + }, + { + "epoch": 2.3803324099722993, + "grad_norm": 0.7370125651359558, + "learning_rate": 3.3177846571600773e-06, + "loss": 0.6201, + "step": 8593 + }, + { + "epoch": 2.3806094182825484, + "grad_norm": 0.7008771896362305, + "learning_rate": 3.317440404773706e-06, + "loss": 0.5812, + "step": 8594 + }, + { + "epoch": 2.380886426592798, + "grad_norm": 0.770255446434021, + "learning_rate": 3.3170961350309866e-06, + "loss": 0.5996, + "step": 8595 + }, + { + "epoch": 2.3811634349030473, + "grad_norm": 0.6982191801071167, + "learning_rate": 3.31675184793923e-06, + "loss": 0.515, + "step": 8596 + }, + { + "epoch": 2.3814404432132963, + "grad_norm": 0.7257846593856812, + "learning_rate": 3.3164075435057465e-06, + "loss": 0.628, + "step": 8597 + }, + { + "epoch": 2.3817174515235457, + "grad_norm": 0.7340461611747742, + "learning_rate": 3.316063221737845e-06, + "loss": 0.5919, + "step": 8598 + }, + { + "epoch": 2.381994459833795, + "grad_norm": 0.7259990572929382, + "learning_rate": 3.315718882642838e-06, + "loss": 0.6031, + "step": 8599 + }, + { + "epoch": 2.3822714681440442, + "grad_norm": 0.713935911655426, + "learning_rate": 3.3153745262280363e-06, + "loss": 0.6512, + "step": 8600 + }, + { + "epoch": 2.3825484764542937, + "grad_norm": 0.7421038746833801, + "learning_rate": 3.31503015250075e-06, + "loss": 0.6127, + "step": 8601 + }, + { + "epoch": 2.3828254847645427, + "grad_norm": 0.7182706594467163, + "learning_rate": 3.3146857614682933e-06, + "loss": 0.6394, + "step": 8602 + }, + { + "epoch": 2.383102493074792, + "grad_norm": 0.7398244738578796, + "learning_rate": 3.3143413531379763e-06, + "loss": 0.5603, + "step": 8603 + }, + { + "epoch": 2.3833795013850416, + "grad_norm": 0.717022716999054, + "learning_rate": 3.313996927517114e-06, + "loss": 0.6001, + "step": 8604 + }, + { + "epoch": 2.3836565096952906, + "grad_norm": 0.7280988097190857, + "learning_rate": 3.313652484613017e-06, + "loss": 0.625, + "step": 8605 + }, + { + "epoch": 2.38393351800554, + "grad_norm": 0.7831609845161438, + "learning_rate": 3.3133080244330007e-06, + "loss": 0.6018, + "step": 8606 + }, + { + "epoch": 2.3842105263157896, + "grad_norm": 0.7226575016975403, + "learning_rate": 3.312963546984378e-06, + "loss": 0.5894, + "step": 8607 + }, + { + "epoch": 2.3844875346260386, + "grad_norm": 0.7334833145141602, + "learning_rate": 3.312619052274463e-06, + "loss": 0.6033, + "step": 8608 + }, + { + "epoch": 2.384764542936288, + "grad_norm": 0.6648973226547241, + "learning_rate": 3.3122745403105706e-06, + "loss": 0.5626, + "step": 8609 + }, + { + "epoch": 2.3850415512465375, + "grad_norm": 0.7012839317321777, + "learning_rate": 3.3119300111000152e-06, + "loss": 0.5786, + "step": 8610 + }, + { + "epoch": 2.3853185595567865, + "grad_norm": 0.7229165434837341, + "learning_rate": 3.311585464650112e-06, + "loss": 0.5846, + "step": 8611 + }, + { + "epoch": 2.385595567867036, + "grad_norm": 0.6839649677276611, + "learning_rate": 3.3112409009681767e-06, + "loss": 0.6118, + "step": 8612 + }, + { + "epoch": 2.3858725761772854, + "grad_norm": 0.751091718673706, + "learning_rate": 3.3108963200615265e-06, + "loss": 0.5951, + "step": 8613 + }, + { + "epoch": 2.3861495844875344, + "grad_norm": 0.7311224341392517, + "learning_rate": 3.310551721937475e-06, + "loss": 0.6278, + "step": 8614 + }, + { + "epoch": 2.386426592797784, + "grad_norm": 0.7401997447013855, + "learning_rate": 3.3102071066033414e-06, + "loss": 0.6383, + "step": 8615 + }, + { + "epoch": 2.3867036011080334, + "grad_norm": 0.7039486169815063, + "learning_rate": 3.309862474066442e-06, + "loss": 0.6233, + "step": 8616 + }, + { + "epoch": 2.3869806094182824, + "grad_norm": 0.7550800442695618, + "learning_rate": 3.309517824334094e-06, + "loss": 0.6055, + "step": 8617 + }, + { + "epoch": 2.387257617728532, + "grad_norm": 0.7891804575920105, + "learning_rate": 3.309173157413615e-06, + "loss": 0.6289, + "step": 8618 + }, + { + "epoch": 2.3875346260387813, + "grad_norm": 0.7476217150688171, + "learning_rate": 3.3088284733123233e-06, + "loss": 0.6219, + "step": 8619 + }, + { + "epoch": 2.3878116343490303, + "grad_norm": 0.7575064897537231, + "learning_rate": 3.3084837720375386e-06, + "loss": 0.5688, + "step": 8620 + }, + { + "epoch": 2.38808864265928, + "grad_norm": 0.7405401468276978, + "learning_rate": 3.308139053596578e-06, + "loss": 0.6082, + "step": 8621 + }, + { + "epoch": 2.3883656509695292, + "grad_norm": 0.7421559691429138, + "learning_rate": 3.307794317996762e-06, + "loss": 0.5862, + "step": 8622 + }, + { + "epoch": 2.3886426592797783, + "grad_norm": 0.7410457134246826, + "learning_rate": 3.3074495652454087e-06, + "loss": 0.6485, + "step": 8623 + }, + { + "epoch": 2.3889196675900277, + "grad_norm": 0.7029555439949036, + "learning_rate": 3.3071047953498396e-06, + "loss": 0.5818, + "step": 8624 + }, + { + "epoch": 2.389196675900277, + "grad_norm": 0.7657647132873535, + "learning_rate": 3.3067600083173752e-06, + "loss": 0.6682, + "step": 8625 + }, + { + "epoch": 2.389473684210526, + "grad_norm": 0.7279041409492493, + "learning_rate": 3.3064152041553356e-06, + "loss": 0.5996, + "step": 8626 + }, + { + "epoch": 2.3897506925207757, + "grad_norm": 0.7268471717834473, + "learning_rate": 3.3060703828710407e-06, + "loss": 0.5638, + "step": 8627 + }, + { + "epoch": 2.390027700831025, + "grad_norm": 0.7710117697715759, + "learning_rate": 3.3057255444718137e-06, + "loss": 0.5929, + "step": 8628 + }, + { + "epoch": 2.390304709141274, + "grad_norm": 0.7515334486961365, + "learning_rate": 3.3053806889649763e-06, + "loss": 0.6153, + "step": 8629 + }, + { + "epoch": 2.3905817174515236, + "grad_norm": 0.7906890511512756, + "learning_rate": 3.30503581635785e-06, + "loss": 0.603, + "step": 8630 + }, + { + "epoch": 2.390858725761773, + "grad_norm": 0.743398904800415, + "learning_rate": 3.304690926657757e-06, + "loss": 0.602, + "step": 8631 + }, + { + "epoch": 2.391135734072022, + "grad_norm": 0.7293583154678345, + "learning_rate": 3.3043460198720213e-06, + "loss": 0.5593, + "step": 8632 + }, + { + "epoch": 2.3914127423822715, + "grad_norm": 0.7299348711967468, + "learning_rate": 3.304001096007965e-06, + "loss": 0.6329, + "step": 8633 + }, + { + "epoch": 2.391689750692521, + "grad_norm": 0.7271016240119934, + "learning_rate": 3.3036561550729123e-06, + "loss": 0.5929, + "step": 8634 + }, + { + "epoch": 2.39196675900277, + "grad_norm": 0.7364197373390198, + "learning_rate": 3.303311197074188e-06, + "loss": 0.6214, + "step": 8635 + }, + { + "epoch": 2.3922437673130195, + "grad_norm": 0.7587757110595703, + "learning_rate": 3.3029662220191146e-06, + "loss": 0.6121, + "step": 8636 + }, + { + "epoch": 2.392520775623269, + "grad_norm": 0.7601677775382996, + "learning_rate": 3.3026212299150173e-06, + "loss": 0.6149, + "step": 8637 + }, + { + "epoch": 2.392797783933518, + "grad_norm": 0.7155644297599792, + "learning_rate": 3.3022762207692222e-06, + "loss": 0.576, + "step": 8638 + }, + { + "epoch": 2.3930747922437674, + "grad_norm": 0.7605409622192383, + "learning_rate": 3.301931194589054e-06, + "loss": 0.6194, + "step": 8639 + }, + { + "epoch": 2.393351800554017, + "grad_norm": 0.7717528939247131, + "learning_rate": 3.3015861513818388e-06, + "loss": 0.5927, + "step": 8640 + }, + { + "epoch": 2.393628808864266, + "grad_norm": 0.7234730124473572, + "learning_rate": 3.3012410911549023e-06, + "loss": 0.6205, + "step": 8641 + }, + { + "epoch": 2.3939058171745153, + "grad_norm": 0.7156244516372681, + "learning_rate": 3.300896013915572e-06, + "loss": 0.6189, + "step": 8642 + }, + { + "epoch": 2.3941828254847644, + "grad_norm": 0.7031002044677734, + "learning_rate": 3.300550919671174e-06, + "loss": 0.6049, + "step": 8643 + }, + { + "epoch": 2.394459833795014, + "grad_norm": 0.7399904131889343, + "learning_rate": 3.3002058084290345e-06, + "loss": 0.6305, + "step": 8644 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.7451629638671875, + "learning_rate": 3.2998606801964837e-06, + "loss": 0.6022, + "step": 8645 + }, + { + "epoch": 2.3950138504155123, + "grad_norm": 0.7191906571388245, + "learning_rate": 3.2995155349808474e-06, + "loss": 0.636, + "step": 8646 + }, + { + "epoch": 2.3952908587257618, + "grad_norm": 0.6923609972000122, + "learning_rate": 3.299170372789454e-06, + "loss": 0.5295, + "step": 8647 + }, + { + "epoch": 2.395567867036011, + "grad_norm": 0.7502071857452393, + "learning_rate": 3.2988251936296334e-06, + "loss": 0.6359, + "step": 8648 + }, + { + "epoch": 2.3958448753462602, + "grad_norm": 0.7677696943283081, + "learning_rate": 3.298479997508713e-06, + "loss": 0.6039, + "step": 8649 + }, + { + "epoch": 2.3961218836565097, + "grad_norm": 0.7665891051292419, + "learning_rate": 3.2981347844340237e-06, + "loss": 0.5768, + "step": 8650 + }, + { + "epoch": 2.396398891966759, + "grad_norm": 0.7692534327507019, + "learning_rate": 3.2977895544128955e-06, + "loss": 0.6114, + "step": 8651 + }, + { + "epoch": 2.396675900277008, + "grad_norm": 0.7409048676490784, + "learning_rate": 3.2974443074526572e-06, + "loss": 0.5829, + "step": 8652 + }, + { + "epoch": 2.3969529085872576, + "grad_norm": 0.7364702224731445, + "learning_rate": 3.2970990435606394e-06, + "loss": 0.6205, + "step": 8653 + }, + { + "epoch": 2.397229916897507, + "grad_norm": 0.7530790567398071, + "learning_rate": 3.296753762744174e-06, + "loss": 0.5833, + "step": 8654 + }, + { + "epoch": 2.397506925207756, + "grad_norm": 0.7629604339599609, + "learning_rate": 3.296408465010591e-06, + "loss": 0.5572, + "step": 8655 + }, + { + "epoch": 2.3977839335180056, + "grad_norm": 0.7859547734260559, + "learning_rate": 3.2960631503672223e-06, + "loss": 0.647, + "step": 8656 + }, + { + "epoch": 2.398060941828255, + "grad_norm": 0.7414343953132629, + "learning_rate": 3.2957178188214006e-06, + "loss": 0.5865, + "step": 8657 + }, + { + "epoch": 2.398337950138504, + "grad_norm": 0.6949269771575928, + "learning_rate": 3.2953724703804572e-06, + "loss": 0.5801, + "step": 8658 + }, + { + "epoch": 2.3986149584487535, + "grad_norm": 0.7935546636581421, + "learning_rate": 3.295027105051725e-06, + "loss": 0.6344, + "step": 8659 + }, + { + "epoch": 2.398891966759003, + "grad_norm": 0.767785906791687, + "learning_rate": 3.2946817228425377e-06, + "loss": 0.6063, + "step": 8660 + }, + { + "epoch": 2.399168975069252, + "grad_norm": 0.7153326272964478, + "learning_rate": 3.294336323760228e-06, + "loss": 0.5846, + "step": 8661 + }, + { + "epoch": 2.3994459833795014, + "grad_norm": 0.7636194229125977, + "learning_rate": 3.293990907812129e-06, + "loss": 0.6472, + "step": 8662 + }, + { + "epoch": 2.3997229916897505, + "grad_norm": 0.7240091562271118, + "learning_rate": 3.293645475005575e-06, + "loss": 0.5596, + "step": 8663 + }, + { + "epoch": 2.4, + "grad_norm": 0.7160698771476746, + "learning_rate": 3.293300025347902e-06, + "loss": 0.5677, + "step": 8664 + }, + { + "epoch": 2.4002770083102494, + "grad_norm": 0.7526683807373047, + "learning_rate": 3.2929545588464428e-06, + "loss": 0.5953, + "step": 8665 + }, + { + "epoch": 2.4005540166204984, + "grad_norm": 0.7347311973571777, + "learning_rate": 3.292609075508533e-06, + "loss": 0.6312, + "step": 8666 + }, + { + "epoch": 2.400831024930748, + "grad_norm": 0.6851674914360046, + "learning_rate": 3.29226357534151e-06, + "loss": 0.5971, + "step": 8667 + }, + { + "epoch": 2.4011080332409973, + "grad_norm": 0.7400670051574707, + "learning_rate": 3.2919180583527065e-06, + "loss": 0.6272, + "step": 8668 + }, + { + "epoch": 2.4013850415512463, + "grad_norm": 0.7210569977760315, + "learning_rate": 3.291572524549461e-06, + "loss": 0.6248, + "step": 8669 + }, + { + "epoch": 2.401662049861496, + "grad_norm": 0.7295917272567749, + "learning_rate": 3.291226973939109e-06, + "loss": 0.575, + "step": 8670 + }, + { + "epoch": 2.4019390581717452, + "grad_norm": 0.7628531455993652, + "learning_rate": 3.2908814065289884e-06, + "loss": 0.6255, + "step": 8671 + }, + { + "epoch": 2.4022160664819943, + "grad_norm": 0.7158731818199158, + "learning_rate": 3.290535822326435e-06, + "loss": 0.6073, + "step": 8672 + }, + { + "epoch": 2.4024930747922437, + "grad_norm": 0.7516884207725525, + "learning_rate": 3.2901902213387876e-06, + "loss": 0.6121, + "step": 8673 + }, + { + "epoch": 2.402770083102493, + "grad_norm": 0.7457286715507507, + "learning_rate": 3.2898446035733845e-06, + "loss": 0.6163, + "step": 8674 + }, + { + "epoch": 2.403047091412742, + "grad_norm": 0.7692743539810181, + "learning_rate": 3.289498969037563e-06, + "loss": 0.6117, + "step": 8675 + }, + { + "epoch": 2.4033240997229917, + "grad_norm": 0.7491902709007263, + "learning_rate": 3.289153317738662e-06, + "loss": 0.6537, + "step": 8676 + }, + { + "epoch": 2.403601108033241, + "grad_norm": 0.730591893196106, + "learning_rate": 3.288807649684021e-06, + "loss": 0.6004, + "step": 8677 + }, + { + "epoch": 2.40387811634349, + "grad_norm": 0.7152724862098694, + "learning_rate": 3.288461964880979e-06, + "loss": 0.5616, + "step": 8678 + }, + { + "epoch": 2.4041551246537396, + "grad_norm": 0.7206489443778992, + "learning_rate": 3.2881162633368757e-06, + "loss": 0.5998, + "step": 8679 + }, + { + "epoch": 2.404432132963989, + "grad_norm": 0.7845282554626465, + "learning_rate": 3.2877705450590525e-06, + "loss": 0.5912, + "step": 8680 + }, + { + "epoch": 2.404709141274238, + "grad_norm": 0.7430789470672607, + "learning_rate": 3.287424810054848e-06, + "loss": 0.619, + "step": 8681 + }, + { + "epoch": 2.4049861495844875, + "grad_norm": 0.7434074878692627, + "learning_rate": 3.2870790583316035e-06, + "loss": 0.6069, + "step": 8682 + }, + { + "epoch": 2.405263157894737, + "grad_norm": 0.7171652913093567, + "learning_rate": 3.286733289896662e-06, + "loss": 0.6173, + "step": 8683 + }, + { + "epoch": 2.405540166204986, + "grad_norm": 0.7580751180648804, + "learning_rate": 3.286387504757363e-06, + "loss": 0.59, + "step": 8684 + }, + { + "epoch": 2.4058171745152355, + "grad_norm": 0.7650940418243408, + "learning_rate": 3.286041702921048e-06, + "loss": 0.589, + "step": 8685 + }, + { + "epoch": 2.406094182825485, + "grad_norm": 0.7545360922813416, + "learning_rate": 3.2856958843950626e-06, + "loss": 0.5474, + "step": 8686 + }, + { + "epoch": 2.406371191135734, + "grad_norm": 0.7186492681503296, + "learning_rate": 3.2853500491867452e-06, + "loss": 0.6071, + "step": 8687 + }, + { + "epoch": 2.4066481994459834, + "grad_norm": 0.742397665977478, + "learning_rate": 3.2850041973034417e-06, + "loss": 0.6075, + "step": 8688 + }, + { + "epoch": 2.406925207756233, + "grad_norm": 0.6936323046684265, + "learning_rate": 3.284658328752494e-06, + "loss": 0.5361, + "step": 8689 + }, + { + "epoch": 2.407202216066482, + "grad_norm": 0.7993597388267517, + "learning_rate": 3.2843124435412464e-06, + "loss": 0.598, + "step": 8690 + }, + { + "epoch": 2.4074792243767313, + "grad_norm": 0.738192617893219, + "learning_rate": 3.2839665416770422e-06, + "loss": 0.6012, + "step": 8691 + }, + { + "epoch": 2.407756232686981, + "grad_norm": 0.7682150602340698, + "learning_rate": 3.283620623167227e-06, + "loss": 0.5988, + "step": 8692 + }, + { + "epoch": 2.40803324099723, + "grad_norm": 0.6933428049087524, + "learning_rate": 3.2832746880191447e-06, + "loss": 0.6201, + "step": 8693 + }, + { + "epoch": 2.4083102493074793, + "grad_norm": 0.7243341207504272, + "learning_rate": 3.2829287362401398e-06, + "loss": 0.5975, + "step": 8694 + }, + { + "epoch": 2.4085872576177287, + "grad_norm": 0.7616162300109863, + "learning_rate": 3.282582767837559e-06, + "loss": 0.5779, + "step": 8695 + }, + { + "epoch": 2.4088642659279778, + "grad_norm": 0.7368990182876587, + "learning_rate": 3.2822367828187475e-06, + "loss": 0.5882, + "step": 8696 + }, + { + "epoch": 2.409141274238227, + "grad_norm": 0.7216224670410156, + "learning_rate": 3.281890781191051e-06, + "loss": 0.5886, + "step": 8697 + }, + { + "epoch": 2.4094182825484767, + "grad_norm": 0.739101767539978, + "learning_rate": 3.2815447629618167e-06, + "loss": 0.6104, + "step": 8698 + }, + { + "epoch": 2.4096952908587257, + "grad_norm": 0.7256513833999634, + "learning_rate": 3.281198728138392e-06, + "loss": 0.5413, + "step": 8699 + }, + { + "epoch": 2.409972299168975, + "grad_norm": 0.7555835247039795, + "learning_rate": 3.2808526767281223e-06, + "loss": 0.6377, + "step": 8700 + }, + { + "epoch": 2.4102493074792246, + "grad_norm": 0.7214328050613403, + "learning_rate": 3.280506608738357e-06, + "loss": 0.5582, + "step": 8701 + }, + { + "epoch": 2.4105263157894736, + "grad_norm": 0.7244924306869507, + "learning_rate": 3.2801605241764432e-06, + "loss": 0.6037, + "step": 8702 + }, + { + "epoch": 2.410803324099723, + "grad_norm": 0.7230759859085083, + "learning_rate": 3.2798144230497287e-06, + "loss": 0.6108, + "step": 8703 + }, + { + "epoch": 2.4110803324099725, + "grad_norm": 0.7757418751716614, + "learning_rate": 3.2794683053655623e-06, + "loss": 0.6112, + "step": 8704 + }, + { + "epoch": 2.4113573407202216, + "grad_norm": 0.756781816482544, + "learning_rate": 3.2791221711312945e-06, + "loss": 0.5969, + "step": 8705 + }, + { + "epoch": 2.411634349030471, + "grad_norm": 0.7342284917831421, + "learning_rate": 3.2787760203542716e-06, + "loss": 0.6036, + "step": 8706 + }, + { + "epoch": 2.41191135734072, + "grad_norm": 0.7624217867851257, + "learning_rate": 3.2784298530418464e-06, + "loss": 0.6183, + "step": 8707 + }, + { + "epoch": 2.4121883656509695, + "grad_norm": 0.7700241208076477, + "learning_rate": 3.278083669201367e-06, + "loss": 0.5969, + "step": 8708 + }, + { + "epoch": 2.412465373961219, + "grad_norm": 0.7754845023155212, + "learning_rate": 3.277737468840184e-06, + "loss": 0.63, + "step": 8709 + }, + { + "epoch": 2.412742382271468, + "grad_norm": 0.7279965877532959, + "learning_rate": 3.277391251965649e-06, + "loss": 0.601, + "step": 8710 + }, + { + "epoch": 2.4130193905817174, + "grad_norm": 0.7578972578048706, + "learning_rate": 3.2770450185851123e-06, + "loss": 0.6033, + "step": 8711 + }, + { + "epoch": 2.413296398891967, + "grad_norm": 0.7431296110153198, + "learning_rate": 3.2766987687059254e-06, + "loss": 0.6556, + "step": 8712 + }, + { + "epoch": 2.413573407202216, + "grad_norm": 0.7281125783920288, + "learning_rate": 3.27635250233544e-06, + "loss": 0.6182, + "step": 8713 + }, + { + "epoch": 2.4138504155124654, + "grad_norm": 0.7503303289413452, + "learning_rate": 3.2760062194810083e-06, + "loss": 0.6364, + "step": 8714 + }, + { + "epoch": 2.414127423822715, + "grad_norm": 0.7167266607284546, + "learning_rate": 3.2756599201499834e-06, + "loss": 0.635, + "step": 8715 + }, + { + "epoch": 2.414404432132964, + "grad_norm": 0.7465100288391113, + "learning_rate": 3.2753136043497175e-06, + "loss": 0.5931, + "step": 8716 + }, + { + "epoch": 2.4146814404432133, + "grad_norm": 0.7193043828010559, + "learning_rate": 3.2749672720875636e-06, + "loss": 0.6033, + "step": 8717 + }, + { + "epoch": 2.4149584487534628, + "grad_norm": 0.7016171216964722, + "learning_rate": 3.274620923370876e-06, + "loss": 0.5477, + "step": 8718 + }, + { + "epoch": 2.415235457063712, + "grad_norm": 0.7321090698242188, + "learning_rate": 3.2742745582070068e-06, + "loss": 0.588, + "step": 8719 + }, + { + "epoch": 2.4155124653739612, + "grad_norm": 0.7451436519622803, + "learning_rate": 3.2739281766033126e-06, + "loss": 0.6086, + "step": 8720 + }, + { + "epoch": 2.4157894736842107, + "grad_norm": 0.7388331294059753, + "learning_rate": 3.2735817785671466e-06, + "loss": 0.5931, + "step": 8721 + }, + { + "epoch": 2.4160664819944597, + "grad_norm": 0.7376699447631836, + "learning_rate": 3.2732353641058633e-06, + "loss": 0.5939, + "step": 8722 + }, + { + "epoch": 2.416343490304709, + "grad_norm": 0.771159827709198, + "learning_rate": 3.272888933226819e-06, + "loss": 0.5893, + "step": 8723 + }, + { + "epoch": 2.416620498614958, + "grad_norm": 0.7518683671951294, + "learning_rate": 3.272542485937369e-06, + "loss": 0.6219, + "step": 8724 + }, + { + "epoch": 2.4168975069252077, + "grad_norm": 0.7549920082092285, + "learning_rate": 3.272196022244869e-06, + "loss": 0.5993, + "step": 8725 + }, + { + "epoch": 2.417174515235457, + "grad_norm": 0.7153391242027283, + "learning_rate": 3.2718495421566754e-06, + "loss": 0.612, + "step": 8726 + }, + { + "epoch": 2.417451523545706, + "grad_norm": 0.747154951095581, + "learning_rate": 3.271503045680145e-06, + "loss": 0.5827, + "step": 8727 + }, + { + "epoch": 2.4177285318559556, + "grad_norm": 0.7624956369400024, + "learning_rate": 3.2711565328226354e-06, + "loss": 0.6285, + "step": 8728 + }, + { + "epoch": 2.418005540166205, + "grad_norm": 0.731533408164978, + "learning_rate": 3.2708100035915027e-06, + "loss": 0.5908, + "step": 8729 + }, + { + "epoch": 2.418282548476454, + "grad_norm": 0.7518516182899475, + "learning_rate": 3.270463457994105e-06, + "loss": 0.618, + "step": 8730 + }, + { + "epoch": 2.4185595567867035, + "grad_norm": 0.7448464035987854, + "learning_rate": 3.270116896037801e-06, + "loss": 0.5555, + "step": 8731 + }, + { + "epoch": 2.418836565096953, + "grad_norm": 0.7565663456916809, + "learning_rate": 3.269770317729948e-06, + "loss": 0.6204, + "step": 8732 + }, + { + "epoch": 2.419113573407202, + "grad_norm": 0.730437695980072, + "learning_rate": 3.2694237230779056e-06, + "loss": 0.5792, + "step": 8733 + }, + { + "epoch": 2.4193905817174515, + "grad_norm": 0.7168110609054565, + "learning_rate": 3.269077112089033e-06, + "loss": 0.5732, + "step": 8734 + }, + { + "epoch": 2.419667590027701, + "grad_norm": 0.725572407245636, + "learning_rate": 3.268730484770689e-06, + "loss": 0.5353, + "step": 8735 + }, + { + "epoch": 2.41994459833795, + "grad_norm": 0.7277836799621582, + "learning_rate": 3.268383841130234e-06, + "loss": 0.6352, + "step": 8736 + }, + { + "epoch": 2.4202216066481994, + "grad_norm": 0.7155618667602539, + "learning_rate": 3.268037181175028e-06, + "loss": 0.5453, + "step": 8737 + }, + { + "epoch": 2.420498614958449, + "grad_norm": 0.7514188289642334, + "learning_rate": 3.2676905049124306e-06, + "loss": 0.5491, + "step": 8738 + }, + { + "epoch": 2.420775623268698, + "grad_norm": 0.776049792766571, + "learning_rate": 3.267343812349803e-06, + "loss": 0.5975, + "step": 8739 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.7240673303604126, + "learning_rate": 3.266997103494508e-06, + "loss": 0.6213, + "step": 8740 + }, + { + "epoch": 2.421329639889197, + "grad_norm": 0.721198320388794, + "learning_rate": 3.266650378353905e-06, + "loss": 0.5853, + "step": 8741 + }, + { + "epoch": 2.421606648199446, + "grad_norm": 0.7547255754470825, + "learning_rate": 3.266303636935357e-06, + "loss": 0.589, + "step": 8742 + }, + { + "epoch": 2.4218836565096953, + "grad_norm": 0.7153939604759216, + "learning_rate": 3.2659568792462254e-06, + "loss": 0.576, + "step": 8743 + }, + { + "epoch": 2.4221606648199447, + "grad_norm": 0.7191527485847473, + "learning_rate": 3.2656101052938737e-06, + "loss": 0.6107, + "step": 8744 + }, + { + "epoch": 2.4224376731301938, + "grad_norm": 0.6901411414146423, + "learning_rate": 3.2652633150856637e-06, + "loss": 0.5964, + "step": 8745 + }, + { + "epoch": 2.422714681440443, + "grad_norm": 0.7672993540763855, + "learning_rate": 3.2649165086289597e-06, + "loss": 0.6053, + "step": 8746 + }, + { + "epoch": 2.4229916897506927, + "grad_norm": 0.7194650769233704, + "learning_rate": 3.2645696859311248e-06, + "loss": 0.5777, + "step": 8747 + }, + { + "epoch": 2.4232686980609417, + "grad_norm": 0.7330414652824402, + "learning_rate": 3.264222846999523e-06, + "loss": 0.6037, + "step": 8748 + }, + { + "epoch": 2.423545706371191, + "grad_norm": 0.6991344094276428, + "learning_rate": 3.263875991841518e-06, + "loss": 0.5679, + "step": 8749 + }, + { + "epoch": 2.4238227146814406, + "grad_norm": 0.7010491490364075, + "learning_rate": 3.2635291204644755e-06, + "loss": 0.5512, + "step": 8750 + }, + { + "epoch": 2.4240997229916896, + "grad_norm": 0.7519541382789612, + "learning_rate": 3.26318223287576e-06, + "loss": 0.6287, + "step": 8751 + }, + { + "epoch": 2.424376731301939, + "grad_norm": 0.7270293235778809, + "learning_rate": 3.2628353290827365e-06, + "loss": 0.5173, + "step": 8752 + }, + { + "epoch": 2.4246537396121886, + "grad_norm": 0.7756695747375488, + "learning_rate": 3.262488409092771e-06, + "loss": 0.57, + "step": 8753 + }, + { + "epoch": 2.4249307479224376, + "grad_norm": 0.7207167744636536, + "learning_rate": 3.2621414729132296e-06, + "loss": 0.6317, + "step": 8754 + }, + { + "epoch": 2.425207756232687, + "grad_norm": 0.699981153011322, + "learning_rate": 3.261794520551478e-06, + "loss": 0.5664, + "step": 8755 + }, + { + "epoch": 2.4254847645429365, + "grad_norm": 0.7614565491676331, + "learning_rate": 3.2614475520148836e-06, + "loss": 0.592, + "step": 8756 + }, + { + "epoch": 2.4257617728531855, + "grad_norm": 0.7084278464317322, + "learning_rate": 3.2611005673108133e-06, + "loss": 0.6096, + "step": 8757 + }, + { + "epoch": 2.426038781163435, + "grad_norm": 0.6804847121238708, + "learning_rate": 3.2607535664466334e-06, + "loss": 0.5873, + "step": 8758 + }, + { + "epoch": 2.4263157894736844, + "grad_norm": 0.7468960881233215, + "learning_rate": 3.260406549429714e-06, + "loss": 0.6394, + "step": 8759 + }, + { + "epoch": 2.4265927977839334, + "grad_norm": 0.7546526789665222, + "learning_rate": 3.26005951626742e-06, + "loss": 0.6261, + "step": 8760 + }, + { + "epoch": 2.426869806094183, + "grad_norm": 0.751997172832489, + "learning_rate": 3.259712466967123e-06, + "loss": 0.6334, + "step": 8761 + }, + { + "epoch": 2.4271468144044324, + "grad_norm": 0.7271677255630493, + "learning_rate": 3.25936540153619e-06, + "loss": 0.5969, + "step": 8762 + }, + { + "epoch": 2.4274238227146814, + "grad_norm": 0.7059834599494934, + "learning_rate": 3.2590183199819896e-06, + "loss": 0.5882, + "step": 8763 + }, + { + "epoch": 2.427700831024931, + "grad_norm": 0.7132251858711243, + "learning_rate": 3.258671222311892e-06, + "loss": 0.5784, + "step": 8764 + }, + { + "epoch": 2.4279778393351803, + "grad_norm": 0.7054438591003418, + "learning_rate": 3.2583241085332674e-06, + "loss": 0.6, + "step": 8765 + }, + { + "epoch": 2.4282548476454293, + "grad_norm": 0.7704301476478577, + "learning_rate": 3.257976978653485e-06, + "loss": 0.5941, + "step": 8766 + }, + { + "epoch": 2.4285318559556788, + "grad_norm": 0.7076848745346069, + "learning_rate": 3.257629832679916e-06, + "loss": 0.6003, + "step": 8767 + }, + { + "epoch": 2.428808864265928, + "grad_norm": 0.6868513226509094, + "learning_rate": 3.2572826706199304e-06, + "loss": 0.5847, + "step": 8768 + }, + { + "epoch": 2.4290858725761773, + "grad_norm": 0.7200565338134766, + "learning_rate": 3.2569354924809005e-06, + "loss": 0.5691, + "step": 8769 + }, + { + "epoch": 2.4293628808864267, + "grad_norm": 0.722827672958374, + "learning_rate": 3.2565882982701967e-06, + "loss": 0.5868, + "step": 8770 + }, + { + "epoch": 2.4296398891966757, + "grad_norm": 0.7694891691207886, + "learning_rate": 3.256241087995191e-06, + "loss": 0.6318, + "step": 8771 + }, + { + "epoch": 2.429916897506925, + "grad_norm": 0.7697635293006897, + "learning_rate": 3.255893861663257e-06, + "loss": 0.5825, + "step": 8772 + }, + { + "epoch": 2.4301939058171746, + "grad_norm": 0.7312008142471313, + "learning_rate": 3.255546619281765e-06, + "loss": 0.5826, + "step": 8773 + }, + { + "epoch": 2.4304709141274237, + "grad_norm": 0.7236773371696472, + "learning_rate": 3.2551993608580884e-06, + "loss": 0.5953, + "step": 8774 + }, + { + "epoch": 2.430747922437673, + "grad_norm": 0.7374091148376465, + "learning_rate": 3.2548520863996013e-06, + "loss": 0.6046, + "step": 8775 + }, + { + "epoch": 2.4310249307479226, + "grad_norm": 0.746926486492157, + "learning_rate": 3.254504795913677e-06, + "loss": 0.6044, + "step": 8776 + }, + { + "epoch": 2.4313019390581716, + "grad_norm": 0.7748494148254395, + "learning_rate": 3.2541574894076884e-06, + "loss": 0.5798, + "step": 8777 + }, + { + "epoch": 2.431578947368421, + "grad_norm": 0.7435228228569031, + "learning_rate": 3.253810166889011e-06, + "loss": 0.5884, + "step": 8778 + }, + { + "epoch": 2.4318559556786705, + "grad_norm": 0.756584882736206, + "learning_rate": 3.2534628283650184e-06, + "loss": 0.5781, + "step": 8779 + }, + { + "epoch": 2.4321329639889195, + "grad_norm": 0.7341037392616272, + "learning_rate": 3.2531154738430857e-06, + "loss": 0.5717, + "step": 8780 + }, + { + "epoch": 2.432409972299169, + "grad_norm": 0.7264748811721802, + "learning_rate": 3.252768103330589e-06, + "loss": 0.6186, + "step": 8781 + }, + { + "epoch": 2.4326869806094185, + "grad_norm": 0.715746283531189, + "learning_rate": 3.252420716834902e-06, + "loss": 0.5697, + "step": 8782 + }, + { + "epoch": 2.4329639889196675, + "grad_norm": 0.8117491006851196, + "learning_rate": 3.2520733143634027e-06, + "loss": 0.6284, + "step": 8783 + }, + { + "epoch": 2.433240997229917, + "grad_norm": 0.7050234079360962, + "learning_rate": 3.251725895923466e-06, + "loss": 0.625, + "step": 8784 + }, + { + "epoch": 2.4335180055401664, + "grad_norm": 0.7163100838661194, + "learning_rate": 3.2513784615224686e-06, + "loss": 0.6722, + "step": 8785 + }, + { + "epoch": 2.4337950138504154, + "grad_norm": 0.7334553003311157, + "learning_rate": 3.2510310111677877e-06, + "loss": 0.6164, + "step": 8786 + }, + { + "epoch": 2.434072022160665, + "grad_norm": 0.7212862968444824, + "learning_rate": 3.2506835448668007e-06, + "loss": 0.6006, + "step": 8787 + }, + { + "epoch": 2.434349030470914, + "grad_norm": 0.7204136252403259, + "learning_rate": 3.2503360626268855e-06, + "loss": 0.6021, + "step": 8788 + }, + { + "epoch": 2.4346260387811633, + "grad_norm": 0.7287476062774658, + "learning_rate": 3.2499885644554185e-06, + "loss": 0.6392, + "step": 8789 + }, + { + "epoch": 2.434903047091413, + "grad_norm": 0.7399448156356812, + "learning_rate": 3.249641050359779e-06, + "loss": 0.6185, + "step": 8790 + }, + { + "epoch": 2.435180055401662, + "grad_norm": 0.7647336721420288, + "learning_rate": 3.2492935203473468e-06, + "loss": 0.635, + "step": 8791 + }, + { + "epoch": 2.4354570637119113, + "grad_norm": 0.7237324118614197, + "learning_rate": 3.2489459744254986e-06, + "loss": 0.612, + "step": 8792 + }, + { + "epoch": 2.4357340720221607, + "grad_norm": 0.7611182332038879, + "learning_rate": 3.248598412601615e-06, + "loss": 0.6438, + "step": 8793 + }, + { + "epoch": 2.4360110803324098, + "grad_norm": 0.7452676296234131, + "learning_rate": 3.248250834883076e-06, + "loss": 0.5965, + "step": 8794 + }, + { + "epoch": 2.436288088642659, + "grad_norm": 0.7790424227714539, + "learning_rate": 3.24790324127726e-06, + "loss": 0.5996, + "step": 8795 + }, + { + "epoch": 2.4365650969529087, + "grad_norm": 0.7863867878913879, + "learning_rate": 3.247555631791549e-06, + "loss": 0.5955, + "step": 8796 + }, + { + "epoch": 2.4368421052631577, + "grad_norm": 0.7386239171028137, + "learning_rate": 3.2472080064333226e-06, + "loss": 0.586, + "step": 8797 + }, + { + "epoch": 2.437119113573407, + "grad_norm": 0.7536293864250183, + "learning_rate": 3.2468603652099624e-06, + "loss": 0.6069, + "step": 8798 + }, + { + "epoch": 2.4373961218836566, + "grad_norm": 0.7368284463882446, + "learning_rate": 3.246512708128849e-06, + "loss": 0.6208, + "step": 8799 + }, + { + "epoch": 2.4376731301939056, + "grad_norm": 0.7683560252189636, + "learning_rate": 3.2461650351973644e-06, + "loss": 0.5284, + "step": 8800 + }, + { + "epoch": 2.437950138504155, + "grad_norm": 0.7497748732566833, + "learning_rate": 3.2458173464228905e-06, + "loss": 0.6377, + "step": 8801 + }, + { + "epoch": 2.4382271468144046, + "grad_norm": 0.763282060623169, + "learning_rate": 3.2454696418128103e-06, + "loss": 0.602, + "step": 8802 + }, + { + "epoch": 2.4385041551246536, + "grad_norm": 0.6915131211280823, + "learning_rate": 3.245121921374505e-06, + "loss": 0.6292, + "step": 8803 + }, + { + "epoch": 2.438781163434903, + "grad_norm": 0.7452968955039978, + "learning_rate": 3.2447741851153596e-06, + "loss": 0.5869, + "step": 8804 + }, + { + "epoch": 2.4390581717451525, + "grad_norm": 0.7962399125099182, + "learning_rate": 3.2444264330427555e-06, + "loss": 0.627, + "step": 8805 + }, + { + "epoch": 2.4393351800554015, + "grad_norm": 0.7612031698226929, + "learning_rate": 3.244078665164077e-06, + "loss": 0.6268, + "step": 8806 + }, + { + "epoch": 2.439612188365651, + "grad_norm": 0.6910748481750488, + "learning_rate": 3.243730881486709e-06, + "loss": 0.5159, + "step": 8807 + }, + { + "epoch": 2.4398891966759004, + "grad_norm": 0.7467472553253174, + "learning_rate": 3.2433830820180346e-06, + "loss": 0.5723, + "step": 8808 + }, + { + "epoch": 2.4401662049861494, + "grad_norm": 0.7672812938690186, + "learning_rate": 3.243035266765439e-06, + "loss": 0.6108, + "step": 8809 + }, + { + "epoch": 2.440443213296399, + "grad_norm": 0.7103387117385864, + "learning_rate": 3.2426874357363077e-06, + "loss": 0.593, + "step": 8810 + }, + { + "epoch": 2.4407202216066484, + "grad_norm": 0.7184141278266907, + "learning_rate": 3.242339588938025e-06, + "loss": 0.6287, + "step": 8811 + }, + { + "epoch": 2.4409972299168974, + "grad_norm": 0.7140890955924988, + "learning_rate": 3.2419917263779765e-06, + "loss": 0.6144, + "step": 8812 + }, + { + "epoch": 2.441274238227147, + "grad_norm": 0.7000736594200134, + "learning_rate": 3.24164384806355e-06, + "loss": 0.5956, + "step": 8813 + }, + { + "epoch": 2.4415512465373963, + "grad_norm": 0.709296703338623, + "learning_rate": 3.24129595400213e-06, + "loss": 0.5779, + "step": 8814 + }, + { + "epoch": 2.4418282548476453, + "grad_norm": 0.7582641839981079, + "learning_rate": 3.2409480442011042e-06, + "loss": 0.6205, + "step": 8815 + }, + { + "epoch": 2.442105263157895, + "grad_norm": 0.7473920583724976, + "learning_rate": 3.2406001186678594e-06, + "loss": 0.6211, + "step": 8816 + }, + { + "epoch": 2.4423822714681442, + "grad_norm": 0.700025200843811, + "learning_rate": 3.2402521774097824e-06, + "loss": 0.5868, + "step": 8817 + }, + { + "epoch": 2.4426592797783933, + "grad_norm": 0.7863488793373108, + "learning_rate": 3.239904220434261e-06, + "loss": 0.7038, + "step": 8818 + }, + { + "epoch": 2.4429362880886427, + "grad_norm": 0.7783229351043701, + "learning_rate": 3.2395562477486836e-06, + "loss": 0.5894, + "step": 8819 + }, + { + "epoch": 2.443213296398892, + "grad_norm": 0.7442140579223633, + "learning_rate": 3.239208259360439e-06, + "loss": 0.5693, + "step": 8820 + }, + { + "epoch": 2.443490304709141, + "grad_norm": 0.6942600011825562, + "learning_rate": 3.2388602552769145e-06, + "loss": 0.5629, + "step": 8821 + }, + { + "epoch": 2.4437673130193907, + "grad_norm": 0.7239096760749817, + "learning_rate": 3.2385122355055004e-06, + "loss": 0.5711, + "step": 8822 + }, + { + "epoch": 2.44404432132964, + "grad_norm": 0.7572760581970215, + "learning_rate": 3.2381642000535858e-06, + "loss": 0.6027, + "step": 8823 + }, + { + "epoch": 2.444321329639889, + "grad_norm": 0.6888085603713989, + "learning_rate": 3.2378161489285597e-06, + "loss": 0.5529, + "step": 8824 + }, + { + "epoch": 2.4445983379501386, + "grad_norm": 0.7307559251785278, + "learning_rate": 3.2374680821378122e-06, + "loss": 0.6143, + "step": 8825 + }, + { + "epoch": 2.444875346260388, + "grad_norm": 0.7402287125587463, + "learning_rate": 3.2371199996887357e-06, + "loss": 0.6132, + "step": 8826 + }, + { + "epoch": 2.445152354570637, + "grad_norm": 0.7784111499786377, + "learning_rate": 3.2367719015887177e-06, + "loss": 0.5761, + "step": 8827 + }, + { + "epoch": 2.4454293628808865, + "grad_norm": 0.7317233681678772, + "learning_rate": 3.2364237878451517e-06, + "loss": 0.5782, + "step": 8828 + }, + { + "epoch": 2.4457063711911355, + "grad_norm": 0.7439118027687073, + "learning_rate": 3.2360756584654275e-06, + "loss": 0.5967, + "step": 8829 + }, + { + "epoch": 2.445983379501385, + "grad_norm": 0.7780766487121582, + "learning_rate": 3.235727513456938e-06, + "loss": 0.5966, + "step": 8830 + }, + { + "epoch": 2.4462603878116345, + "grad_norm": 0.7411060929298401, + "learning_rate": 3.2353793528270727e-06, + "loss": 0.604, + "step": 8831 + }, + { + "epoch": 2.4465373961218835, + "grad_norm": 0.7590367197990417, + "learning_rate": 3.2350311765832275e-06, + "loss": 0.5924, + "step": 8832 + }, + { + "epoch": 2.446814404432133, + "grad_norm": 0.7714030742645264, + "learning_rate": 3.234682984732793e-06, + "loss": 0.6387, + "step": 8833 + }, + { + "epoch": 2.4470914127423824, + "grad_norm": 0.7148301005363464, + "learning_rate": 3.234334777283162e-06, + "loss": 0.5936, + "step": 8834 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.7669836282730103, + "learning_rate": 3.2339865542417292e-06, + "loss": 0.6355, + "step": 8835 + }, + { + "epoch": 2.447645429362881, + "grad_norm": 0.7615107297897339, + "learning_rate": 3.233638315615887e-06, + "loss": 0.6287, + "step": 8836 + }, + { + "epoch": 2.4479224376731303, + "grad_norm": 0.7076432108879089, + "learning_rate": 3.2332900614130296e-06, + "loss": 0.6353, + "step": 8837 + }, + { + "epoch": 2.4481994459833794, + "grad_norm": 0.7279842495918274, + "learning_rate": 3.2329417916405527e-06, + "loss": 0.5932, + "step": 8838 + }, + { + "epoch": 2.448476454293629, + "grad_norm": 0.7167196869850159, + "learning_rate": 3.232593506305849e-06, + "loss": 0.5794, + "step": 8839 + }, + { + "epoch": 2.4487534626038783, + "grad_norm": 0.7963582277297974, + "learning_rate": 3.232245205416314e-06, + "loss": 0.6319, + "step": 8840 + }, + { + "epoch": 2.4490304709141273, + "grad_norm": 0.7460035681724548, + "learning_rate": 3.2318968889793434e-06, + "loss": 0.5953, + "step": 8841 + }, + { + "epoch": 2.4493074792243767, + "grad_norm": 0.7167977690696716, + "learning_rate": 3.231548557002333e-06, + "loss": 0.5879, + "step": 8842 + }, + { + "epoch": 2.449584487534626, + "grad_norm": 0.752393901348114, + "learning_rate": 3.231200209492679e-06, + "loss": 0.6175, + "step": 8843 + }, + { + "epoch": 2.4498614958448752, + "grad_norm": 0.7277638912200928, + "learning_rate": 3.2308518464577767e-06, + "loss": 0.5695, + "step": 8844 + }, + { + "epoch": 2.4501385041551247, + "grad_norm": 0.7235784530639648, + "learning_rate": 3.230503467905024e-06, + "loss": 0.5852, + "step": 8845 + }, + { + "epoch": 2.450415512465374, + "grad_norm": 0.7358263731002808, + "learning_rate": 3.2301550738418165e-06, + "loss": 0.5507, + "step": 8846 + }, + { + "epoch": 2.450692520775623, + "grad_norm": 0.7292805910110474, + "learning_rate": 3.229806664275552e-06, + "loss": 0.6298, + "step": 8847 + }, + { + "epoch": 2.4509695290858726, + "grad_norm": 0.7070525288581848, + "learning_rate": 3.2294582392136293e-06, + "loss": 0.5415, + "step": 8848 + }, + { + "epoch": 2.4512465373961216, + "grad_norm": 0.7506101727485657, + "learning_rate": 3.229109798663444e-06, + "loss": 0.599, + "step": 8849 + }, + { + "epoch": 2.451523545706371, + "grad_norm": 0.7339098453521729, + "learning_rate": 3.2287613426323966e-06, + "loss": 0.5921, + "step": 8850 + }, + { + "epoch": 2.4518005540166206, + "grad_norm": 0.8022693991661072, + "learning_rate": 3.228412871127885e-06, + "loss": 0.5857, + "step": 8851 + }, + { + "epoch": 2.4520775623268696, + "grad_norm": 0.7564173340797424, + "learning_rate": 3.228064384157308e-06, + "loss": 0.6334, + "step": 8852 + }, + { + "epoch": 2.452354570637119, + "grad_norm": 0.7244262099266052, + "learning_rate": 3.227715881728064e-06, + "loss": 0.6374, + "step": 8853 + }, + { + "epoch": 2.4526315789473685, + "grad_norm": 0.7423130869865417, + "learning_rate": 3.227367363847554e-06, + "loss": 0.6213, + "step": 8854 + }, + { + "epoch": 2.4529085872576175, + "grad_norm": 0.7089356780052185, + "learning_rate": 3.2270188305231776e-06, + "loss": 0.5893, + "step": 8855 + }, + { + "epoch": 2.453185595567867, + "grad_norm": 0.7374478578567505, + "learning_rate": 3.2266702817623348e-06, + "loss": 0.5772, + "step": 8856 + }, + { + "epoch": 2.4534626038781164, + "grad_norm": 0.7225603461265564, + "learning_rate": 3.2263217175724255e-06, + "loss": 0.5922, + "step": 8857 + }, + { + "epoch": 2.4537396121883654, + "grad_norm": 0.7521680593490601, + "learning_rate": 3.225973137960852e-06, + "loss": 0.6135, + "step": 8858 + }, + { + "epoch": 2.454016620498615, + "grad_norm": 0.7221606373786926, + "learning_rate": 3.2256245429350142e-06, + "loss": 0.6049, + "step": 8859 + }, + { + "epoch": 2.4542936288088644, + "grad_norm": 0.7255619168281555, + "learning_rate": 3.2252759325023153e-06, + "loss": 0.605, + "step": 8860 + }, + { + "epoch": 2.4545706371191134, + "grad_norm": 0.7641960382461548, + "learning_rate": 3.224927306670156e-06, + "loss": 0.5981, + "step": 8861 + }, + { + "epoch": 2.454847645429363, + "grad_norm": 0.7062257528305054, + "learning_rate": 3.224578665445938e-06, + "loss": 0.5689, + "step": 8862 + }, + { + "epoch": 2.4551246537396123, + "grad_norm": 0.6948105096817017, + "learning_rate": 3.2242300088370646e-06, + "loss": 0.5594, + "step": 8863 + }, + { + "epoch": 2.4554016620498613, + "grad_norm": 0.6900063157081604, + "learning_rate": 3.2238813368509393e-06, + "loss": 0.5439, + "step": 8864 + }, + { + "epoch": 2.455678670360111, + "grad_norm": 0.7369881272315979, + "learning_rate": 3.2235326494949644e-06, + "loss": 0.5593, + "step": 8865 + }, + { + "epoch": 2.4559556786703602, + "grad_norm": 0.69451904296875, + "learning_rate": 3.2231839467765423e-06, + "loss": 0.5791, + "step": 8866 + }, + { + "epoch": 2.4562326869806093, + "grad_norm": 0.7775591611862183, + "learning_rate": 3.2228352287030807e-06, + "loss": 0.5762, + "step": 8867 + }, + { + "epoch": 2.4565096952908587, + "grad_norm": 0.7555267214775085, + "learning_rate": 3.2224864952819797e-06, + "loss": 0.5774, + "step": 8868 + }, + { + "epoch": 2.456786703601108, + "grad_norm": 0.7609298229217529, + "learning_rate": 3.2221377465206457e-06, + "loss": 0.5823, + "step": 8869 + }, + { + "epoch": 2.457063711911357, + "grad_norm": 0.7338734269142151, + "learning_rate": 3.221788982426483e-06, + "loss": 0.5866, + "step": 8870 + }, + { + "epoch": 2.4573407202216067, + "grad_norm": 0.7334607243537903, + "learning_rate": 3.2214402030068974e-06, + "loss": 0.6116, + "step": 8871 + }, + { + "epoch": 2.457617728531856, + "grad_norm": 0.7804694771766663, + "learning_rate": 3.2210914082692935e-06, + "loss": 0.6157, + "step": 8872 + }, + { + "epoch": 2.457894736842105, + "grad_norm": 0.682929515838623, + "learning_rate": 3.220742598221078e-06, + "loss": 0.5643, + "step": 8873 + }, + { + "epoch": 2.4581717451523546, + "grad_norm": 0.7627580761909485, + "learning_rate": 3.220393772869656e-06, + "loss": 0.5662, + "step": 8874 + }, + { + "epoch": 2.458448753462604, + "grad_norm": 0.741961658000946, + "learning_rate": 3.2200449322224347e-06, + "loss": 0.6184, + "step": 8875 + }, + { + "epoch": 2.458725761772853, + "grad_norm": 0.7630350589752197, + "learning_rate": 3.2196960762868206e-06, + "loss": 0.5638, + "step": 8876 + }, + { + "epoch": 2.4590027700831025, + "grad_norm": 0.7240167856216431, + "learning_rate": 3.2193472050702214e-06, + "loss": 0.5876, + "step": 8877 + }, + { + "epoch": 2.459279778393352, + "grad_norm": 0.7393755912780762, + "learning_rate": 3.218998318580043e-06, + "loss": 0.5972, + "step": 8878 + }, + { + "epoch": 2.459556786703601, + "grad_norm": 0.7540607452392578, + "learning_rate": 3.2186494168236947e-06, + "loss": 0.5818, + "step": 8879 + }, + { + "epoch": 2.4598337950138505, + "grad_norm": 0.6882795095443726, + "learning_rate": 3.2183004998085852e-06, + "loss": 0.5603, + "step": 8880 + }, + { + "epoch": 2.4601108033241, + "grad_norm": 0.7658278346061707, + "learning_rate": 3.2179515675421203e-06, + "loss": 0.6163, + "step": 8881 + }, + { + "epoch": 2.460387811634349, + "grad_norm": 0.7589587569236755, + "learning_rate": 3.2176026200317105e-06, + "loss": 0.6216, + "step": 8882 + }, + { + "epoch": 2.4606648199445984, + "grad_norm": 0.733665943145752, + "learning_rate": 3.2172536572847653e-06, + "loss": 0.5882, + "step": 8883 + }, + { + "epoch": 2.460941828254848, + "grad_norm": 0.7620385885238647, + "learning_rate": 3.216904679308692e-06, + "loss": 0.5885, + "step": 8884 + }, + { + "epoch": 2.461218836565097, + "grad_norm": 0.7346259355545044, + "learning_rate": 3.2165556861109027e-06, + "loss": 0.5972, + "step": 8885 + }, + { + "epoch": 2.4614958448753463, + "grad_norm": 0.7112541794776917, + "learning_rate": 3.216206677698806e-06, + "loss": 0.5705, + "step": 8886 + }, + { + "epoch": 2.461772853185596, + "grad_norm": 0.7470067739486694, + "learning_rate": 3.215857654079812e-06, + "loss": 0.6043, + "step": 8887 + }, + { + "epoch": 2.462049861495845, + "grad_norm": 0.7251924872398376, + "learning_rate": 3.2155086152613325e-06, + "loss": 0.5956, + "step": 8888 + }, + { + "epoch": 2.4623268698060943, + "grad_norm": 0.772384762763977, + "learning_rate": 3.215159561250778e-06, + "loss": 0.594, + "step": 8889 + }, + { + "epoch": 2.4626038781163437, + "grad_norm": 0.9380196332931519, + "learning_rate": 3.214810492055559e-06, + "loss": 0.6098, + "step": 8890 + }, + { + "epoch": 2.4628808864265928, + "grad_norm": 0.7411641478538513, + "learning_rate": 3.214461407683088e-06, + "loss": 0.5922, + "step": 8891 + }, + { + "epoch": 2.463157894736842, + "grad_norm": 0.7432118654251099, + "learning_rate": 3.2141123081407773e-06, + "loss": 0.6242, + "step": 8892 + }, + { + "epoch": 2.4634349030470912, + "grad_norm": 0.7496050000190735, + "learning_rate": 3.213763193436038e-06, + "loss": 0.6358, + "step": 8893 + }, + { + "epoch": 2.4637119113573407, + "grad_norm": 0.7449604868888855, + "learning_rate": 3.2134140635762834e-06, + "loss": 0.6158, + "step": 8894 + }, + { + "epoch": 2.46398891966759, + "grad_norm": 0.6991225481033325, + "learning_rate": 3.213064918568927e-06, + "loss": 0.5943, + "step": 8895 + }, + { + "epoch": 2.464265927977839, + "grad_norm": 0.7275388240814209, + "learning_rate": 3.2127157584213806e-06, + "loss": 0.6057, + "step": 8896 + }, + { + "epoch": 2.4645429362880886, + "grad_norm": 0.6911985278129578, + "learning_rate": 3.21236658314106e-06, + "loss": 0.5863, + "step": 8897 + }, + { + "epoch": 2.464819944598338, + "grad_norm": 0.7673735618591309, + "learning_rate": 3.212017392735376e-06, + "loss": 0.6264, + "step": 8898 + }, + { + "epoch": 2.465096952908587, + "grad_norm": 0.7564421892166138, + "learning_rate": 3.2116681872117455e-06, + "loss": 0.6232, + "step": 8899 + }, + { + "epoch": 2.4653739612188366, + "grad_norm": 0.7045862078666687, + "learning_rate": 3.2113189665775812e-06, + "loss": 0.5353, + "step": 8900 + }, + { + "epoch": 2.465650969529086, + "grad_norm": 0.7565885782241821, + "learning_rate": 3.2109697308402994e-06, + "loss": 0.5873, + "step": 8901 + }, + { + "epoch": 2.465927977839335, + "grad_norm": 0.7609415054321289, + "learning_rate": 3.210620480007315e-06, + "loss": 0.6316, + "step": 8902 + }, + { + "epoch": 2.4662049861495845, + "grad_norm": 0.7758066654205322, + "learning_rate": 3.210271214086042e-06, + "loss": 0.6035, + "step": 8903 + }, + { + "epoch": 2.466481994459834, + "grad_norm": 0.7386062741279602, + "learning_rate": 3.209921933083897e-06, + "loss": 0.6171, + "step": 8904 + }, + { + "epoch": 2.466759002770083, + "grad_norm": 0.7325729131698608, + "learning_rate": 3.2095726370082974e-06, + "loss": 0.606, + "step": 8905 + }, + { + "epoch": 2.4670360110803324, + "grad_norm": 0.8654530048370361, + "learning_rate": 3.209223325866659e-06, + "loss": 0.6013, + "step": 8906 + }, + { + "epoch": 2.467313019390582, + "grad_norm": 0.7589396238327026, + "learning_rate": 3.208873999666397e-06, + "loss": 0.5761, + "step": 8907 + }, + { + "epoch": 2.467590027700831, + "grad_norm": 0.7238092422485352, + "learning_rate": 3.2085246584149303e-06, + "loss": 0.6005, + "step": 8908 + }, + { + "epoch": 2.4678670360110804, + "grad_norm": 0.7196837067604065, + "learning_rate": 3.208175302119676e-06, + "loss": 0.5775, + "step": 8909 + }, + { + "epoch": 2.46814404432133, + "grad_norm": 0.740156888961792, + "learning_rate": 3.2078259307880518e-06, + "loss": 0.6192, + "step": 8910 + }, + { + "epoch": 2.468421052631579, + "grad_norm": 0.7320531010627747, + "learning_rate": 3.207476544427474e-06, + "loss": 0.5975, + "step": 8911 + }, + { + "epoch": 2.4686980609418283, + "grad_norm": 0.7081652879714966, + "learning_rate": 3.207127143045364e-06, + "loss": 0.6177, + "step": 8912 + }, + { + "epoch": 2.4689750692520773, + "grad_norm": 0.7383638620376587, + "learning_rate": 3.2067777266491383e-06, + "loss": 0.5823, + "step": 8913 + }, + { + "epoch": 2.469252077562327, + "grad_norm": 0.7729959487915039, + "learning_rate": 3.2064282952462166e-06, + "loss": 0.6159, + "step": 8914 + }, + { + "epoch": 2.4695290858725762, + "grad_norm": 0.7924338579177856, + "learning_rate": 3.206078848844019e-06, + "loss": 0.6329, + "step": 8915 + }, + { + "epoch": 2.4698060941828253, + "grad_norm": 0.7191593647003174, + "learning_rate": 3.2057293874499634e-06, + "loss": 0.5837, + "step": 8916 + }, + { + "epoch": 2.4700831024930747, + "grad_norm": 0.7165887355804443, + "learning_rate": 3.2053799110714704e-06, + "loss": 0.5895, + "step": 8917 + }, + { + "epoch": 2.470360110803324, + "grad_norm": 0.7286561131477356, + "learning_rate": 3.205030419715962e-06, + "loss": 0.5786, + "step": 8918 + }, + { + "epoch": 2.470637119113573, + "grad_norm": 0.7367630004882812, + "learning_rate": 3.204680913390856e-06, + "loss": 0.5704, + "step": 8919 + }, + { + "epoch": 2.4709141274238227, + "grad_norm": 0.7660843729972839, + "learning_rate": 3.2043313921035747e-06, + "loss": 0.6413, + "step": 8920 + }, + { + "epoch": 2.471191135734072, + "grad_norm": 0.7649071216583252, + "learning_rate": 3.2039818558615397e-06, + "loss": 0.57, + "step": 8921 + }, + { + "epoch": 2.471468144044321, + "grad_norm": 0.7629241347312927, + "learning_rate": 3.203632304672172e-06, + "loss": 0.639, + "step": 8922 + }, + { + "epoch": 2.4717451523545706, + "grad_norm": 0.7380210161209106, + "learning_rate": 3.2032827385428932e-06, + "loss": 0.585, + "step": 8923 + }, + { + "epoch": 2.47202216066482, + "grad_norm": 0.7167991995811462, + "learning_rate": 3.2029331574811263e-06, + "loss": 0.5925, + "step": 8924 + }, + { + "epoch": 2.472299168975069, + "grad_norm": 0.7697342038154602, + "learning_rate": 3.2025835614942933e-06, + "loss": 0.5853, + "step": 8925 + }, + { + "epoch": 2.4725761772853185, + "grad_norm": 0.7063153982162476, + "learning_rate": 3.202233950589817e-06, + "loss": 0.5851, + "step": 8926 + }, + { + "epoch": 2.472853185595568, + "grad_norm": 0.7513769268989563, + "learning_rate": 3.201884324775121e-06, + "loss": 0.5711, + "step": 8927 + }, + { + "epoch": 2.473130193905817, + "grad_norm": 0.7194318771362305, + "learning_rate": 3.2015346840576277e-06, + "loss": 0.5436, + "step": 8928 + }, + { + "epoch": 2.4734072022160665, + "grad_norm": 0.7521985769271851, + "learning_rate": 3.201185028444762e-06, + "loss": 0.5963, + "step": 8929 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 0.7365816235542297, + "learning_rate": 3.2008353579439477e-06, + "loss": 0.595, + "step": 8930 + }, + { + "epoch": 2.473961218836565, + "grad_norm": 0.7061067819595337, + "learning_rate": 3.200485672562609e-06, + "loss": 0.5693, + "step": 8931 + }, + { + "epoch": 2.4742382271468144, + "grad_norm": 0.7480983138084412, + "learning_rate": 3.2001359723081698e-06, + "loss": 0.5948, + "step": 8932 + }, + { + "epoch": 2.474515235457064, + "grad_norm": 0.7211030721664429, + "learning_rate": 3.199786257188057e-06, + "loss": 0.6069, + "step": 8933 + }, + { + "epoch": 2.474792243767313, + "grad_norm": 0.7504557371139526, + "learning_rate": 3.1994365272096943e-06, + "loss": 0.5919, + "step": 8934 + }, + { + "epoch": 2.4750692520775623, + "grad_norm": 0.7413660883903503, + "learning_rate": 3.1990867823805076e-06, + "loss": 0.6089, + "step": 8935 + }, + { + "epoch": 2.475346260387812, + "grad_norm": 0.74480140209198, + "learning_rate": 3.198737022707924e-06, + "loss": 0.6138, + "step": 8936 + }, + { + "epoch": 2.475623268698061, + "grad_norm": 0.7864667773246765, + "learning_rate": 3.1983872481993687e-06, + "loss": 0.595, + "step": 8937 + }, + { + "epoch": 2.4759002770083103, + "grad_norm": 0.73521888256073, + "learning_rate": 3.1980374588622692e-06, + "loss": 0.6243, + "step": 8938 + }, + { + "epoch": 2.4761772853185597, + "grad_norm": 0.7209713459014893, + "learning_rate": 3.197687654704051e-06, + "loss": 0.6126, + "step": 8939 + }, + { + "epoch": 2.4764542936288088, + "grad_norm": 0.7106951475143433, + "learning_rate": 3.1973378357321427e-06, + "loss": 0.5393, + "step": 8940 + }, + { + "epoch": 2.476731301939058, + "grad_norm": 0.7514813542366028, + "learning_rate": 3.1969880019539713e-06, + "loss": 0.598, + "step": 8941 + }, + { + "epoch": 2.4770083102493077, + "grad_norm": 0.7596821188926697, + "learning_rate": 3.1966381533769642e-06, + "loss": 0.6087, + "step": 8942 + }, + { + "epoch": 2.4772853185595567, + "grad_norm": 0.7739108204841614, + "learning_rate": 3.1962882900085507e-06, + "loss": 0.6094, + "step": 8943 + }, + { + "epoch": 2.477562326869806, + "grad_norm": 0.7667077779769897, + "learning_rate": 3.1959384118561587e-06, + "loss": 0.591, + "step": 8944 + }, + { + "epoch": 2.4778393351800556, + "grad_norm": 0.7343790531158447, + "learning_rate": 3.195588518927216e-06, + "loss": 0.5652, + "step": 8945 + }, + { + "epoch": 2.4781163434903046, + "grad_norm": 0.7375099062919617, + "learning_rate": 3.195238611229154e-06, + "loss": 0.5776, + "step": 8946 + }, + { + "epoch": 2.478393351800554, + "grad_norm": 0.7456963062286377, + "learning_rate": 3.1948886887694004e-06, + "loss": 0.6226, + "step": 8947 + }, + { + "epoch": 2.4786703601108035, + "grad_norm": 0.7197065353393555, + "learning_rate": 3.194538751555385e-06, + "loss": 0.598, + "step": 8948 + }, + { + "epoch": 2.4789473684210526, + "grad_norm": 0.715035617351532, + "learning_rate": 3.194188799594538e-06, + "loss": 0.5993, + "step": 8949 + }, + { + "epoch": 2.479224376731302, + "grad_norm": 0.763393759727478, + "learning_rate": 3.193838832894291e-06, + "loss": 0.5925, + "step": 8950 + }, + { + "epoch": 2.4795013850415515, + "grad_norm": 0.7357152104377747, + "learning_rate": 3.1934888514620733e-06, + "loss": 0.6367, + "step": 8951 + }, + { + "epoch": 2.4797783933518005, + "grad_norm": 0.746059000492096, + "learning_rate": 3.1931388553053156e-06, + "loss": 0.6413, + "step": 8952 + }, + { + "epoch": 2.48005540166205, + "grad_norm": 0.7911154627799988, + "learning_rate": 3.1927888444314514e-06, + "loss": 0.6082, + "step": 8953 + }, + { + "epoch": 2.480332409972299, + "grad_norm": 0.7596408128738403, + "learning_rate": 3.1924388188479093e-06, + "loss": 0.574, + "step": 8954 + }, + { + "epoch": 2.4806094182825484, + "grad_norm": 0.6973130702972412, + "learning_rate": 3.1920887785621233e-06, + "loss": 0.5997, + "step": 8955 + }, + { + "epoch": 2.480886426592798, + "grad_norm": 0.7324879765510559, + "learning_rate": 3.191738723581526e-06, + "loss": 0.5778, + "step": 8956 + }, + { + "epoch": 2.481163434903047, + "grad_norm": 0.7372378706932068, + "learning_rate": 3.1913886539135486e-06, + "loss": 0.592, + "step": 8957 + }, + { + "epoch": 2.4814404432132964, + "grad_norm": 0.7409850358963013, + "learning_rate": 3.1910385695656237e-06, + "loss": 0.6055, + "step": 8958 + }, + { + "epoch": 2.481717451523546, + "grad_norm": 0.7677171230316162, + "learning_rate": 3.1906884705451857e-06, + "loss": 0.6248, + "step": 8959 + }, + { + "epoch": 2.481994459833795, + "grad_norm": 0.7217593193054199, + "learning_rate": 3.190338356859668e-06, + "loss": 0.5707, + "step": 8960 + }, + { + "epoch": 2.4822714681440443, + "grad_norm": 0.704062819480896, + "learning_rate": 3.189988228516504e-06, + "loss": 0.5925, + "step": 8961 + }, + { + "epoch": 2.4825484764542938, + "grad_norm": 0.7098830938339233, + "learning_rate": 3.1896380855231274e-06, + "loss": 0.5572, + "step": 8962 + }, + { + "epoch": 2.482825484764543, + "grad_norm": 0.7203060984611511, + "learning_rate": 3.189287927886973e-06, + "loss": 0.5544, + "step": 8963 + }, + { + "epoch": 2.4831024930747922, + "grad_norm": 0.6978738903999329, + "learning_rate": 3.1889377556154765e-06, + "loss": 0.6224, + "step": 8964 + }, + { + "epoch": 2.4833795013850417, + "grad_norm": 0.7589989900588989, + "learning_rate": 3.1885875687160716e-06, + "loss": 0.6335, + "step": 8965 + }, + { + "epoch": 2.4836565096952907, + "grad_norm": 0.7540053725242615, + "learning_rate": 3.188237367196194e-06, + "loss": 0.594, + "step": 8966 + }, + { + "epoch": 2.48393351800554, + "grad_norm": 0.7392476201057434, + "learning_rate": 3.18788715106328e-06, + "loss": 0.6004, + "step": 8967 + }, + { + "epoch": 2.4842105263157896, + "grad_norm": 0.7759470343589783, + "learning_rate": 3.1875369203247648e-06, + "loss": 0.5797, + "step": 8968 + }, + { + "epoch": 2.4844875346260387, + "grad_norm": 0.7690457701683044, + "learning_rate": 3.1871866749880846e-06, + "loss": 0.6422, + "step": 8969 + }, + { + "epoch": 2.484764542936288, + "grad_norm": 0.7595497369766235, + "learning_rate": 3.1868364150606774e-06, + "loss": 0.6317, + "step": 8970 + }, + { + "epoch": 2.4850415512465376, + "grad_norm": 0.7643975615501404, + "learning_rate": 3.186486140549978e-06, + "loss": 0.6209, + "step": 8971 + }, + { + "epoch": 2.4853185595567866, + "grad_norm": 0.7429293394088745, + "learning_rate": 3.1861358514634257e-06, + "loss": 0.65, + "step": 8972 + }, + { + "epoch": 2.485595567867036, + "grad_norm": 0.6742926836013794, + "learning_rate": 3.1857855478084564e-06, + "loss": 0.5493, + "step": 8973 + }, + { + "epoch": 2.485872576177285, + "grad_norm": 0.7188777327537537, + "learning_rate": 3.185435229592509e-06, + "loss": 0.6283, + "step": 8974 + }, + { + "epoch": 2.4861495844875345, + "grad_norm": 0.7193275094032288, + "learning_rate": 3.1850848968230213e-06, + "loss": 0.6176, + "step": 8975 + }, + { + "epoch": 2.486426592797784, + "grad_norm": 0.738929033279419, + "learning_rate": 3.1847345495074312e-06, + "loss": 0.5802, + "step": 8976 + }, + { + "epoch": 2.486703601108033, + "grad_norm": 0.7202889919281006, + "learning_rate": 3.1843841876531783e-06, + "loss": 0.6494, + "step": 8977 + }, + { + "epoch": 2.4869806094182825, + "grad_norm": 0.720928430557251, + "learning_rate": 3.1840338112677017e-06, + "loss": 0.5937, + "step": 8978 + }, + { + "epoch": 2.487257617728532, + "grad_norm": 0.7134568691253662, + "learning_rate": 3.18368342035844e-06, + "loss": 0.5703, + "step": 8979 + }, + { + "epoch": 2.487534626038781, + "grad_norm": 0.7694113850593567, + "learning_rate": 3.1833330149328333e-06, + "loss": 0.5544, + "step": 8980 + }, + { + "epoch": 2.4878116343490304, + "grad_norm": 0.7342956066131592, + "learning_rate": 3.182982594998322e-06, + "loss": 0.6107, + "step": 8981 + }, + { + "epoch": 2.48808864265928, + "grad_norm": 0.7699294090270996, + "learning_rate": 3.1826321605623456e-06, + "loss": 0.5856, + "step": 8982 + }, + { + "epoch": 2.488365650969529, + "grad_norm": 0.7589293122291565, + "learning_rate": 3.1822817116323455e-06, + "loss": 0.6173, + "step": 8983 + }, + { + "epoch": 2.4886426592797783, + "grad_norm": 0.715154767036438, + "learning_rate": 3.181931248215762e-06, + "loss": 0.6067, + "step": 8984 + }, + { + "epoch": 2.488919667590028, + "grad_norm": 0.7071780562400818, + "learning_rate": 3.181580770320038e-06, + "loss": 0.5683, + "step": 8985 + }, + { + "epoch": 2.489196675900277, + "grad_norm": 0.760847806930542, + "learning_rate": 3.1812302779526124e-06, + "loss": 0.5814, + "step": 8986 + }, + { + "epoch": 2.4894736842105263, + "grad_norm": 0.7390440106391907, + "learning_rate": 3.1808797711209282e-06, + "loss": 0.5837, + "step": 8987 + }, + { + "epoch": 2.4897506925207757, + "grad_norm": 0.7559418678283691, + "learning_rate": 3.180529249832428e-06, + "loss": 0.6051, + "step": 8988 + }, + { + "epoch": 2.4900277008310248, + "grad_norm": 0.7289136648178101, + "learning_rate": 3.1801787140945544e-06, + "loss": 0.6074, + "step": 8989 + }, + { + "epoch": 2.490304709141274, + "grad_norm": 0.7169703841209412, + "learning_rate": 3.179828163914749e-06, + "loss": 0.5931, + "step": 8990 + }, + { + "epoch": 2.4905817174515237, + "grad_norm": 0.7092621922492981, + "learning_rate": 3.1794775993004557e-06, + "loss": 0.6474, + "step": 8991 + }, + { + "epoch": 2.4908587257617727, + "grad_norm": 0.7630212903022766, + "learning_rate": 3.1791270202591183e-06, + "loss": 0.6145, + "step": 8992 + }, + { + "epoch": 2.491135734072022, + "grad_norm": 0.6899539232254028, + "learning_rate": 3.1787764267981793e-06, + "loss": 0.5043, + "step": 8993 + }, + { + "epoch": 2.4914127423822716, + "grad_norm": 0.7707846760749817, + "learning_rate": 3.178425818925084e-06, + "loss": 0.6317, + "step": 8994 + }, + { + "epoch": 2.4916897506925206, + "grad_norm": 0.7168049216270447, + "learning_rate": 3.1780751966472754e-06, + "loss": 0.6151, + "step": 8995 + }, + { + "epoch": 2.49196675900277, + "grad_norm": 0.7072448134422302, + "learning_rate": 3.1777245599721993e-06, + "loss": 0.5583, + "step": 8996 + }, + { + "epoch": 2.4922437673130196, + "grad_norm": 0.744780957698822, + "learning_rate": 3.1773739089073e-06, + "loss": 0.5981, + "step": 8997 + }, + { + "epoch": 2.4925207756232686, + "grad_norm": 0.7633280158042908, + "learning_rate": 3.1770232434600223e-06, + "loss": 0.5987, + "step": 8998 + }, + { + "epoch": 2.492797783933518, + "grad_norm": 0.7523921132087708, + "learning_rate": 3.1766725636378125e-06, + "loss": 0.6084, + "step": 8999 + }, + { + "epoch": 2.4930747922437675, + "grad_norm": 0.7315114140510559, + "learning_rate": 3.1763218694481164e-06, + "loss": 0.6382, + "step": 9000 + }, + { + "epoch": 2.4933518005540165, + "grad_norm": 0.7941367030143738, + "learning_rate": 3.1759711608983794e-06, + "loss": 0.6669, + "step": 9001 + }, + { + "epoch": 2.493628808864266, + "grad_norm": 0.7714728116989136, + "learning_rate": 3.1756204379960494e-06, + "loss": 0.6014, + "step": 9002 + }, + { + "epoch": 2.4939058171745154, + "grad_norm": 0.7583754062652588, + "learning_rate": 3.1752697007485706e-06, + "loss": 0.5995, + "step": 9003 + }, + { + "epoch": 2.4941828254847644, + "grad_norm": 0.7508134841918945, + "learning_rate": 3.1749189491633926e-06, + "loss": 0.614, + "step": 9004 + }, + { + "epoch": 2.494459833795014, + "grad_norm": 0.7639035582542419, + "learning_rate": 3.174568183247962e-06, + "loss": 0.6044, + "step": 9005 + }, + { + "epoch": 2.4947368421052634, + "grad_norm": 0.7041856646537781, + "learning_rate": 3.174217403009725e-06, + "loss": 0.5785, + "step": 9006 + }, + { + "epoch": 2.4950138504155124, + "grad_norm": 0.7462358474731445, + "learning_rate": 3.173866608456132e-06, + "loss": 0.617, + "step": 9007 + }, + { + "epoch": 2.495290858725762, + "grad_norm": 0.7660013437271118, + "learning_rate": 3.17351579959463e-06, + "loss": 0.6425, + "step": 9008 + }, + { + "epoch": 2.4955678670360113, + "grad_norm": 0.7612264156341553, + "learning_rate": 3.1731649764326665e-06, + "loss": 0.6364, + "step": 9009 + }, + { + "epoch": 2.4958448753462603, + "grad_norm": 0.7336184978485107, + "learning_rate": 3.172814138977692e-06, + "loss": 0.6149, + "step": 9010 + }, + { + "epoch": 2.4961218836565098, + "grad_norm": 0.7028403878211975, + "learning_rate": 3.1724632872371566e-06, + "loss": 0.5825, + "step": 9011 + }, + { + "epoch": 2.4963988919667592, + "grad_norm": 0.7467078566551208, + "learning_rate": 3.1721124212185063e-06, + "loss": 0.5607, + "step": 9012 + }, + { + "epoch": 2.4966759002770083, + "grad_norm": 0.7464378476142883, + "learning_rate": 3.1717615409291936e-06, + "loss": 0.5978, + "step": 9013 + }, + { + "epoch": 2.4969529085872577, + "grad_norm": 0.7410820722579956, + "learning_rate": 3.171410646376668e-06, + "loss": 0.5892, + "step": 9014 + }, + { + "epoch": 2.497229916897507, + "grad_norm": 0.737551212310791, + "learning_rate": 3.17105973756838e-06, + "loss": 0.5904, + "step": 9015 + }, + { + "epoch": 2.497506925207756, + "grad_norm": 0.7431277632713318, + "learning_rate": 3.1707088145117794e-06, + "loss": 0.6267, + "step": 9016 + }, + { + "epoch": 2.4977839335180057, + "grad_norm": 0.7089725136756897, + "learning_rate": 3.170357877214319e-06, + "loss": 0.5503, + "step": 9017 + }, + { + "epoch": 2.4980609418282547, + "grad_norm": 0.7238456606864929, + "learning_rate": 3.1700069256834478e-06, + "loss": 0.5906, + "step": 9018 + }, + { + "epoch": 2.498337950138504, + "grad_norm": 0.7766544818878174, + "learning_rate": 3.1696559599266192e-06, + "loss": 0.578, + "step": 9019 + }, + { + "epoch": 2.4986149584487536, + "grad_norm": 0.7459467649459839, + "learning_rate": 3.1693049799512843e-06, + "loss": 0.6228, + "step": 9020 + }, + { + "epoch": 2.4988919667590026, + "grad_norm": 0.7257552146911621, + "learning_rate": 3.168953985764895e-06, + "loss": 0.6254, + "step": 9021 + }, + { + "epoch": 2.499168975069252, + "grad_norm": 0.7231073379516602, + "learning_rate": 3.1686029773749045e-06, + "loss": 0.5736, + "step": 9022 + }, + { + "epoch": 2.4994459833795015, + "grad_norm": 0.6939492225646973, + "learning_rate": 3.1682519547887658e-06, + "loss": 0.5798, + "step": 9023 + }, + { + "epoch": 2.4997229916897505, + "grad_norm": 0.7410045862197876, + "learning_rate": 3.1679009180139315e-06, + "loss": 0.612, + "step": 9024 + }, + { + "epoch": 2.5, + "grad_norm": 0.7503851652145386, + "learning_rate": 3.1675498670578544e-06, + "loss": 0.6032, + "step": 9025 + }, + { + "epoch": 2.5002770083102495, + "grad_norm": 0.7446761727333069, + "learning_rate": 3.16719880192799e-06, + "loss": 0.6122, + "step": 9026 + }, + { + "epoch": 2.5005540166204985, + "grad_norm": 0.7108980417251587, + "learning_rate": 3.16684772263179e-06, + "loss": 0.6324, + "step": 9027 + }, + { + "epoch": 2.500831024930748, + "grad_norm": 0.7736824750900269, + "learning_rate": 3.1664966291767106e-06, + "loss": 0.6475, + "step": 9028 + }, + { + "epoch": 2.501108033240997, + "grad_norm": 0.7655525803565979, + "learning_rate": 3.166145521570206e-06, + "loss": 0.5897, + "step": 9029 + }, + { + "epoch": 2.5013850415512464, + "grad_norm": 0.7319421172142029, + "learning_rate": 3.1657943998197305e-06, + "loss": 0.6392, + "step": 9030 + }, + { + "epoch": 2.501662049861496, + "grad_norm": 0.7983322143554688, + "learning_rate": 3.165443263932739e-06, + "loss": 0.5927, + "step": 9031 + }, + { + "epoch": 2.501939058171745, + "grad_norm": 0.7041431665420532, + "learning_rate": 3.165092113916688e-06, + "loss": 0.5515, + "step": 9032 + }, + { + "epoch": 2.5022160664819943, + "grad_norm": 0.709709882736206, + "learning_rate": 3.1647409497790333e-06, + "loss": 0.6061, + "step": 9033 + }, + { + "epoch": 2.502493074792244, + "grad_norm": 0.7182940244674683, + "learning_rate": 3.1643897715272306e-06, + "loss": 0.6129, + "step": 9034 + }, + { + "epoch": 2.502770083102493, + "grad_norm": 0.7012434601783752, + "learning_rate": 3.164038579168736e-06, + "loss": 0.5421, + "step": 9035 + }, + { + "epoch": 2.5030470914127423, + "grad_norm": 0.7288820743560791, + "learning_rate": 3.1636873727110073e-06, + "loss": 0.6085, + "step": 9036 + }, + { + "epoch": 2.5033240997229917, + "grad_norm": 0.7410088777542114, + "learning_rate": 3.1633361521615e-06, + "loss": 0.5921, + "step": 9037 + }, + { + "epoch": 2.5036011080332408, + "grad_norm": 0.6875979900360107, + "learning_rate": 3.1629849175276726e-06, + "loss": 0.5737, + "step": 9038 + }, + { + "epoch": 2.5038781163434902, + "grad_norm": 0.7268178462982178, + "learning_rate": 3.162633668816983e-06, + "loss": 0.6238, + "step": 9039 + }, + { + "epoch": 2.5041551246537397, + "grad_norm": 0.7688652276992798, + "learning_rate": 3.162282406036887e-06, + "loss": 0.6311, + "step": 9040 + }, + { + "epoch": 2.5044321329639887, + "grad_norm": 0.7122343182563782, + "learning_rate": 3.161931129194845e-06, + "loss": 0.5949, + "step": 9041 + }, + { + "epoch": 2.504709141274238, + "grad_norm": 0.7419912219047546, + "learning_rate": 3.1615798382983155e-06, + "loss": 0.5822, + "step": 9042 + }, + { + "epoch": 2.5049861495844876, + "grad_norm": 0.7522318959236145, + "learning_rate": 3.1612285333547556e-06, + "loss": 0.6055, + "step": 9043 + }, + { + "epoch": 2.5052631578947366, + "grad_norm": 0.7779367566108704, + "learning_rate": 3.160877214371625e-06, + "loss": 0.6163, + "step": 9044 + }, + { + "epoch": 2.505540166204986, + "grad_norm": 0.6923019289970398, + "learning_rate": 3.160525881356385e-06, + "loss": 0.5621, + "step": 9045 + }, + { + "epoch": 2.5058171745152356, + "grad_norm": 0.7429254055023193, + "learning_rate": 3.1601745343164917e-06, + "loss": 0.6012, + "step": 9046 + }, + { + "epoch": 2.5060941828254846, + "grad_norm": 0.713860034942627, + "learning_rate": 3.159823173259408e-06, + "loss": 0.579, + "step": 9047 + }, + { + "epoch": 2.506371191135734, + "grad_norm": 0.7542449235916138, + "learning_rate": 3.1594717981925945e-06, + "loss": 0.5522, + "step": 9048 + }, + { + "epoch": 2.5066481994459835, + "grad_norm": 0.7367417216300964, + "learning_rate": 3.159120409123509e-06, + "loss": 0.5463, + "step": 9049 + }, + { + "epoch": 2.5069252077562325, + "grad_norm": 0.714651882648468, + "learning_rate": 3.1587690060596143e-06, + "loss": 0.606, + "step": 9050 + }, + { + "epoch": 2.507202216066482, + "grad_norm": 0.7805426120758057, + "learning_rate": 3.1584175890083724e-06, + "loss": 0.6347, + "step": 9051 + }, + { + "epoch": 2.5074792243767314, + "grad_norm": 0.68819260597229, + "learning_rate": 3.1580661579772425e-06, + "loss": 0.6517, + "step": 9052 + }, + { + "epoch": 2.5077562326869804, + "grad_norm": 0.7432016730308533, + "learning_rate": 3.1577147129736875e-06, + "loss": 0.5857, + "step": 9053 + }, + { + "epoch": 2.50803324099723, + "grad_norm": 0.7592369914054871, + "learning_rate": 3.1573632540051702e-06, + "loss": 0.6106, + "step": 9054 + }, + { + "epoch": 2.5083102493074794, + "grad_norm": 0.7354898452758789, + "learning_rate": 3.157011781079152e-06, + "loss": 0.5963, + "step": 9055 + }, + { + "epoch": 2.5085872576177284, + "grad_norm": 0.7453470826148987, + "learning_rate": 3.156660294203096e-06, + "loss": 0.6188, + "step": 9056 + }, + { + "epoch": 2.508864265927978, + "grad_norm": 0.70748370885849, + "learning_rate": 3.156308793384465e-06, + "loss": 0.6119, + "step": 9057 + }, + { + "epoch": 2.5091412742382273, + "grad_norm": 0.7526542544364929, + "learning_rate": 3.1559572786307225e-06, + "loss": 0.5699, + "step": 9058 + }, + { + "epoch": 2.5094182825484763, + "grad_norm": 0.7344613075256348, + "learning_rate": 3.1556057499493307e-06, + "loss": 0.56, + "step": 9059 + }, + { + "epoch": 2.509695290858726, + "grad_norm": 0.7244347333908081, + "learning_rate": 3.1552542073477554e-06, + "loss": 0.6521, + "step": 9060 + }, + { + "epoch": 2.5099722991689752, + "grad_norm": 0.8829190731048584, + "learning_rate": 3.1549026508334607e-06, + "loss": 0.627, + "step": 9061 + }, + { + "epoch": 2.5102493074792243, + "grad_norm": 0.7521383762359619, + "learning_rate": 3.154551080413909e-06, + "loss": 0.5615, + "step": 9062 + }, + { + "epoch": 2.5105263157894737, + "grad_norm": 0.7531755566596985, + "learning_rate": 3.1541994960965674e-06, + "loss": 0.5935, + "step": 9063 + }, + { + "epoch": 2.510803324099723, + "grad_norm": 0.7439650893211365, + "learning_rate": 3.153847897888899e-06, + "loss": 0.5532, + "step": 9064 + }, + { + "epoch": 2.511080332409972, + "grad_norm": 0.746944010257721, + "learning_rate": 3.153496285798371e-06, + "loss": 0.5943, + "step": 9065 + }, + { + "epoch": 2.5113573407202217, + "grad_norm": 0.7312531471252441, + "learning_rate": 3.153144659832447e-06, + "loss": 0.6373, + "step": 9066 + }, + { + "epoch": 2.511634349030471, + "grad_norm": 0.741093099117279, + "learning_rate": 3.152793019998594e-06, + "loss": 0.5853, + "step": 9067 + }, + { + "epoch": 2.51191135734072, + "grad_norm": 0.7265978455543518, + "learning_rate": 3.1524413663042784e-06, + "loss": 0.5412, + "step": 9068 + }, + { + "epoch": 2.5121883656509696, + "grad_norm": 0.7318276166915894, + "learning_rate": 3.1520896987569666e-06, + "loss": 0.5864, + "step": 9069 + }, + { + "epoch": 2.512465373961219, + "grad_norm": 0.6997889280319214, + "learning_rate": 3.151738017364125e-06, + "loss": 0.5453, + "step": 9070 + }, + { + "epoch": 2.512742382271468, + "grad_norm": 0.7524303197860718, + "learning_rate": 3.1513863221332207e-06, + "loss": 0.6075, + "step": 9071 + }, + { + "epoch": 2.5130193905817175, + "grad_norm": 0.7480543255805969, + "learning_rate": 3.1510346130717217e-06, + "loss": 0.6009, + "step": 9072 + }, + { + "epoch": 2.513296398891967, + "grad_norm": 0.7288585901260376, + "learning_rate": 3.1506828901870958e-06, + "loss": 0.5811, + "step": 9073 + }, + { + "epoch": 2.513573407202216, + "grad_norm": 0.6908002495765686, + "learning_rate": 3.1503311534868097e-06, + "loss": 0.5221, + "step": 9074 + }, + { + "epoch": 2.5138504155124655, + "grad_norm": 0.7216440439224243, + "learning_rate": 3.149979402978332e-06, + "loss": 0.5988, + "step": 9075 + }, + { + "epoch": 2.514127423822715, + "grad_norm": 0.7588446140289307, + "learning_rate": 3.1496276386691327e-06, + "loss": 0.6346, + "step": 9076 + }, + { + "epoch": 2.514404432132964, + "grad_norm": 0.7544839382171631, + "learning_rate": 3.1492758605666796e-06, + "loss": 0.5734, + "step": 9077 + }, + { + "epoch": 2.5146814404432134, + "grad_norm": 0.7451032996177673, + "learning_rate": 3.1489240686784417e-06, + "loss": 0.6202, + "step": 9078 + }, + { + "epoch": 2.514958448753463, + "grad_norm": 0.7149298787117004, + "learning_rate": 3.1485722630118874e-06, + "loss": 0.5596, + "step": 9079 + }, + { + "epoch": 2.515235457063712, + "grad_norm": 0.7400791049003601, + "learning_rate": 3.1482204435744903e-06, + "loss": 0.6237, + "step": 9080 + }, + { + "epoch": 2.5155124653739613, + "grad_norm": 0.7374815940856934, + "learning_rate": 3.1478686103737154e-06, + "loss": 0.6092, + "step": 9081 + }, + { + "epoch": 2.515789473684211, + "grad_norm": 0.8207628726959229, + "learning_rate": 3.1475167634170365e-06, + "loss": 0.6153, + "step": 9082 + }, + { + "epoch": 2.51606648199446, + "grad_norm": 0.6817259788513184, + "learning_rate": 3.1471649027119232e-06, + "loss": 0.5391, + "step": 9083 + }, + { + "epoch": 2.5163434903047093, + "grad_norm": 0.737742006778717, + "learning_rate": 3.1468130282658467e-06, + "loss": 0.59, + "step": 9084 + }, + { + "epoch": 2.5166204986149583, + "grad_norm": 0.7260866761207581, + "learning_rate": 3.1464611400862766e-06, + "loss": 0.6319, + "step": 9085 + }, + { + "epoch": 2.5168975069252078, + "grad_norm": 0.6829667687416077, + "learning_rate": 3.146109238180686e-06, + "loss": 0.5478, + "step": 9086 + }, + { + "epoch": 2.517174515235457, + "grad_norm": 0.7200605869293213, + "learning_rate": 3.1457573225565472e-06, + "loss": 0.5868, + "step": 9087 + }, + { + "epoch": 2.5174515235457062, + "grad_norm": 0.7318903207778931, + "learning_rate": 3.145405393221331e-06, + "loss": 0.5995, + "step": 9088 + }, + { + "epoch": 2.5177285318559557, + "grad_norm": 0.6964850425720215, + "learning_rate": 3.145053450182509e-06, + "loss": 0.5842, + "step": 9089 + }, + { + "epoch": 2.518005540166205, + "grad_norm": 0.67149817943573, + "learning_rate": 3.144701493447556e-06, + "loss": 0.5927, + "step": 9090 + }, + { + "epoch": 2.518282548476454, + "grad_norm": 0.7514351606369019, + "learning_rate": 3.1443495230239442e-06, + "loss": 0.6169, + "step": 9091 + }, + { + "epoch": 2.5185595567867036, + "grad_norm": 0.7472149729728699, + "learning_rate": 3.1439975389191456e-06, + "loss": 0.6059, + "step": 9092 + }, + { + "epoch": 2.5188365650969526, + "grad_norm": 0.7219860553741455, + "learning_rate": 3.1436455411406356e-06, + "loss": 0.5874, + "step": 9093 + }, + { + "epoch": 2.519113573407202, + "grad_norm": 0.7607834935188293, + "learning_rate": 3.1432935296958865e-06, + "loss": 0.5884, + "step": 9094 + }, + { + "epoch": 2.5193905817174516, + "grad_norm": 0.7781618237495422, + "learning_rate": 3.1429415045923727e-06, + "loss": 0.6411, + "step": 9095 + }, + { + "epoch": 2.5196675900277006, + "grad_norm": 0.7018707394599915, + "learning_rate": 3.14258946583757e-06, + "loss": 0.5536, + "step": 9096 + }, + { + "epoch": 2.51994459833795, + "grad_norm": 0.7120254039764404, + "learning_rate": 3.142237413438951e-06, + "loss": 0.618, + "step": 9097 + }, + { + "epoch": 2.5202216066481995, + "grad_norm": 0.734156608581543, + "learning_rate": 3.1418853474039913e-06, + "loss": 0.5684, + "step": 9098 + }, + { + "epoch": 2.5204986149584485, + "grad_norm": 0.7341066002845764, + "learning_rate": 3.141533267740168e-06, + "loss": 0.5693, + "step": 9099 + }, + { + "epoch": 2.520775623268698, + "grad_norm": 0.7034329771995544, + "learning_rate": 3.1411811744549535e-06, + "loss": 0.5534, + "step": 9100 + }, + { + "epoch": 2.5210526315789474, + "grad_norm": 0.7042141556739807, + "learning_rate": 3.140829067555826e-06, + "loss": 0.5826, + "step": 9101 + }, + { + "epoch": 2.5213296398891965, + "grad_norm": 0.742813229560852, + "learning_rate": 3.1404769470502606e-06, + "loss": 0.6049, + "step": 9102 + }, + { + "epoch": 2.521606648199446, + "grad_norm": 0.7536957859992981, + "learning_rate": 3.140124812945734e-06, + "loss": 0.6111, + "step": 9103 + }, + { + "epoch": 2.5218836565096954, + "grad_norm": 0.7326040863990784, + "learning_rate": 3.139772665249723e-06, + "loss": 0.5774, + "step": 9104 + }, + { + "epoch": 2.5221606648199444, + "grad_norm": 0.7297189831733704, + "learning_rate": 3.1394205039697052e-06, + "loss": 0.6173, + "step": 9105 + }, + { + "epoch": 2.522437673130194, + "grad_norm": 0.7193797826766968, + "learning_rate": 3.1390683291131563e-06, + "loss": 0.5445, + "step": 9106 + }, + { + "epoch": 2.5227146814404433, + "grad_norm": 0.7528589367866516, + "learning_rate": 3.138716140687555e-06, + "loss": 0.5849, + "step": 9107 + }, + { + "epoch": 2.5229916897506923, + "grad_norm": 0.7352349758148193, + "learning_rate": 3.138363938700379e-06, + "loss": 0.6135, + "step": 9108 + }, + { + "epoch": 2.523268698060942, + "grad_norm": 0.7288463115692139, + "learning_rate": 3.138011723159107e-06, + "loss": 0.592, + "step": 9109 + }, + { + "epoch": 2.5235457063711912, + "grad_norm": 0.7227898836135864, + "learning_rate": 3.137659494071216e-06, + "loss": 0.5576, + "step": 9110 + }, + { + "epoch": 2.5238227146814403, + "grad_norm": 0.7009102702140808, + "learning_rate": 3.137307251444185e-06, + "loss": 0.568, + "step": 9111 + }, + { + "epoch": 2.5240997229916897, + "grad_norm": 0.7480496168136597, + "learning_rate": 3.136954995285495e-06, + "loss": 0.6032, + "step": 9112 + }, + { + "epoch": 2.524376731301939, + "grad_norm": 0.7623210549354553, + "learning_rate": 3.1366027256026226e-06, + "loss": 0.6047, + "step": 9113 + }, + { + "epoch": 2.524653739612188, + "grad_norm": 0.730629563331604, + "learning_rate": 3.136250442403049e-06, + "loss": 0.6057, + "step": 9114 + }, + { + "epoch": 2.5249307479224377, + "grad_norm": 0.7473331689834595, + "learning_rate": 3.1358981456942543e-06, + "loss": 0.5614, + "step": 9115 + }, + { + "epoch": 2.525207756232687, + "grad_norm": 0.7419679164886475, + "learning_rate": 3.1355458354837183e-06, + "loss": 0.609, + "step": 9116 + }, + { + "epoch": 2.525484764542936, + "grad_norm": 0.751901388168335, + "learning_rate": 3.13519351177892e-06, + "loss": 0.5983, + "step": 9117 + }, + { + "epoch": 2.5257617728531856, + "grad_norm": 0.7652584314346313, + "learning_rate": 3.134841174587342e-06, + "loss": 0.5973, + "step": 9118 + }, + { + "epoch": 2.526038781163435, + "grad_norm": 0.7338992953300476, + "learning_rate": 3.1344888239164647e-06, + "loss": 0.5932, + "step": 9119 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6915652751922607, + "learning_rate": 3.1341364597737684e-06, + "loss": 0.5841, + "step": 9120 + }, + { + "epoch": 2.5265927977839335, + "grad_norm": 0.7430443167686462, + "learning_rate": 3.1337840821667363e-06, + "loss": 0.6248, + "step": 9121 + }, + { + "epoch": 2.526869806094183, + "grad_norm": 0.7357264161109924, + "learning_rate": 3.1334316911028508e-06, + "loss": 0.5925, + "step": 9122 + }, + { + "epoch": 2.527146814404432, + "grad_norm": 0.6949090361595154, + "learning_rate": 3.1330792865895914e-06, + "loss": 0.602, + "step": 9123 + }, + { + "epoch": 2.5274238227146815, + "grad_norm": 0.7181215286254883, + "learning_rate": 3.1327268686344426e-06, + "loss": 0.5912, + "step": 9124 + }, + { + "epoch": 2.527700831024931, + "grad_norm": 0.7383121252059937, + "learning_rate": 3.1323744372448867e-06, + "loss": 0.588, + "step": 9125 + }, + { + "epoch": 2.52797783933518, + "grad_norm": 0.7196062803268433, + "learning_rate": 3.1320219924284055e-06, + "loss": 0.5496, + "step": 9126 + }, + { + "epoch": 2.5282548476454294, + "grad_norm": 0.7350928783416748, + "learning_rate": 3.1316695341924847e-06, + "loss": 0.577, + "step": 9127 + }, + { + "epoch": 2.528531855955679, + "grad_norm": 0.6696033477783203, + "learning_rate": 3.1313170625446064e-06, + "loss": 0.5497, + "step": 9128 + }, + { + "epoch": 2.528808864265928, + "grad_norm": 0.7050279974937439, + "learning_rate": 3.1309645774922544e-06, + "loss": 0.5604, + "step": 9129 + }, + { + "epoch": 2.5290858725761773, + "grad_norm": 0.7316223382949829, + "learning_rate": 3.130612079042913e-06, + "loss": 0.5885, + "step": 9130 + }, + { + "epoch": 2.529362880886427, + "grad_norm": 0.7581227421760559, + "learning_rate": 3.130259567204067e-06, + "loss": 0.6254, + "step": 9131 + }, + { + "epoch": 2.529639889196676, + "grad_norm": 0.7304221391677856, + "learning_rate": 3.1299070419832006e-06, + "loss": 0.6362, + "step": 9132 + }, + { + "epoch": 2.5299168975069253, + "grad_norm": 0.7422270178794861, + "learning_rate": 3.1295545033877987e-06, + "loss": 0.5861, + "step": 9133 + }, + { + "epoch": 2.5301939058171747, + "grad_norm": 0.7402244210243225, + "learning_rate": 3.1292019514253478e-06, + "loss": 0.5923, + "step": 9134 + }, + { + "epoch": 2.5304709141274238, + "grad_norm": 0.7696836590766907, + "learning_rate": 3.1288493861033327e-06, + "loss": 0.5708, + "step": 9135 + }, + { + "epoch": 2.530747922437673, + "grad_norm": 0.7355846762657166, + "learning_rate": 3.1284968074292388e-06, + "loss": 0.5911, + "step": 9136 + }, + { + "epoch": 2.5310249307479227, + "grad_norm": 0.7229421734809875, + "learning_rate": 3.1281442154105528e-06, + "loss": 0.6072, + "step": 9137 + }, + { + "epoch": 2.5313019390581717, + "grad_norm": 0.7466177344322205, + "learning_rate": 3.1277916100547613e-06, + "loss": 0.6076, + "step": 9138 + }, + { + "epoch": 2.531578947368421, + "grad_norm": 0.7761610746383667, + "learning_rate": 3.12743899136935e-06, + "loss": 0.5949, + "step": 9139 + }, + { + "epoch": 2.5318559556786706, + "grad_norm": 0.7328271269798279, + "learning_rate": 3.1270863593618066e-06, + "loss": 0.5951, + "step": 9140 + }, + { + "epoch": 2.5321329639889196, + "grad_norm": 0.7607014775276184, + "learning_rate": 3.1267337140396193e-06, + "loss": 0.5789, + "step": 9141 + }, + { + "epoch": 2.532409972299169, + "grad_norm": 0.7491313815116882, + "learning_rate": 3.126381055410274e-06, + "loss": 0.635, + "step": 9142 + }, + { + "epoch": 2.5326869806094185, + "grad_norm": 0.7202014327049255, + "learning_rate": 3.1260283834812595e-06, + "loss": 0.6184, + "step": 9143 + }, + { + "epoch": 2.5329639889196676, + "grad_norm": 0.7408089637756348, + "learning_rate": 3.1256756982600642e-06, + "loss": 0.5675, + "step": 9144 + }, + { + "epoch": 2.533240997229917, + "grad_norm": 0.7064687013626099, + "learning_rate": 3.125322999754175e-06, + "loss": 0.5815, + "step": 9145 + }, + { + "epoch": 2.533518005540166, + "grad_norm": 0.7416212558746338, + "learning_rate": 3.124970287971082e-06, + "loss": 0.6292, + "step": 9146 + }, + { + "epoch": 2.5337950138504155, + "grad_norm": 0.7549963593482971, + "learning_rate": 3.1246175629182744e-06, + "loss": 0.592, + "step": 9147 + }, + { + "epoch": 2.534072022160665, + "grad_norm": 0.6834939122200012, + "learning_rate": 3.12426482460324e-06, + "loss": 0.5995, + "step": 9148 + }, + { + "epoch": 2.534349030470914, + "grad_norm": 0.7434266209602356, + "learning_rate": 3.12391207303347e-06, + "loss": 0.6189, + "step": 9149 + }, + { + "epoch": 2.5346260387811634, + "grad_norm": 0.718361496925354, + "learning_rate": 3.1235593082164527e-06, + "loss": 0.5777, + "step": 9150 + }, + { + "epoch": 2.534903047091413, + "grad_norm": 0.7535122036933899, + "learning_rate": 3.123206530159679e-06, + "loss": 0.6064, + "step": 9151 + }, + { + "epoch": 2.535180055401662, + "grad_norm": 0.7786555290222168, + "learning_rate": 3.1228537388706392e-06, + "loss": 0.5837, + "step": 9152 + }, + { + "epoch": 2.5354570637119114, + "grad_norm": 0.7389187812805176, + "learning_rate": 3.122500934356825e-06, + "loss": 0.5584, + "step": 9153 + }, + { + "epoch": 2.5357340720221604, + "grad_norm": 0.7314004898071289, + "learning_rate": 3.122148116625725e-06, + "loss": 0.5931, + "step": 9154 + }, + { + "epoch": 2.53601108033241, + "grad_norm": 0.7248198390007019, + "learning_rate": 3.121795285684832e-06, + "loss": 0.6388, + "step": 9155 + }, + { + "epoch": 2.5362880886426593, + "grad_norm": 0.6863377690315247, + "learning_rate": 3.1214424415416373e-06, + "loss": 0.5441, + "step": 9156 + }, + { + "epoch": 2.5365650969529083, + "grad_norm": 0.7189471125602722, + "learning_rate": 3.1210895842036326e-06, + "loss": 0.5627, + "step": 9157 + }, + { + "epoch": 2.536842105263158, + "grad_norm": 0.736508309841156, + "learning_rate": 3.1207367136783096e-06, + "loss": 0.588, + "step": 9158 + }, + { + "epoch": 2.5371191135734072, + "grad_norm": 0.7444504499435425, + "learning_rate": 3.1203838299731607e-06, + "loss": 0.5952, + "step": 9159 + }, + { + "epoch": 2.5373961218836563, + "grad_norm": 0.718096911907196, + "learning_rate": 3.12003093309568e-06, + "loss": 0.5898, + "step": 9160 + }, + { + "epoch": 2.5376731301939057, + "grad_norm": 0.7805988192558289, + "learning_rate": 3.119678023053358e-06, + "loss": 0.5813, + "step": 9161 + }, + { + "epoch": 2.537950138504155, + "grad_norm": 0.7315322756767273, + "learning_rate": 3.1193250998536894e-06, + "loss": 0.5954, + "step": 9162 + }, + { + "epoch": 2.538227146814404, + "grad_norm": 0.744376540184021, + "learning_rate": 3.118972163504168e-06, + "loss": 0.638, + "step": 9163 + }, + { + "epoch": 2.5385041551246537, + "grad_norm": 0.7060050964355469, + "learning_rate": 3.1186192140122863e-06, + "loss": 0.5638, + "step": 9164 + }, + { + "epoch": 2.538781163434903, + "grad_norm": 0.7389654517173767, + "learning_rate": 3.118266251385539e-06, + "loss": 0.5786, + "step": 9165 + }, + { + "epoch": 2.539058171745152, + "grad_norm": 0.8068351745605469, + "learning_rate": 3.1179132756314205e-06, + "loss": 0.649, + "step": 9166 + }, + { + "epoch": 2.5393351800554016, + "grad_norm": 0.7584858536720276, + "learning_rate": 3.117560286757425e-06, + "loss": 0.5953, + "step": 9167 + }, + { + "epoch": 2.539612188365651, + "grad_norm": 0.703183114528656, + "learning_rate": 3.117207284771048e-06, + "loss": 0.6066, + "step": 9168 + }, + { + "epoch": 2.5398891966759, + "grad_norm": 0.6991854906082153, + "learning_rate": 3.116854269679783e-06, + "loss": 0.545, + "step": 9169 + }, + { + "epoch": 2.5401662049861495, + "grad_norm": 0.7469720244407654, + "learning_rate": 3.116501241491128e-06, + "loss": 0.5961, + "step": 9170 + }, + { + "epoch": 2.540443213296399, + "grad_norm": 0.7148388028144836, + "learning_rate": 3.116148200212576e-06, + "loss": 0.5065, + "step": 9171 + }, + { + "epoch": 2.540720221606648, + "grad_norm": 0.7173408269882202, + "learning_rate": 3.115795145851625e-06, + "loss": 0.5957, + "step": 9172 + }, + { + "epoch": 2.5409972299168975, + "grad_norm": 0.7517775893211365, + "learning_rate": 3.11544207841577e-06, + "loss": 0.5684, + "step": 9173 + }, + { + "epoch": 2.541274238227147, + "grad_norm": 0.743894636631012, + "learning_rate": 3.115088997912508e-06, + "loss": 0.5854, + "step": 9174 + }, + { + "epoch": 2.541551246537396, + "grad_norm": 0.7796741724014282, + "learning_rate": 3.1147359043493365e-06, + "loss": 0.6541, + "step": 9175 + }, + { + "epoch": 2.5418282548476454, + "grad_norm": 0.7304288744926453, + "learning_rate": 3.1143827977337516e-06, + "loss": 0.5804, + "step": 9176 + }, + { + "epoch": 2.542105263157895, + "grad_norm": 0.722074031829834, + "learning_rate": 3.114029678073251e-06, + "loss": 0.596, + "step": 9177 + }, + { + "epoch": 2.542382271468144, + "grad_norm": 0.7742172479629517, + "learning_rate": 3.113676545375332e-06, + "loss": 0.5868, + "step": 9178 + }, + { + "epoch": 2.5426592797783933, + "grad_norm": 0.7246047854423523, + "learning_rate": 3.1133233996474933e-06, + "loss": 0.6096, + "step": 9179 + }, + { + "epoch": 2.542936288088643, + "grad_norm": 0.8202196359634399, + "learning_rate": 3.1129702408972316e-06, + "loss": 0.5459, + "step": 9180 + }, + { + "epoch": 2.543213296398892, + "grad_norm": 0.7334191799163818, + "learning_rate": 3.1126170691320466e-06, + "loss": 0.598, + "step": 9181 + }, + { + "epoch": 2.5434903047091413, + "grad_norm": 0.7115674614906311, + "learning_rate": 3.1122638843594377e-06, + "loss": 0.6247, + "step": 9182 + }, + { + "epoch": 2.5437673130193907, + "grad_norm": 0.6812747716903687, + "learning_rate": 3.1119106865869026e-06, + "loss": 0.5424, + "step": 9183 + }, + { + "epoch": 2.5440443213296398, + "grad_norm": 0.760249137878418, + "learning_rate": 3.1115574758219404e-06, + "loss": 0.6014, + "step": 9184 + }, + { + "epoch": 2.544321329639889, + "grad_norm": 0.7050676345825195, + "learning_rate": 3.111204252072052e-06, + "loss": 0.5766, + "step": 9185 + }, + { + "epoch": 2.5445983379501387, + "grad_norm": 0.7156957983970642, + "learning_rate": 3.1108510153447352e-06, + "loss": 0.5683, + "step": 9186 + }, + { + "epoch": 2.5448753462603877, + "grad_norm": 0.744422197341919, + "learning_rate": 3.1104977656474923e-06, + "loss": 0.587, + "step": 9187 + }, + { + "epoch": 2.545152354570637, + "grad_norm": 0.7329248785972595, + "learning_rate": 3.110144502987823e-06, + "loss": 0.5842, + "step": 9188 + }, + { + "epoch": 2.5454293628808866, + "grad_norm": 0.7504559755325317, + "learning_rate": 3.109791227373227e-06, + "loss": 0.614, + "step": 9189 + }, + { + "epoch": 2.5457063711911356, + "grad_norm": 0.7142932415008545, + "learning_rate": 3.109437938811207e-06, + "loss": 0.5663, + "step": 9190 + }, + { + "epoch": 2.545983379501385, + "grad_norm": 0.7647702693939209, + "learning_rate": 3.1090846373092624e-06, + "loss": 0.6442, + "step": 9191 + }, + { + "epoch": 2.5462603878116346, + "grad_norm": 0.7035728096961975, + "learning_rate": 3.108731322874896e-06, + "loss": 0.6074, + "step": 9192 + }, + { + "epoch": 2.5465373961218836, + "grad_norm": 0.7685317397117615, + "learning_rate": 3.108377995515609e-06, + "loss": 0.6127, + "step": 9193 + }, + { + "epoch": 2.546814404432133, + "grad_norm": 0.7221921682357788, + "learning_rate": 3.108024655238903e-06, + "loss": 0.5644, + "step": 9194 + }, + { + "epoch": 2.5470914127423825, + "grad_norm": 0.7649849653244019, + "learning_rate": 3.107671302052282e-06, + "loss": 0.6145, + "step": 9195 + }, + { + "epoch": 2.5473684210526315, + "grad_norm": 0.7878473401069641, + "learning_rate": 3.1073179359632465e-06, + "loss": 0.5651, + "step": 9196 + }, + { + "epoch": 2.547645429362881, + "grad_norm": 0.7036198377609253, + "learning_rate": 3.1069645569793004e-06, + "loss": 0.5796, + "step": 9197 + }, + { + "epoch": 2.5479224376731304, + "grad_norm": 0.7540354132652283, + "learning_rate": 3.1066111651079473e-06, + "loss": 0.6188, + "step": 9198 + }, + { + "epoch": 2.5481994459833794, + "grad_norm": 0.7185037732124329, + "learning_rate": 3.106257760356689e-06, + "loss": 0.5986, + "step": 9199 + }, + { + "epoch": 2.548476454293629, + "grad_norm": 0.7159909605979919, + "learning_rate": 3.1059043427330317e-06, + "loss": 0.5796, + "step": 9200 + }, + { + "epoch": 2.5487534626038784, + "grad_norm": 0.7194716930389404, + "learning_rate": 3.1055509122444768e-06, + "loss": 0.5852, + "step": 9201 + }, + { + "epoch": 2.5490304709141274, + "grad_norm": 0.7099069356918335, + "learning_rate": 3.1051974688985292e-06, + "loss": 0.5592, + "step": 9202 + }, + { + "epoch": 2.549307479224377, + "grad_norm": 0.7702147960662842, + "learning_rate": 3.1048440127026946e-06, + "loss": 0.5597, + "step": 9203 + }, + { + "epoch": 2.5495844875346263, + "grad_norm": 0.742659866809845, + "learning_rate": 3.1044905436644772e-06, + "loss": 0.5495, + "step": 9204 + }, + { + "epoch": 2.5498614958448753, + "grad_norm": 0.7312626242637634, + "learning_rate": 3.1041370617913818e-06, + "loss": 0.6404, + "step": 9205 + }, + { + "epoch": 2.5501385041551248, + "grad_norm": 0.7318844795227051, + "learning_rate": 3.1037835670909134e-06, + "loss": 0.6033, + "step": 9206 + }, + { + "epoch": 2.5504155124653742, + "grad_norm": 0.7092210650444031, + "learning_rate": 3.1034300595705784e-06, + "loss": 0.5582, + "step": 9207 + }, + { + "epoch": 2.5506925207756233, + "grad_norm": 0.7274510860443115, + "learning_rate": 3.1030765392378813e-06, + "loss": 0.6148, + "step": 9208 + }, + { + "epoch": 2.5509695290858727, + "grad_norm": 0.7525596022605896, + "learning_rate": 3.10272300610033e-06, + "loss": 0.5924, + "step": 9209 + }, + { + "epoch": 2.5512465373961217, + "grad_norm": 0.7422453761100769, + "learning_rate": 3.10236946016543e-06, + "loss": 0.5386, + "step": 9210 + }, + { + "epoch": 2.551523545706371, + "grad_norm": 0.7085767984390259, + "learning_rate": 3.1020159014406885e-06, + "loss": 0.5287, + "step": 9211 + }, + { + "epoch": 2.5518005540166206, + "grad_norm": 0.7217498421669006, + "learning_rate": 3.101662329933611e-06, + "loss": 0.5654, + "step": 9212 + }, + { + "epoch": 2.5520775623268697, + "grad_norm": 0.7595999836921692, + "learning_rate": 3.1013087456517064e-06, + "loss": 0.6237, + "step": 9213 + }, + { + "epoch": 2.552354570637119, + "grad_norm": 0.7279402017593384, + "learning_rate": 3.1009551486024814e-06, + "loss": 0.6141, + "step": 9214 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.7226688861846924, + "learning_rate": 3.100601538793444e-06, + "loss": 0.6513, + "step": 9215 + }, + { + "epoch": 2.5529085872576176, + "grad_norm": 0.741881787776947, + "learning_rate": 3.100247916232102e-06, + "loss": 0.603, + "step": 9216 + }, + { + "epoch": 2.553185595567867, + "grad_norm": 0.7893657684326172, + "learning_rate": 3.0998942809259643e-06, + "loss": 0.6009, + "step": 9217 + }, + { + "epoch": 2.553462603878116, + "grad_norm": 0.7880337238311768, + "learning_rate": 3.099540632882539e-06, + "loss": 0.6172, + "step": 9218 + }, + { + "epoch": 2.5537396121883655, + "grad_norm": 0.7472859621047974, + "learning_rate": 3.0991869721093342e-06, + "loss": 0.5933, + "step": 9219 + }, + { + "epoch": 2.554016620498615, + "grad_norm": 0.7323741316795349, + "learning_rate": 3.098833298613861e-06, + "loss": 0.6109, + "step": 9220 + }, + { + "epoch": 2.554293628808864, + "grad_norm": 0.7406541109085083, + "learning_rate": 3.0984796124036267e-06, + "loss": 0.5547, + "step": 9221 + }, + { + "epoch": 2.5545706371191135, + "grad_norm": 0.7627723813056946, + "learning_rate": 3.0981259134861426e-06, + "loss": 0.6178, + "step": 9222 + }, + { + "epoch": 2.554847645429363, + "grad_norm": 0.7222011089324951, + "learning_rate": 3.097772201868917e-06, + "loss": 0.5889, + "step": 9223 + }, + { + "epoch": 2.555124653739612, + "grad_norm": 0.7353097200393677, + "learning_rate": 3.0974184775594616e-06, + "loss": 0.5875, + "step": 9224 + }, + { + "epoch": 2.5554016620498614, + "grad_norm": 0.7108656167984009, + "learning_rate": 3.097064740565286e-06, + "loss": 0.5561, + "step": 9225 + }, + { + "epoch": 2.555678670360111, + "grad_norm": 0.7538282871246338, + "learning_rate": 3.096710990893902e-06, + "loss": 0.6066, + "step": 9226 + }, + { + "epoch": 2.55595567867036, + "grad_norm": 0.7404365539550781, + "learning_rate": 3.0963572285528186e-06, + "loss": 0.6072, + "step": 9227 + }, + { + "epoch": 2.5562326869806093, + "grad_norm": 0.7235316038131714, + "learning_rate": 3.0960034535495493e-06, + "loss": 0.632, + "step": 9228 + }, + { + "epoch": 2.556509695290859, + "grad_norm": 0.744694173336029, + "learning_rate": 3.0956496658916048e-06, + "loss": 0.6258, + "step": 9229 + }, + { + "epoch": 2.556786703601108, + "grad_norm": 0.7643329501152039, + "learning_rate": 3.0952958655864957e-06, + "loss": 0.6022, + "step": 9230 + }, + { + "epoch": 2.5570637119113573, + "grad_norm": 0.7612898349761963, + "learning_rate": 3.0949420526417357e-06, + "loss": 0.6229, + "step": 9231 + }, + { + "epoch": 2.5573407202216067, + "grad_norm": 0.7228426933288574, + "learning_rate": 3.094588227064837e-06, + "loss": 0.6418, + "step": 9232 + }, + { + "epoch": 2.5576177285318558, + "grad_norm": 0.7141867280006409, + "learning_rate": 3.0942343888633114e-06, + "loss": 0.5661, + "step": 9233 + }, + { + "epoch": 2.557894736842105, + "grad_norm": 0.7086227536201477, + "learning_rate": 3.0938805380446717e-06, + "loss": 0.582, + "step": 9234 + }, + { + "epoch": 2.5581717451523547, + "grad_norm": 0.6948062181472778, + "learning_rate": 3.0935266746164326e-06, + "loss": 0.5512, + "step": 9235 + }, + { + "epoch": 2.5584487534626037, + "grad_norm": 0.7016825675964355, + "learning_rate": 3.093172798586106e-06, + "loss": 0.6331, + "step": 9236 + }, + { + "epoch": 2.558725761772853, + "grad_norm": 0.774544358253479, + "learning_rate": 3.092818909961206e-06, + "loss": 0.5849, + "step": 9237 + }, + { + "epoch": 2.5590027700831026, + "grad_norm": 0.7048248052597046, + "learning_rate": 3.092465008749247e-06, + "loss": 0.5755, + "step": 9238 + }, + { + "epoch": 2.5592797783933516, + "grad_norm": 0.7674547433853149, + "learning_rate": 3.0921110949577425e-06, + "loss": 0.6052, + "step": 9239 + }, + { + "epoch": 2.559556786703601, + "grad_norm": 0.7589477896690369, + "learning_rate": 3.091757168594208e-06, + "loss": 0.6178, + "step": 9240 + }, + { + "epoch": 2.5598337950138506, + "grad_norm": 0.7594172358512878, + "learning_rate": 3.0914032296661566e-06, + "loss": 0.6464, + "step": 9241 + }, + { + "epoch": 2.5601108033240996, + "grad_norm": 0.7501422166824341, + "learning_rate": 3.0910492781811053e-06, + "loss": 0.623, + "step": 9242 + }, + { + "epoch": 2.560387811634349, + "grad_norm": 0.7276908755302429, + "learning_rate": 3.0906953141465677e-06, + "loss": 0.5813, + "step": 9243 + }, + { + "epoch": 2.5606648199445985, + "grad_norm": 0.7989957928657532, + "learning_rate": 3.09034133757006e-06, + "loss": 0.6081, + "step": 9244 + }, + { + "epoch": 2.5609418282548475, + "grad_norm": 0.754078209400177, + "learning_rate": 3.0899873484590994e-06, + "loss": 0.5448, + "step": 9245 + }, + { + "epoch": 2.561218836565097, + "grad_norm": 0.7445813417434692, + "learning_rate": 3.0896333468212e-06, + "loss": 0.5775, + "step": 9246 + }, + { + "epoch": 2.5614958448753464, + "grad_norm": 0.7109984159469604, + "learning_rate": 3.089279332663878e-06, + "loss": 0.5937, + "step": 9247 + }, + { + "epoch": 2.5617728531855954, + "grad_norm": 0.712151050567627, + "learning_rate": 3.088925305994652e-06, + "loss": 0.6057, + "step": 9248 + }, + { + "epoch": 2.562049861495845, + "grad_norm": 0.7844370603561401, + "learning_rate": 3.0885712668210376e-06, + "loss": 0.5883, + "step": 9249 + }, + { + "epoch": 2.5623268698060944, + "grad_norm": 0.747381329536438, + "learning_rate": 3.0882172151505523e-06, + "loss": 0.589, + "step": 9250 + }, + { + "epoch": 2.5626038781163434, + "grad_norm": 0.7583556175231934, + "learning_rate": 3.0878631509907127e-06, + "loss": 0.5968, + "step": 9251 + }, + { + "epoch": 2.562880886426593, + "grad_norm": 0.7440918684005737, + "learning_rate": 3.0875090743490383e-06, + "loss": 0.5754, + "step": 9252 + }, + { + "epoch": 2.5631578947368423, + "grad_norm": 0.7422070503234863, + "learning_rate": 3.087154985233045e-06, + "loss": 0.5908, + "step": 9253 + }, + { + "epoch": 2.5634349030470913, + "grad_norm": 0.8005231618881226, + "learning_rate": 3.086800883650252e-06, + "loss": 0.6444, + "step": 9254 + }, + { + "epoch": 2.563711911357341, + "grad_norm": 0.7100370526313782, + "learning_rate": 3.0864467696081785e-06, + "loss": 0.5453, + "step": 9255 + }, + { + "epoch": 2.5639889196675902, + "grad_norm": 0.6949264407157898, + "learning_rate": 3.086092643114342e-06, + "loss": 0.5595, + "step": 9256 + }, + { + "epoch": 2.5642659279778393, + "grad_norm": 0.7308492064476013, + "learning_rate": 3.085738504176261e-06, + "loss": 0.5984, + "step": 9257 + }, + { + "epoch": 2.5645429362880887, + "grad_norm": 0.7675203680992126, + "learning_rate": 3.085384352801457e-06, + "loss": 0.6014, + "step": 9258 + }, + { + "epoch": 2.564819944598338, + "grad_norm": 0.7616082429885864, + "learning_rate": 3.0850301889974477e-06, + "loss": 0.5905, + "step": 9259 + }, + { + "epoch": 2.565096952908587, + "grad_norm": 0.7435650825500488, + "learning_rate": 3.0846760127717534e-06, + "loss": 0.5649, + "step": 9260 + }, + { + "epoch": 2.5653739612188367, + "grad_norm": 0.7471524477005005, + "learning_rate": 3.084321824131895e-06, + "loss": 0.5982, + "step": 9261 + }, + { + "epoch": 2.565650969529086, + "grad_norm": 0.7317099571228027, + "learning_rate": 3.083967623085391e-06, + "loss": 0.5763, + "step": 9262 + }, + { + "epoch": 2.565927977839335, + "grad_norm": 0.7465954422950745, + "learning_rate": 3.0836134096397642e-06, + "loss": 0.6037, + "step": 9263 + }, + { + "epoch": 2.5662049861495846, + "grad_norm": 0.7400816082954407, + "learning_rate": 3.0832591838025337e-06, + "loss": 0.603, + "step": 9264 + }, + { + "epoch": 2.566481994459834, + "grad_norm": 0.7448139190673828, + "learning_rate": 3.0829049455812215e-06, + "loss": 0.5512, + "step": 9265 + }, + { + "epoch": 2.566759002770083, + "grad_norm": 0.7737401723861694, + "learning_rate": 3.0825506949833477e-06, + "loss": 0.6541, + "step": 9266 + }, + { + "epoch": 2.5670360110803325, + "grad_norm": 0.7257832884788513, + "learning_rate": 3.082196432016436e-06, + "loss": 0.5674, + "step": 9267 + }, + { + "epoch": 2.567313019390582, + "grad_norm": 0.7949074506759644, + "learning_rate": 3.0818421566880075e-06, + "loss": 0.6083, + "step": 9268 + }, + { + "epoch": 2.567590027700831, + "grad_norm": 0.703048050403595, + "learning_rate": 3.0814878690055836e-06, + "loss": 0.5917, + "step": 9269 + }, + { + "epoch": 2.5678670360110805, + "grad_norm": 0.7743631601333618, + "learning_rate": 3.081133568976687e-06, + "loss": 0.605, + "step": 9270 + }, + { + "epoch": 2.5681440443213295, + "grad_norm": 0.7685450911521912, + "learning_rate": 3.0807792566088417e-06, + "loss": 0.6056, + "step": 9271 + }, + { + "epoch": 2.568421052631579, + "grad_norm": 0.6992116570472717, + "learning_rate": 3.0804249319095686e-06, + "loss": 0.5336, + "step": 9272 + }, + { + "epoch": 2.5686980609418284, + "grad_norm": 0.7721379995346069, + "learning_rate": 3.0800705948863922e-06, + "loss": 0.5926, + "step": 9273 + }, + { + "epoch": 2.5689750692520774, + "grad_norm": 0.6962541937828064, + "learning_rate": 3.0797162455468367e-06, + "loss": 0.5648, + "step": 9274 + }, + { + "epoch": 2.569252077562327, + "grad_norm": 0.7683770656585693, + "learning_rate": 3.0793618838984233e-06, + "loss": 0.5939, + "step": 9275 + }, + { + "epoch": 2.5695290858725763, + "grad_norm": 0.709267795085907, + "learning_rate": 3.0790075099486787e-06, + "loss": 0.5971, + "step": 9276 + }, + { + "epoch": 2.5698060941828254, + "grad_norm": 0.7617084383964539, + "learning_rate": 3.078653123705126e-06, + "loss": 0.5788, + "step": 9277 + }, + { + "epoch": 2.570083102493075, + "grad_norm": 0.7246497273445129, + "learning_rate": 3.0782987251752894e-06, + "loss": 0.6082, + "step": 9278 + }, + { + "epoch": 2.570360110803324, + "grad_norm": 0.7413263916969299, + "learning_rate": 3.077944314366694e-06, + "loss": 0.6204, + "step": 9279 + }, + { + "epoch": 2.5706371191135733, + "grad_norm": 0.7515942454338074, + "learning_rate": 3.0775898912868653e-06, + "loss": 0.5992, + "step": 9280 + }, + { + "epoch": 2.5709141274238227, + "grad_norm": 0.7281467318534851, + "learning_rate": 3.077235455943328e-06, + "loss": 0.6147, + "step": 9281 + }, + { + "epoch": 2.5711911357340718, + "grad_norm": 0.746329128742218, + "learning_rate": 3.0768810083436072e-06, + "loss": 0.5948, + "step": 9282 + }, + { + "epoch": 2.5714681440443212, + "grad_norm": 0.7518007755279541, + "learning_rate": 3.076526548495231e-06, + "loss": 0.572, + "step": 9283 + }, + { + "epoch": 2.5717451523545707, + "grad_norm": 0.7037748098373413, + "learning_rate": 3.0761720764057222e-06, + "loss": 0.5791, + "step": 9284 + }, + { + "epoch": 2.5720221606648197, + "grad_norm": 0.7124174237251282, + "learning_rate": 3.0758175920826096e-06, + "loss": 0.558, + "step": 9285 + }, + { + "epoch": 2.572299168975069, + "grad_norm": 0.7646774649620056, + "learning_rate": 3.0754630955334187e-06, + "loss": 0.5765, + "step": 9286 + }, + { + "epoch": 2.5725761772853186, + "grad_norm": 0.7965149283409119, + "learning_rate": 3.075108586765677e-06, + "loss": 0.5765, + "step": 9287 + }, + { + "epoch": 2.5728531855955676, + "grad_norm": 0.7802436351776123, + "learning_rate": 3.0747540657869113e-06, + "loss": 0.598, + "step": 9288 + }, + { + "epoch": 2.573130193905817, + "grad_norm": 0.742306649684906, + "learning_rate": 3.0743995326046487e-06, + "loss": 0.564, + "step": 9289 + }, + { + "epoch": 2.5734072022160666, + "grad_norm": 0.7256410717964172, + "learning_rate": 3.0740449872264178e-06, + "loss": 0.6033, + "step": 9290 + }, + { + "epoch": 2.5736842105263156, + "grad_norm": 0.7125851511955261, + "learning_rate": 3.0736904296597457e-06, + "loss": 0.5585, + "step": 9291 + }, + { + "epoch": 2.573961218836565, + "grad_norm": 0.7355688810348511, + "learning_rate": 3.07333585991216e-06, + "loss": 0.5999, + "step": 9292 + }, + { + "epoch": 2.5742382271468145, + "grad_norm": 0.7512667775154114, + "learning_rate": 3.072981277991191e-06, + "loss": 0.5896, + "step": 9293 + }, + { + "epoch": 2.5745152354570635, + "grad_norm": 0.7280569076538086, + "learning_rate": 3.0726266839043656e-06, + "loss": 0.5661, + "step": 9294 + }, + { + "epoch": 2.574792243767313, + "grad_norm": 0.707859218120575, + "learning_rate": 3.0722720776592136e-06, + "loss": 0.5933, + "step": 9295 + }, + { + "epoch": 2.5750692520775624, + "grad_norm": 0.6893876194953918, + "learning_rate": 3.071917459263264e-06, + "loss": 0.5667, + "step": 9296 + }, + { + "epoch": 2.5753462603878114, + "grad_norm": 0.7232053875923157, + "learning_rate": 3.071562828724046e-06, + "loss": 0.6296, + "step": 9297 + }, + { + "epoch": 2.575623268698061, + "grad_norm": 0.7339138984680176, + "learning_rate": 3.0712081860490887e-06, + "loss": 0.6231, + "step": 9298 + }, + { + "epoch": 2.5759002770083104, + "grad_norm": 0.7207698822021484, + "learning_rate": 3.0708535312459244e-06, + "loss": 0.6124, + "step": 9299 + }, + { + "epoch": 2.5761772853185594, + "grad_norm": 0.7031732797622681, + "learning_rate": 3.070498864322081e-06, + "loss": 0.5907, + "step": 9300 + }, + { + "epoch": 2.576454293628809, + "grad_norm": 0.7313705086708069, + "learning_rate": 3.07014418528509e-06, + "loss": 0.6229, + "step": 9301 + }, + { + "epoch": 2.5767313019390583, + "grad_norm": 0.7237061262130737, + "learning_rate": 3.069789494142482e-06, + "loss": 0.6196, + "step": 9302 + }, + { + "epoch": 2.5770083102493073, + "grad_norm": 0.7333438992500305, + "learning_rate": 3.069434790901788e-06, + "loss": 0.575, + "step": 9303 + }, + { + "epoch": 2.577285318559557, + "grad_norm": 0.7455644011497498, + "learning_rate": 3.069080075570539e-06, + "loss": 0.5693, + "step": 9304 + }, + { + "epoch": 2.5775623268698062, + "grad_norm": 0.7772599458694458, + "learning_rate": 3.068725348156267e-06, + "loss": 0.5994, + "step": 9305 + }, + { + "epoch": 2.5778393351800553, + "grad_norm": 0.7441378831863403, + "learning_rate": 3.0683706086665038e-06, + "loss": 0.6199, + "step": 9306 + }, + { + "epoch": 2.5781163434903047, + "grad_norm": 0.7142274379730225, + "learning_rate": 3.0680158571087805e-06, + "loss": 0.6052, + "step": 9307 + }, + { + "epoch": 2.578393351800554, + "grad_norm": 0.7183087468147278, + "learning_rate": 3.06766109349063e-06, + "loss": 0.5608, + "step": 9308 + }, + { + "epoch": 2.578670360110803, + "grad_norm": 0.7334474325180054, + "learning_rate": 3.067306317819585e-06, + "loss": 0.5904, + "step": 9309 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.7224025726318359, + "learning_rate": 3.066951530103178e-06, + "loss": 0.5993, + "step": 9310 + }, + { + "epoch": 2.579224376731302, + "grad_norm": 0.7499139308929443, + "learning_rate": 3.0665967303489426e-06, + "loss": 0.6156, + "step": 9311 + }, + { + "epoch": 2.579501385041551, + "grad_norm": 0.7297726273536682, + "learning_rate": 3.0662419185644117e-06, + "loss": 0.6286, + "step": 9312 + }, + { + "epoch": 2.5797783933518006, + "grad_norm": 0.7476484775543213, + "learning_rate": 3.0658870947571184e-06, + "loss": 0.5896, + "step": 9313 + }, + { + "epoch": 2.58005540166205, + "grad_norm": 0.7497901916503906, + "learning_rate": 3.0655322589345963e-06, + "loss": 0.6288, + "step": 9314 + }, + { + "epoch": 2.580332409972299, + "grad_norm": 0.73838871717453, + "learning_rate": 3.0651774111043818e-06, + "loss": 0.5779, + "step": 9315 + }, + { + "epoch": 2.5806094182825485, + "grad_norm": 0.7194763422012329, + "learning_rate": 3.0648225512740067e-06, + "loss": 0.5829, + "step": 9316 + }, + { + "epoch": 2.580886426592798, + "grad_norm": 0.7514410614967346, + "learning_rate": 3.0644676794510063e-06, + "loss": 0.6221, + "step": 9317 + }, + { + "epoch": 2.581163434903047, + "grad_norm": 0.7325095534324646, + "learning_rate": 3.0641127956429157e-06, + "loss": 0.5811, + "step": 9318 + }, + { + "epoch": 2.5814404432132965, + "grad_norm": 0.7283647656440735, + "learning_rate": 3.06375789985727e-06, + "loss": 0.5841, + "step": 9319 + }, + { + "epoch": 2.581717451523546, + "grad_norm": 0.7078214287757874, + "learning_rate": 3.063402992101604e-06, + "loss": 0.6195, + "step": 9320 + }, + { + "epoch": 2.581994459833795, + "grad_norm": 0.7164523601531982, + "learning_rate": 3.063048072383454e-06, + "loss": 0.6172, + "step": 9321 + }, + { + "epoch": 2.5822714681440444, + "grad_norm": 0.7628939747810364, + "learning_rate": 3.0626931407103553e-06, + "loss": 0.6604, + "step": 9322 + }, + { + "epoch": 2.582548476454294, + "grad_norm": 0.7950290441513062, + "learning_rate": 3.0623381970898446e-06, + "loss": 0.6, + "step": 9323 + }, + { + "epoch": 2.582825484764543, + "grad_norm": 0.7289746403694153, + "learning_rate": 3.0619832415294576e-06, + "loss": 0.5875, + "step": 9324 + }, + { + "epoch": 2.5831024930747923, + "grad_norm": 0.7681859731674194, + "learning_rate": 3.061628274036732e-06, + "loss": 0.5503, + "step": 9325 + }, + { + "epoch": 2.583379501385042, + "grad_norm": 0.7553620338439941, + "learning_rate": 3.0612732946192032e-06, + "loss": 0.5748, + "step": 9326 + }, + { + "epoch": 2.583656509695291, + "grad_norm": 0.7522404789924622, + "learning_rate": 3.0609183032844093e-06, + "loss": 0.5743, + "step": 9327 + }, + { + "epoch": 2.5839335180055403, + "grad_norm": 0.740872859954834, + "learning_rate": 3.060563300039887e-06, + "loss": 0.5524, + "step": 9328 + }, + { + "epoch": 2.5842105263157897, + "grad_norm": 0.745161235332489, + "learning_rate": 3.0602082848931746e-06, + "loss": 0.6281, + "step": 9329 + }, + { + "epoch": 2.5844875346260388, + "grad_norm": 0.7245493531227112, + "learning_rate": 3.0598532578518097e-06, + "loss": 0.6164, + "step": 9330 + }, + { + "epoch": 2.584764542936288, + "grad_norm": 0.7285045981407166, + "learning_rate": 3.05949821892333e-06, + "loss": 0.5903, + "step": 9331 + }, + { + "epoch": 2.5850415512465377, + "grad_norm": 0.7477935552597046, + "learning_rate": 3.0591431681152756e-06, + "loss": 0.6193, + "step": 9332 + }, + { + "epoch": 2.5853185595567867, + "grad_norm": 0.7486900687217712, + "learning_rate": 3.058788105435182e-06, + "loss": 0.6019, + "step": 9333 + }, + { + "epoch": 2.585595567867036, + "grad_norm": 0.7068623900413513, + "learning_rate": 3.058433030890591e-06, + "loss": 0.5654, + "step": 9334 + }, + { + "epoch": 2.585872576177285, + "grad_norm": 0.7580755949020386, + "learning_rate": 3.058077944489041e-06, + "loss": 0.613, + "step": 9335 + }, + { + "epoch": 2.5861495844875346, + "grad_norm": 0.7915595173835754, + "learning_rate": 3.057722846238071e-06, + "loss": 0.6224, + "step": 9336 + }, + { + "epoch": 2.586426592797784, + "grad_norm": 0.772760272026062, + "learning_rate": 3.0573677361452204e-06, + "loss": 0.6323, + "step": 9337 + }, + { + "epoch": 2.586703601108033, + "grad_norm": 0.7564798593521118, + "learning_rate": 3.057012614218029e-06, + "loss": 0.5997, + "step": 9338 + }, + { + "epoch": 2.5869806094182826, + "grad_norm": 0.7556514143943787, + "learning_rate": 3.0566574804640375e-06, + "loss": 0.5907, + "step": 9339 + }, + { + "epoch": 2.587257617728532, + "grad_norm": 0.7544880509376526, + "learning_rate": 3.056302334890786e-06, + "loss": 0.5859, + "step": 9340 + }, + { + "epoch": 2.587534626038781, + "grad_norm": 0.8055458068847656, + "learning_rate": 3.0559471775058156e-06, + "loss": 0.6009, + "step": 9341 + }, + { + "epoch": 2.5878116343490305, + "grad_norm": 0.7201597690582275, + "learning_rate": 3.0555920083166673e-06, + "loss": 0.5794, + "step": 9342 + }, + { + "epoch": 2.5880886426592795, + "grad_norm": 0.7319862842559814, + "learning_rate": 3.0552368273308814e-06, + "loss": 0.5893, + "step": 9343 + }, + { + "epoch": 2.588365650969529, + "grad_norm": 0.7258013486862183, + "learning_rate": 3.0548816345559995e-06, + "loss": 0.5683, + "step": 9344 + }, + { + "epoch": 2.5886426592797784, + "grad_norm": 0.7262520790100098, + "learning_rate": 3.0545264299995643e-06, + "loss": 0.5766, + "step": 9345 + }, + { + "epoch": 2.5889196675900275, + "grad_norm": 0.7422440052032471, + "learning_rate": 3.0541712136691158e-06, + "loss": 0.6025, + "step": 9346 + }, + { + "epoch": 2.589196675900277, + "grad_norm": 0.7541458606719971, + "learning_rate": 3.0538159855721984e-06, + "loss": 0.5908, + "step": 9347 + }, + { + "epoch": 2.5894736842105264, + "grad_norm": 0.7304244637489319, + "learning_rate": 3.053460745716352e-06, + "loss": 0.581, + "step": 9348 + }, + { + "epoch": 2.5897506925207754, + "grad_norm": 0.7334722280502319, + "learning_rate": 3.0531054941091214e-06, + "loss": 0.5827, + "step": 9349 + }, + { + "epoch": 2.590027700831025, + "grad_norm": 0.7334140539169312, + "learning_rate": 3.0527502307580486e-06, + "loss": 0.6097, + "step": 9350 + }, + { + "epoch": 2.5903047091412743, + "grad_norm": 0.7113667130470276, + "learning_rate": 3.052394955670677e-06, + "loss": 0.6159, + "step": 9351 + }, + { + "epoch": 2.5905817174515233, + "grad_norm": 0.7234874367713928, + "learning_rate": 3.0520396688545496e-06, + "loss": 0.6341, + "step": 9352 + }, + { + "epoch": 2.590858725761773, + "grad_norm": 0.7299479842185974, + "learning_rate": 3.0516843703172107e-06, + "loss": 0.5955, + "step": 9353 + }, + { + "epoch": 2.5911357340720222, + "grad_norm": 0.7276083827018738, + "learning_rate": 3.051329060066203e-06, + "loss": 0.5695, + "step": 9354 + }, + { + "epoch": 2.5914127423822713, + "grad_norm": 0.6600974798202515, + "learning_rate": 3.050973738109072e-06, + "loss": 0.5695, + "step": 9355 + }, + { + "epoch": 2.5916897506925207, + "grad_norm": 0.7582259178161621, + "learning_rate": 3.050618404453361e-06, + "loss": 0.625, + "step": 9356 + }, + { + "epoch": 2.59196675900277, + "grad_norm": 0.8048502206802368, + "learning_rate": 3.050263059106615e-06, + "loss": 0.6124, + "step": 9357 + }, + { + "epoch": 2.592243767313019, + "grad_norm": 0.7244232296943665, + "learning_rate": 3.04990770207638e-06, + "loss": 0.5737, + "step": 9358 + }, + { + "epoch": 2.5925207756232687, + "grad_norm": 0.750846803188324, + "learning_rate": 3.0495523333701993e-06, + "loss": 0.5174, + "step": 9359 + }, + { + "epoch": 2.592797783933518, + "grad_norm": 0.7714466452598572, + "learning_rate": 3.0491969529956202e-06, + "loss": 0.593, + "step": 9360 + }, + { + "epoch": 2.593074792243767, + "grad_norm": 0.7113026976585388, + "learning_rate": 3.0488415609601863e-06, + "loss": 0.5794, + "step": 9361 + }, + { + "epoch": 2.5933518005540166, + "grad_norm": 0.7664023637771606, + "learning_rate": 3.0484861572714446e-06, + "loss": 0.6544, + "step": 9362 + }, + { + "epoch": 2.593628808864266, + "grad_norm": 0.7229735851287842, + "learning_rate": 3.048130741936942e-06, + "loss": 0.578, + "step": 9363 + }, + { + "epoch": 2.593905817174515, + "grad_norm": 0.7353859543800354, + "learning_rate": 3.0477753149642232e-06, + "loss": 0.6325, + "step": 9364 + }, + { + "epoch": 2.5941828254847645, + "grad_norm": 0.7365750670433044, + "learning_rate": 3.0474198763608347e-06, + "loss": 0.6022, + "step": 9365 + }, + { + "epoch": 2.594459833795014, + "grad_norm": 0.7197463512420654, + "learning_rate": 3.0470644261343262e-06, + "loss": 0.5467, + "step": 9366 + }, + { + "epoch": 2.594736842105263, + "grad_norm": 0.7440528273582458, + "learning_rate": 3.0467089642922415e-06, + "loss": 0.6267, + "step": 9367 + }, + { + "epoch": 2.5950138504155125, + "grad_norm": 0.7311400175094604, + "learning_rate": 3.04635349084213e-06, + "loss": 0.5595, + "step": 9368 + }, + { + "epoch": 2.595290858725762, + "grad_norm": 0.7559048533439636, + "learning_rate": 3.0459980057915383e-06, + "loss": 0.5791, + "step": 9369 + }, + { + "epoch": 2.595567867036011, + "grad_norm": 0.7741128206253052, + "learning_rate": 3.045642509148015e-06, + "loss": 0.6215, + "step": 9370 + }, + { + "epoch": 2.5958448753462604, + "grad_norm": 0.7354574799537659, + "learning_rate": 3.0452870009191067e-06, + "loss": 0.5868, + "step": 9371 + }, + { + "epoch": 2.59612188365651, + "grad_norm": 0.7495721578598022, + "learning_rate": 3.0449314811123636e-06, + "loss": 0.587, + "step": 9372 + }, + { + "epoch": 2.596398891966759, + "grad_norm": 0.7539669275283813, + "learning_rate": 3.044575949735334e-06, + "loss": 0.6182, + "step": 9373 + }, + { + "epoch": 2.5966759002770083, + "grad_norm": 0.736501157283783, + "learning_rate": 3.0442204067955657e-06, + "loss": 0.6197, + "step": 9374 + }, + { + "epoch": 2.596952908587258, + "grad_norm": 0.7483559250831604, + "learning_rate": 3.0438648523006088e-06, + "loss": 0.5887, + "step": 9375 + }, + { + "epoch": 2.597229916897507, + "grad_norm": 0.7232039570808411, + "learning_rate": 3.043509286258012e-06, + "loss": 0.579, + "step": 9376 + }, + { + "epoch": 2.5975069252077563, + "grad_norm": 0.7397487759590149, + "learning_rate": 3.0431537086753247e-06, + "loss": 0.5991, + "step": 9377 + }, + { + "epoch": 2.5977839335180057, + "grad_norm": 0.782886266708374, + "learning_rate": 3.0427981195600975e-06, + "loss": 0.6576, + "step": 9378 + }, + { + "epoch": 2.5980609418282548, + "grad_norm": 0.8020239472389221, + "learning_rate": 3.042442518919881e-06, + "loss": 0.5827, + "step": 9379 + }, + { + "epoch": 2.598337950138504, + "grad_norm": 0.6829942464828491, + "learning_rate": 3.0420869067622227e-06, + "loss": 0.5674, + "step": 9380 + }, + { + "epoch": 2.5986149584487537, + "grad_norm": 0.7796286940574646, + "learning_rate": 3.041731283094676e-06, + "loss": 0.6177, + "step": 9381 + }, + { + "epoch": 2.5988919667590027, + "grad_norm": 0.7237001657485962, + "learning_rate": 3.0413756479247915e-06, + "loss": 0.5829, + "step": 9382 + }, + { + "epoch": 2.599168975069252, + "grad_norm": 0.7349637746810913, + "learning_rate": 3.041020001260118e-06, + "loss": 0.6106, + "step": 9383 + }, + { + "epoch": 2.5994459833795016, + "grad_norm": 0.7296071648597717, + "learning_rate": 3.0406643431082088e-06, + "loss": 0.5722, + "step": 9384 + }, + { + "epoch": 2.5997229916897506, + "grad_norm": 0.7325433492660522, + "learning_rate": 3.0403086734766152e-06, + "loss": 0.5514, + "step": 9385 + }, + { + "epoch": 2.6, + "grad_norm": 0.7508443593978882, + "learning_rate": 3.0399529923728887e-06, + "loss": 0.5739, + "step": 9386 + }, + { + "epoch": 2.6002770083102495, + "grad_norm": 0.7678627371788025, + "learning_rate": 3.0395972998045802e-06, + "loss": 0.5893, + "step": 9387 + }, + { + "epoch": 2.6005540166204986, + "grad_norm": 0.9463872909545898, + "learning_rate": 3.0392415957792448e-06, + "loss": 0.5723, + "step": 9388 + }, + { + "epoch": 2.600831024930748, + "grad_norm": 0.7246379256248474, + "learning_rate": 3.038885880304432e-06, + "loss": 0.5698, + "step": 9389 + }, + { + "epoch": 2.6011080332409975, + "grad_norm": 0.7395390868186951, + "learning_rate": 3.0385301533876965e-06, + "loss": 0.6199, + "step": 9390 + }, + { + "epoch": 2.6013850415512465, + "grad_norm": 0.748449444770813, + "learning_rate": 3.0381744150365897e-06, + "loss": 0.6023, + "step": 9391 + }, + { + "epoch": 2.601662049861496, + "grad_norm": 0.7404671907424927, + "learning_rate": 3.0378186652586667e-06, + "loss": 0.5641, + "step": 9392 + }, + { + "epoch": 2.6019390581717454, + "grad_norm": 0.7181746363639832, + "learning_rate": 3.0374629040614788e-06, + "loss": 0.597, + "step": 9393 + }, + { + "epoch": 2.6022160664819944, + "grad_norm": 0.7396783232688904, + "learning_rate": 3.0371071314525822e-06, + "loss": 0.5861, + "step": 9394 + }, + { + "epoch": 2.602493074792244, + "grad_norm": 0.7946869730949402, + "learning_rate": 3.0367513474395293e-06, + "loss": 0.5751, + "step": 9395 + }, + { + "epoch": 2.602770083102493, + "grad_norm": 0.730797529220581, + "learning_rate": 3.0363955520298742e-06, + "loss": 0.604, + "step": 9396 + }, + { + "epoch": 2.6030470914127424, + "grad_norm": 0.7194575667381287, + "learning_rate": 3.0360397452311713e-06, + "loss": 0.5726, + "step": 9397 + }, + { + "epoch": 2.603324099722992, + "grad_norm": 0.727661669254303, + "learning_rate": 3.035683927050977e-06, + "loss": 0.6121, + "step": 9398 + }, + { + "epoch": 2.603601108033241, + "grad_norm": 0.7224810719490051, + "learning_rate": 3.0353280974968443e-06, + "loss": 0.618, + "step": 9399 + }, + { + "epoch": 2.6038781163434903, + "grad_norm": 0.7539339661598206, + "learning_rate": 3.034972256576328e-06, + "loss": 0.6185, + "step": 9400 + }, + { + "epoch": 2.6041551246537398, + "grad_norm": 0.7253374457359314, + "learning_rate": 3.034616404296986e-06, + "loss": 0.5817, + "step": 9401 + }, + { + "epoch": 2.604432132963989, + "grad_norm": 0.7376832365989685, + "learning_rate": 3.0342605406663717e-06, + "loss": 0.6031, + "step": 9402 + }, + { + "epoch": 2.6047091412742382, + "grad_norm": 0.7316452860832214, + "learning_rate": 3.033904665692042e-06, + "loss": 0.5959, + "step": 9403 + }, + { + "epoch": 2.6049861495844873, + "grad_norm": 0.717576265335083, + "learning_rate": 3.033548779381553e-06, + "loss": 0.5744, + "step": 9404 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.6863055229187012, + "learning_rate": 3.0331928817424603e-06, + "loss": 0.5895, + "step": 9405 + }, + { + "epoch": 2.605540166204986, + "grad_norm": 0.7601767778396606, + "learning_rate": 3.0328369727823216e-06, + "loss": 0.6063, + "step": 9406 + }, + { + "epoch": 2.605817174515235, + "grad_norm": 0.7255947589874268, + "learning_rate": 3.0324810525086933e-06, + "loss": 0.5959, + "step": 9407 + }, + { + "epoch": 2.6060941828254847, + "grad_norm": 0.6987281441688538, + "learning_rate": 3.0321251209291323e-06, + "loss": 0.5663, + "step": 9408 + }, + { + "epoch": 2.606371191135734, + "grad_norm": 0.7383803725242615, + "learning_rate": 3.031769178051196e-06, + "loss": 0.5965, + "step": 9409 + }, + { + "epoch": 2.606648199445983, + "grad_norm": 0.7562844753265381, + "learning_rate": 3.0314132238824416e-06, + "loss": 0.5968, + "step": 9410 + }, + { + "epoch": 2.6069252077562326, + "grad_norm": 0.7574225664138794, + "learning_rate": 3.031057258430429e-06, + "loss": 0.5961, + "step": 9411 + }, + { + "epoch": 2.607202216066482, + "grad_norm": 0.6951255798339844, + "learning_rate": 3.0307012817027124e-06, + "loss": 0.5847, + "step": 9412 + }, + { + "epoch": 2.607479224376731, + "grad_norm": 0.7327682971954346, + "learning_rate": 3.030345293706854e-06, + "loss": 0.5954, + "step": 9413 + }, + { + "epoch": 2.6077562326869805, + "grad_norm": 0.7225331664085388, + "learning_rate": 3.02998929445041e-06, + "loss": 0.5196, + "step": 9414 + }, + { + "epoch": 2.60803324099723, + "grad_norm": 0.7270516157150269, + "learning_rate": 3.02963328394094e-06, + "loss": 0.5885, + "step": 9415 + }, + { + "epoch": 2.608310249307479, + "grad_norm": 0.7316294312477112, + "learning_rate": 3.0292772621860023e-06, + "loss": 0.5801, + "step": 9416 + }, + { + "epoch": 2.6085872576177285, + "grad_norm": 0.76264488697052, + "learning_rate": 3.0289212291931576e-06, + "loss": 0.597, + "step": 9417 + }, + { + "epoch": 2.608864265927978, + "grad_norm": 0.7400818467140198, + "learning_rate": 3.028565184969964e-06, + "loss": 0.6227, + "step": 9418 + }, + { + "epoch": 2.609141274238227, + "grad_norm": 0.7234652042388916, + "learning_rate": 3.0282091295239814e-06, + "loss": 0.56, + "step": 9419 + }, + { + "epoch": 2.6094182825484764, + "grad_norm": 0.75355064868927, + "learning_rate": 3.0278530628627713e-06, + "loss": 0.5878, + "step": 9420 + }, + { + "epoch": 2.609695290858726, + "grad_norm": 0.6999250054359436, + "learning_rate": 3.0274969849938914e-06, + "loss": 0.5948, + "step": 9421 + }, + { + "epoch": 2.609972299168975, + "grad_norm": 0.7420984506607056, + "learning_rate": 3.0271408959249037e-06, + "loss": 0.5955, + "step": 9422 + }, + { + "epoch": 2.6102493074792243, + "grad_norm": 0.7665450572967529, + "learning_rate": 3.026784795663369e-06, + "loss": 0.6236, + "step": 9423 + }, + { + "epoch": 2.610526315789474, + "grad_norm": 0.7386185526847839, + "learning_rate": 3.026428684216848e-06, + "loss": 0.6541, + "step": 9424 + }, + { + "epoch": 2.610803324099723, + "grad_norm": 0.7392768263816833, + "learning_rate": 3.0260725615929e-06, + "loss": 0.62, + "step": 9425 + }, + { + "epoch": 2.6110803324099723, + "grad_norm": 0.7245485186576843, + "learning_rate": 3.02571642779909e-06, + "loss": 0.6487, + "step": 9426 + }, + { + "epoch": 2.6113573407202217, + "grad_norm": 0.7127525806427002, + "learning_rate": 3.0253602828429774e-06, + "loss": 0.5901, + "step": 9427 + }, + { + "epoch": 2.6116343490304708, + "grad_norm": 0.7617471814155579, + "learning_rate": 3.0250041267321234e-06, + "loss": 0.6056, + "step": 9428 + }, + { + "epoch": 2.61191135734072, + "grad_norm": 0.7574014067649841, + "learning_rate": 3.0246479594740918e-06, + "loss": 0.6256, + "step": 9429 + }, + { + "epoch": 2.6121883656509697, + "grad_norm": 0.6903671026229858, + "learning_rate": 3.0242917810764444e-06, + "loss": 0.5489, + "step": 9430 + }, + { + "epoch": 2.6124653739612187, + "grad_norm": 0.6944217085838318, + "learning_rate": 3.0239355915467433e-06, + "loss": 0.5895, + "step": 9431 + }, + { + "epoch": 2.612742382271468, + "grad_norm": 0.7788039445877075, + "learning_rate": 3.0235793908925514e-06, + "loss": 0.5921, + "step": 9432 + }, + { + "epoch": 2.6130193905817176, + "grad_norm": 0.7304327487945557, + "learning_rate": 3.023223179121433e-06, + "loss": 0.5859, + "step": 9433 + }, + { + "epoch": 2.6132963988919666, + "grad_norm": 0.7489771842956543, + "learning_rate": 3.022866956240949e-06, + "loss": 0.5972, + "step": 9434 + }, + { + "epoch": 2.613573407202216, + "grad_norm": 0.7207261323928833, + "learning_rate": 3.022510722258665e-06, + "loss": 0.627, + "step": 9435 + }, + { + "epoch": 2.6138504155124656, + "grad_norm": 0.7087815403938293, + "learning_rate": 3.022154477182144e-06, + "loss": 0.5543, + "step": 9436 + }, + { + "epoch": 2.6141274238227146, + "grad_norm": 0.7843639254570007, + "learning_rate": 3.0217982210189496e-06, + "loss": 0.5903, + "step": 9437 + }, + { + "epoch": 2.614404432132964, + "grad_norm": 0.7218259572982788, + "learning_rate": 3.021441953776646e-06, + "loss": 0.5908, + "step": 9438 + }, + { + "epoch": 2.6146814404432135, + "grad_norm": 0.7581648230552673, + "learning_rate": 3.0210856754627993e-06, + "loss": 0.5855, + "step": 9439 + }, + { + "epoch": 2.6149584487534625, + "grad_norm": 0.752878725528717, + "learning_rate": 3.0207293860849725e-06, + "loss": 0.5708, + "step": 9440 + }, + { + "epoch": 2.615235457063712, + "grad_norm": 0.6773368716239929, + "learning_rate": 3.02037308565073e-06, + "loss": 0.5578, + "step": 9441 + }, + { + "epoch": 2.6155124653739614, + "grad_norm": 0.875789225101471, + "learning_rate": 3.02001677416764e-06, + "loss": 0.5907, + "step": 9442 + }, + { + "epoch": 2.6157894736842104, + "grad_norm": 0.7268041372299194, + "learning_rate": 3.0196604516432646e-06, + "loss": 0.6047, + "step": 9443 + }, + { + "epoch": 2.61606648199446, + "grad_norm": 0.7566415667533875, + "learning_rate": 3.0193041180851712e-06, + "loss": 0.5801, + "step": 9444 + }, + { + "epoch": 2.6163434903047094, + "grad_norm": 0.6960709095001221, + "learning_rate": 3.0189477735009255e-06, + "loss": 0.562, + "step": 9445 + }, + { + "epoch": 2.6166204986149584, + "grad_norm": 0.7445232272148132, + "learning_rate": 3.0185914178980924e-06, + "loss": 0.6094, + "step": 9446 + }, + { + "epoch": 2.616897506925208, + "grad_norm": 0.7602598667144775, + "learning_rate": 3.0182350512842396e-06, + "loss": 0.6036, + "step": 9447 + }, + { + "epoch": 2.6171745152354573, + "grad_norm": 0.7294239401817322, + "learning_rate": 3.017878673666933e-06, + "loss": 0.575, + "step": 9448 + }, + { + "epoch": 2.6174515235457063, + "grad_norm": 0.7791168689727783, + "learning_rate": 3.01752228505374e-06, + "loss": 0.6117, + "step": 9449 + }, + { + "epoch": 2.6177285318559558, + "grad_norm": 0.7321231365203857, + "learning_rate": 3.0171658854522274e-06, + "loss": 0.582, + "step": 9450 + }, + { + "epoch": 2.6180055401662052, + "grad_norm": 0.7250687479972839, + "learning_rate": 3.0168094748699616e-06, + "loss": 0.5873, + "step": 9451 + }, + { + "epoch": 2.6182825484764543, + "grad_norm": 0.7413502931594849, + "learning_rate": 3.0164530533145128e-06, + "loss": 0.6043, + "step": 9452 + }, + { + "epoch": 2.6185595567867037, + "grad_norm": 0.6866628527641296, + "learning_rate": 3.0160966207934444e-06, + "loss": 0.5894, + "step": 9453 + }, + { + "epoch": 2.618836565096953, + "grad_norm": 0.7689675092697144, + "learning_rate": 3.015740177314328e-06, + "loss": 0.6129, + "step": 9454 + }, + { + "epoch": 2.619113573407202, + "grad_norm": 0.7070875763893127, + "learning_rate": 3.0153837228847306e-06, + "loss": 0.5956, + "step": 9455 + }, + { + "epoch": 2.6193905817174516, + "grad_norm": 0.7495928406715393, + "learning_rate": 3.01502725751222e-06, + "loss": 0.5748, + "step": 9456 + }, + { + "epoch": 2.6196675900277007, + "grad_norm": 0.7458643913269043, + "learning_rate": 3.0146707812043657e-06, + "loss": 0.6489, + "step": 9457 + }, + { + "epoch": 2.61994459833795, + "grad_norm": 0.7419530153274536, + "learning_rate": 3.014314293968737e-06, + "loss": 0.6332, + "step": 9458 + }, + { + "epoch": 2.6202216066481996, + "grad_norm": 0.6879041194915771, + "learning_rate": 3.013957795812902e-06, + "loss": 0.5733, + "step": 9459 + }, + { + "epoch": 2.6204986149584486, + "grad_norm": 0.7298100590705872, + "learning_rate": 3.0136012867444298e-06, + "loss": 0.6445, + "step": 9460 + }, + { + "epoch": 2.620775623268698, + "grad_norm": 0.7113451957702637, + "learning_rate": 3.013244766770892e-06, + "loss": 0.581, + "step": 9461 + }, + { + "epoch": 2.6210526315789475, + "grad_norm": 0.7236753702163696, + "learning_rate": 3.0128882358998562e-06, + "loss": 0.5927, + "step": 9462 + }, + { + "epoch": 2.6213296398891965, + "grad_norm": 0.7375840544700623, + "learning_rate": 3.012531694138894e-06, + "loss": 0.6359, + "step": 9463 + }, + { + "epoch": 2.621606648199446, + "grad_norm": 0.7284663915634155, + "learning_rate": 3.012175141495574e-06, + "loss": 0.581, + "step": 9464 + }, + { + "epoch": 2.621883656509695, + "grad_norm": 0.7280388474464417, + "learning_rate": 3.0118185779774693e-06, + "loss": 0.6284, + "step": 9465 + }, + { + "epoch": 2.6221606648199445, + "grad_norm": 0.6828910708427429, + "learning_rate": 3.011462003592148e-06, + "loss": 0.5252, + "step": 9466 + }, + { + "epoch": 2.622437673130194, + "grad_norm": 0.7209635972976685, + "learning_rate": 3.0111054183471825e-06, + "loss": 0.6168, + "step": 9467 + }, + { + "epoch": 2.622714681440443, + "grad_norm": 0.7128496766090393, + "learning_rate": 3.010748822250144e-06, + "loss": 0.5724, + "step": 9468 + }, + { + "epoch": 2.6229916897506924, + "grad_norm": 0.7473496198654175, + "learning_rate": 3.010392215308604e-06, + "loss": 0.6042, + "step": 9469 + }, + { + "epoch": 2.623268698060942, + "grad_norm": 0.7299935817718506, + "learning_rate": 3.0100355975301328e-06, + "loss": 0.529, + "step": 9470 + }, + { + "epoch": 2.623545706371191, + "grad_norm": 0.7437604069709778, + "learning_rate": 3.009678968922304e-06, + "loss": 0.5975, + "step": 9471 + }, + { + "epoch": 2.6238227146814403, + "grad_norm": 0.7657999396324158, + "learning_rate": 3.009322329492689e-06, + "loss": 0.6125, + "step": 9472 + }, + { + "epoch": 2.62409972299169, + "grad_norm": 0.7673649191856384, + "learning_rate": 3.0089656792488604e-06, + "loss": 0.6358, + "step": 9473 + }, + { + "epoch": 2.624376731301939, + "grad_norm": 0.7584307789802551, + "learning_rate": 3.0086090181983914e-06, + "loss": 0.5712, + "step": 9474 + }, + { + "epoch": 2.6246537396121883, + "grad_norm": 0.7290375232696533, + "learning_rate": 3.008252346348853e-06, + "loss": 0.5979, + "step": 9475 + }, + { + "epoch": 2.6249307479224377, + "grad_norm": 0.7436773777008057, + "learning_rate": 3.0078956637078193e-06, + "loss": 0.6078, + "step": 9476 + }, + { + "epoch": 2.6252077562326868, + "grad_norm": 0.7523717880249023, + "learning_rate": 3.0075389702828647e-06, + "loss": 0.6113, + "step": 9477 + }, + { + "epoch": 2.6254847645429362, + "grad_norm": 0.722774863243103, + "learning_rate": 3.007182266081561e-06, + "loss": 0.6151, + "step": 9478 + }, + { + "epoch": 2.6257617728531857, + "grad_norm": 0.7217817902565002, + "learning_rate": 3.006825551111482e-06, + "loss": 0.5692, + "step": 9479 + }, + { + "epoch": 2.6260387811634347, + "grad_norm": 0.7214882373809814, + "learning_rate": 3.006468825380203e-06, + "loss": 0.5631, + "step": 9480 + }, + { + "epoch": 2.626315789473684, + "grad_norm": 0.7373204827308655, + "learning_rate": 3.006112088895297e-06, + "loss": 0.5715, + "step": 9481 + }, + { + "epoch": 2.6265927977839336, + "grad_norm": 0.7106059789657593, + "learning_rate": 3.005755341664339e-06, + "loss": 0.5559, + "step": 9482 + }, + { + "epoch": 2.6268698060941826, + "grad_norm": 0.7502145767211914, + "learning_rate": 3.005398583694904e-06, + "loss": 0.5861, + "step": 9483 + }, + { + "epoch": 2.627146814404432, + "grad_norm": 0.7245887517929077, + "learning_rate": 3.005041814994566e-06, + "loss": 0.5801, + "step": 9484 + }, + { + "epoch": 2.6274238227146816, + "grad_norm": 0.7328919172286987, + "learning_rate": 3.0046850355709005e-06, + "loss": 0.5702, + "step": 9485 + }, + { + "epoch": 2.6277008310249306, + "grad_norm": 0.7673099040985107, + "learning_rate": 3.0043282454314824e-06, + "loss": 0.5663, + "step": 9486 + }, + { + "epoch": 2.62797783933518, + "grad_norm": 0.7004749774932861, + "learning_rate": 3.0039714445838896e-06, + "loss": 0.5603, + "step": 9487 + }, + { + "epoch": 2.6282548476454295, + "grad_norm": 0.7447711229324341, + "learning_rate": 3.003614633035694e-06, + "loss": 0.6117, + "step": 9488 + }, + { + "epoch": 2.6285318559556785, + "grad_norm": 0.7462581992149353, + "learning_rate": 3.0032578107944747e-06, + "loss": 0.6043, + "step": 9489 + }, + { + "epoch": 2.628808864265928, + "grad_norm": 0.7413774132728577, + "learning_rate": 3.002900977867807e-06, + "loss": 0.5636, + "step": 9490 + }, + { + "epoch": 2.6290858725761774, + "grad_norm": 0.7752082347869873, + "learning_rate": 3.0025441342632666e-06, + "loss": 0.595, + "step": 9491 + }, + { + "epoch": 2.6293628808864264, + "grad_norm": 0.7633326053619385, + "learning_rate": 3.002187279988431e-06, + "loss": 0.5867, + "step": 9492 + }, + { + "epoch": 2.629639889196676, + "grad_norm": 0.714028000831604, + "learning_rate": 3.001830415050877e-06, + "loss": 0.5907, + "step": 9493 + }, + { + "epoch": 2.6299168975069254, + "grad_norm": 0.7516353726387024, + "learning_rate": 3.0014735394581824e-06, + "loss": 0.5741, + "step": 9494 + }, + { + "epoch": 2.6301939058171744, + "grad_norm": 0.7380667924880981, + "learning_rate": 3.0011166532179235e-06, + "loss": 0.6013, + "step": 9495 + }, + { + "epoch": 2.630470914127424, + "grad_norm": 0.7714182138442993, + "learning_rate": 3.0007597563376784e-06, + "loss": 0.6277, + "step": 9496 + }, + { + "epoch": 2.6307479224376733, + "grad_norm": 0.7297613024711609, + "learning_rate": 3.0004028488250248e-06, + "loss": 0.5951, + "step": 9497 + }, + { + "epoch": 2.6310249307479223, + "grad_norm": 0.6872463822364807, + "learning_rate": 3.0000459306875405e-06, + "loss": 0.5985, + "step": 9498 + }, + { + "epoch": 2.631301939058172, + "grad_norm": 0.7453889846801758, + "learning_rate": 2.9996890019328044e-06, + "loss": 0.6077, + "step": 9499 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.7232484817504883, + "learning_rate": 2.9993320625683954e-06, + "loss": 0.6089, + "step": 9500 + }, + { + "epoch": 2.6318559556786703, + "grad_norm": 0.7161245942115784, + "learning_rate": 2.99897511260189e-06, + "loss": 0.5707, + "step": 9501 + }, + { + "epoch": 2.6321329639889197, + "grad_norm": 0.7165181040763855, + "learning_rate": 2.9986181520408695e-06, + "loss": 0.6155, + "step": 9502 + }, + { + "epoch": 2.632409972299169, + "grad_norm": 0.7250982522964478, + "learning_rate": 2.998261180892913e-06, + "loss": 0.6123, + "step": 9503 + }, + { + "epoch": 2.632686980609418, + "grad_norm": 0.6966186761856079, + "learning_rate": 2.997904199165599e-06, + "loss": 0.5974, + "step": 9504 + }, + { + "epoch": 2.6329639889196677, + "grad_norm": 0.7354041934013367, + "learning_rate": 2.9975472068665063e-06, + "loss": 0.6189, + "step": 9505 + }, + { + "epoch": 2.633240997229917, + "grad_norm": 0.7224618792533875, + "learning_rate": 2.997190204003217e-06, + "loss": 0.5636, + "step": 9506 + }, + { + "epoch": 2.633518005540166, + "grad_norm": 0.7574952840805054, + "learning_rate": 2.9968331905833094e-06, + "loss": 0.5655, + "step": 9507 + }, + { + "epoch": 2.6337950138504156, + "grad_norm": 0.720216691493988, + "learning_rate": 2.9964761666143638e-06, + "loss": 0.5666, + "step": 9508 + }, + { + "epoch": 2.634072022160665, + "grad_norm": 0.7723554968833923, + "learning_rate": 2.9961191321039622e-06, + "loss": 0.6007, + "step": 9509 + }, + { + "epoch": 2.634349030470914, + "grad_norm": 0.709218442440033, + "learning_rate": 2.9957620870596845e-06, + "loss": 0.6237, + "step": 9510 + }, + { + "epoch": 2.6346260387811635, + "grad_norm": 0.7179205417633057, + "learning_rate": 2.995405031489111e-06, + "loss": 0.6205, + "step": 9511 + }, + { + "epoch": 2.634903047091413, + "grad_norm": 0.6865183711051941, + "learning_rate": 2.9950479653998245e-06, + "loss": 0.6088, + "step": 9512 + }, + { + "epoch": 2.635180055401662, + "grad_norm": 0.6995043158531189, + "learning_rate": 2.9946908887994046e-06, + "loss": 0.5564, + "step": 9513 + }, + { + "epoch": 2.6354570637119115, + "grad_norm": 0.7671514749526978, + "learning_rate": 2.994333801695434e-06, + "loss": 0.6239, + "step": 9514 + }, + { + "epoch": 2.635734072022161, + "grad_norm": 0.7148752808570862, + "learning_rate": 2.9939767040954943e-06, + "loss": 0.616, + "step": 9515 + }, + { + "epoch": 2.63601108033241, + "grad_norm": 0.7045048475265503, + "learning_rate": 2.993619596007168e-06, + "loss": 0.5905, + "step": 9516 + }, + { + "epoch": 2.6362880886426594, + "grad_norm": 0.7336992621421814, + "learning_rate": 2.9932624774380375e-06, + "loss": 0.61, + "step": 9517 + }, + { + "epoch": 2.636565096952909, + "grad_norm": 0.7365102767944336, + "learning_rate": 2.992905348395684e-06, + "loss": 0.6127, + "step": 9518 + }, + { + "epoch": 2.636842105263158, + "grad_norm": 0.7184882760047913, + "learning_rate": 2.9925482088876923e-06, + "loss": 0.5515, + "step": 9519 + }, + { + "epoch": 2.6371191135734073, + "grad_norm": 0.7389319539070129, + "learning_rate": 2.992191058921643e-06, + "loss": 0.6074, + "step": 9520 + }, + { + "epoch": 2.6373961218836564, + "grad_norm": 0.7419624924659729, + "learning_rate": 2.9918338985051216e-06, + "loss": 0.5779, + "step": 9521 + }, + { + "epoch": 2.637673130193906, + "grad_norm": 0.681654155254364, + "learning_rate": 2.99147672764571e-06, + "loss": 0.5927, + "step": 9522 + }, + { + "epoch": 2.6379501385041553, + "grad_norm": 0.7966417670249939, + "learning_rate": 2.9911195463509935e-06, + "loss": 0.6108, + "step": 9523 + }, + { + "epoch": 2.6382271468144043, + "grad_norm": 0.7548315525054932, + "learning_rate": 2.990762354628553e-06, + "loss": 0.568, + "step": 9524 + }, + { + "epoch": 2.6385041551246537, + "grad_norm": 0.7525621652603149, + "learning_rate": 2.990405152485976e-06, + "loss": 0.6276, + "step": 9525 + }, + { + "epoch": 2.638781163434903, + "grad_norm": 0.7998759150505066, + "learning_rate": 2.9900479399308444e-06, + "loss": 0.5655, + "step": 9526 + }, + { + "epoch": 2.6390581717451522, + "grad_norm": 0.7348849773406982, + "learning_rate": 2.9896907169707435e-06, + "loss": 0.604, + "step": 9527 + }, + { + "epoch": 2.6393351800554017, + "grad_norm": 0.7512754797935486, + "learning_rate": 2.9893334836132594e-06, + "loss": 0.5545, + "step": 9528 + }, + { + "epoch": 2.6396121883656507, + "grad_norm": 0.7238155007362366, + "learning_rate": 2.9889762398659743e-06, + "loss": 0.5913, + "step": 9529 + }, + { + "epoch": 2.6398891966759, + "grad_norm": 0.7376219630241394, + "learning_rate": 2.9886189857364755e-06, + "loss": 0.6672, + "step": 9530 + }, + { + "epoch": 2.6401662049861496, + "grad_norm": 0.767388641834259, + "learning_rate": 2.9882617212323485e-06, + "loss": 0.5675, + "step": 9531 + }, + { + "epoch": 2.6404432132963986, + "grad_norm": 0.7074344754219055, + "learning_rate": 2.9879044463611774e-06, + "loss": 0.5669, + "step": 9532 + }, + { + "epoch": 2.640720221606648, + "grad_norm": 0.6814509630203247, + "learning_rate": 2.9875471611305486e-06, + "loss": 0.5579, + "step": 9533 + }, + { + "epoch": 2.6409972299168976, + "grad_norm": 0.7133084535598755, + "learning_rate": 2.9871898655480493e-06, + "loss": 0.5783, + "step": 9534 + }, + { + "epoch": 2.6412742382271466, + "grad_norm": 0.7370125651359558, + "learning_rate": 2.9868325596212648e-06, + "loss": 0.584, + "step": 9535 + }, + { + "epoch": 2.641551246537396, + "grad_norm": 0.7439171671867371, + "learning_rate": 2.986475243357782e-06, + "loss": 0.5746, + "step": 9536 + }, + { + "epoch": 2.6418282548476455, + "grad_norm": 0.7310712933540344, + "learning_rate": 2.9861179167651866e-06, + "loss": 0.6293, + "step": 9537 + }, + { + "epoch": 2.6421052631578945, + "grad_norm": 0.7177421450614929, + "learning_rate": 2.985760579851068e-06, + "loss": 0.6244, + "step": 9538 + }, + { + "epoch": 2.642382271468144, + "grad_norm": 0.7210702896118164, + "learning_rate": 2.9854032326230102e-06, + "loss": 0.6147, + "step": 9539 + }, + { + "epoch": 2.6426592797783934, + "grad_norm": 0.7226083278656006, + "learning_rate": 2.9850458750886025e-06, + "loss": 0.5809, + "step": 9540 + }, + { + "epoch": 2.6429362880886424, + "grad_norm": 0.7365908026695251, + "learning_rate": 2.9846885072554333e-06, + "loss": 0.5798, + "step": 9541 + }, + { + "epoch": 2.643213296398892, + "grad_norm": 0.6953509449958801, + "learning_rate": 2.984331129131088e-06, + "loss": 0.5839, + "step": 9542 + }, + { + "epoch": 2.6434903047091414, + "grad_norm": 0.7429455518722534, + "learning_rate": 2.983973740723157e-06, + "loss": 0.5836, + "step": 9543 + }, + { + "epoch": 2.6437673130193904, + "grad_norm": 0.6981019377708435, + "learning_rate": 2.9836163420392273e-06, + "loss": 0.5913, + "step": 9544 + }, + { + "epoch": 2.64404432132964, + "grad_norm": 0.7662050724029541, + "learning_rate": 2.9832589330868877e-06, + "loss": 0.5759, + "step": 9545 + }, + { + "epoch": 2.6443213296398893, + "grad_norm": 0.6959735155105591, + "learning_rate": 2.982901513873726e-06, + "loss": 0.6031, + "step": 9546 + }, + { + "epoch": 2.6445983379501383, + "grad_norm": 0.7445115447044373, + "learning_rate": 2.982544084407333e-06, + "loss": 0.6363, + "step": 9547 + }, + { + "epoch": 2.644875346260388, + "grad_norm": 0.7081512212753296, + "learning_rate": 2.982186644695296e-06, + "loss": 0.5881, + "step": 9548 + }, + { + "epoch": 2.6451523545706372, + "grad_norm": 0.7388466000556946, + "learning_rate": 2.9818291947452062e-06, + "loss": 0.5806, + "step": 9549 + }, + { + "epoch": 2.6454293628808863, + "grad_norm": 0.6682338714599609, + "learning_rate": 2.9814717345646516e-06, + "loss": 0.5682, + "step": 9550 + }, + { + "epoch": 2.6457063711911357, + "grad_norm": 0.7200564742088318, + "learning_rate": 2.9811142641612223e-06, + "loss": 0.5747, + "step": 9551 + }, + { + "epoch": 2.645983379501385, + "grad_norm": 0.7652223110198975, + "learning_rate": 2.9807567835425085e-06, + "loss": 0.6063, + "step": 9552 + }, + { + "epoch": 2.646260387811634, + "grad_norm": 0.7285156846046448, + "learning_rate": 2.9803992927161e-06, + "loss": 0.5795, + "step": 9553 + }, + { + "epoch": 2.6465373961218837, + "grad_norm": 0.7372853755950928, + "learning_rate": 2.9800417916895885e-06, + "loss": 0.5766, + "step": 9554 + }, + { + "epoch": 2.646814404432133, + "grad_norm": 0.7120565176010132, + "learning_rate": 2.979684280470564e-06, + "loss": 0.5672, + "step": 9555 + }, + { + "epoch": 2.647091412742382, + "grad_norm": 0.7326462268829346, + "learning_rate": 2.9793267590666164e-06, + "loss": 0.5471, + "step": 9556 + }, + { + "epoch": 2.6473684210526316, + "grad_norm": 0.7396538853645325, + "learning_rate": 2.978969227485339e-06, + "loss": 0.5931, + "step": 9557 + }, + { + "epoch": 2.647645429362881, + "grad_norm": 0.7358024716377258, + "learning_rate": 2.9786116857343205e-06, + "loss": 0.618, + "step": 9558 + }, + { + "epoch": 2.64792243767313, + "grad_norm": 0.7099702954292297, + "learning_rate": 2.9782541338211534e-06, + "loss": 0.601, + "step": 9559 + }, + { + "epoch": 2.6481994459833795, + "grad_norm": 0.7943007349967957, + "learning_rate": 2.9778965717534314e-06, + "loss": 0.5557, + "step": 9560 + }, + { + "epoch": 2.648476454293629, + "grad_norm": 0.7590210437774658, + "learning_rate": 2.9775389995387434e-06, + "loss": 0.6195, + "step": 9561 + }, + { + "epoch": 2.648753462603878, + "grad_norm": 0.742610514163971, + "learning_rate": 2.9771814171846836e-06, + "loss": 0.5486, + "step": 9562 + }, + { + "epoch": 2.6490304709141275, + "grad_norm": 0.7486650347709656, + "learning_rate": 2.9768238246988433e-06, + "loss": 0.5742, + "step": 9563 + }, + { + "epoch": 2.649307479224377, + "grad_norm": 0.7409623861312866, + "learning_rate": 2.976466222088816e-06, + "loss": 0.5885, + "step": 9564 + }, + { + "epoch": 2.649584487534626, + "grad_norm": 0.7396969199180603, + "learning_rate": 2.9761086093621934e-06, + "loss": 0.6073, + "step": 9565 + }, + { + "epoch": 2.6498614958448754, + "grad_norm": 0.700952410697937, + "learning_rate": 2.9757509865265693e-06, + "loss": 0.588, + "step": 9566 + }, + { + "epoch": 2.650138504155125, + "grad_norm": 0.7877334356307983, + "learning_rate": 2.9753933535895373e-06, + "loss": 0.6178, + "step": 9567 + }, + { + "epoch": 2.650415512465374, + "grad_norm": 0.7633309960365295, + "learning_rate": 2.97503571055869e-06, + "loss": 0.6016, + "step": 9568 + }, + { + "epoch": 2.6506925207756233, + "grad_norm": 0.7346745729446411, + "learning_rate": 2.9746780574416214e-06, + "loss": 0.5777, + "step": 9569 + }, + { + "epoch": 2.650969529085873, + "grad_norm": 0.6854615807533264, + "learning_rate": 2.974320394245925e-06, + "loss": 0.6255, + "step": 9570 + }, + { + "epoch": 2.651246537396122, + "grad_norm": 0.743980348110199, + "learning_rate": 2.9739627209791965e-06, + "loss": 0.5989, + "step": 9571 + }, + { + "epoch": 2.6515235457063713, + "grad_norm": 0.6915601491928101, + "learning_rate": 2.973605037649029e-06, + "loss": 0.5674, + "step": 9572 + }, + { + "epoch": 2.6518005540166207, + "grad_norm": 0.7698721885681152, + "learning_rate": 2.973247344263016e-06, + "loss": 0.6151, + "step": 9573 + }, + { + "epoch": 2.6520775623268698, + "grad_norm": 0.7377426028251648, + "learning_rate": 2.972889640828754e-06, + "loss": 0.6186, + "step": 9574 + }, + { + "epoch": 2.652354570637119, + "grad_norm": 0.7789843082427979, + "learning_rate": 2.9725319273538372e-06, + "loss": 0.5894, + "step": 9575 + }, + { + "epoch": 2.6526315789473687, + "grad_norm": 0.7466744184494019, + "learning_rate": 2.9721742038458613e-06, + "loss": 0.6251, + "step": 9576 + }, + { + "epoch": 2.6529085872576177, + "grad_norm": 0.708580493927002, + "learning_rate": 2.971816470312421e-06, + "loss": 0.6142, + "step": 9577 + }, + { + "epoch": 2.653185595567867, + "grad_norm": 0.7652707695960999, + "learning_rate": 2.971458726761111e-06, + "loss": 0.611, + "step": 9578 + }, + { + "epoch": 2.6534626038781166, + "grad_norm": 0.7291722893714905, + "learning_rate": 2.97110097319953e-06, + "loss": 0.6017, + "step": 9579 + }, + { + "epoch": 2.6537396121883656, + "grad_norm": 0.7659996747970581, + "learning_rate": 2.970743209635271e-06, + "loss": 0.5777, + "step": 9580 + }, + { + "epoch": 2.654016620498615, + "grad_norm": 0.7659127116203308, + "learning_rate": 2.9703854360759328e-06, + "loss": 0.5903, + "step": 9581 + }, + { + "epoch": 2.654293628808864, + "grad_norm": 0.7328753471374512, + "learning_rate": 2.9700276525291096e-06, + "loss": 0.5977, + "step": 9582 + }, + { + "epoch": 2.6545706371191136, + "grad_norm": 0.7215525507926941, + "learning_rate": 2.9696698590023985e-06, + "loss": 0.5738, + "step": 9583 + }, + { + "epoch": 2.654847645429363, + "grad_norm": 0.7107459902763367, + "learning_rate": 2.969312055503398e-06, + "loss": 0.6168, + "step": 9584 + }, + { + "epoch": 2.655124653739612, + "grad_norm": 0.6944848299026489, + "learning_rate": 2.968954242039704e-06, + "loss": 0.591, + "step": 9585 + }, + { + "epoch": 2.6554016620498615, + "grad_norm": 0.7375234961509705, + "learning_rate": 2.968596418618913e-06, + "loss": 0.5599, + "step": 9586 + }, + { + "epoch": 2.655678670360111, + "grad_norm": 0.7887413501739502, + "learning_rate": 2.968238585248624e-06, + "loss": 0.5592, + "step": 9587 + }, + { + "epoch": 2.65595567867036, + "grad_norm": 0.7620272040367126, + "learning_rate": 2.9678807419364342e-06, + "loss": 0.6038, + "step": 9588 + }, + { + "epoch": 2.6562326869806094, + "grad_norm": 0.7225857377052307, + "learning_rate": 2.967522888689941e-06, + "loss": 0.6041, + "step": 9589 + }, + { + "epoch": 2.6565096952908585, + "grad_norm": 0.700484037399292, + "learning_rate": 2.9671650255167433e-06, + "loss": 0.5717, + "step": 9590 + }, + { + "epoch": 2.656786703601108, + "grad_norm": 0.7240809798240662, + "learning_rate": 2.9668071524244387e-06, + "loss": 0.5635, + "step": 9591 + }, + { + "epoch": 2.6570637119113574, + "grad_norm": 0.7192869782447815, + "learning_rate": 2.9664492694206272e-06, + "loss": 0.6109, + "step": 9592 + }, + { + "epoch": 2.6573407202216064, + "grad_norm": 0.7375945448875427, + "learning_rate": 2.966091376512905e-06, + "loss": 0.6293, + "step": 9593 + }, + { + "epoch": 2.657617728531856, + "grad_norm": 0.7899526953697205, + "learning_rate": 2.9657334737088734e-06, + "loss": 0.613, + "step": 9594 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.7093481421470642, + "learning_rate": 2.965375561016131e-06, + "loss": 0.562, + "step": 9595 + }, + { + "epoch": 2.6581717451523543, + "grad_norm": 0.7345967292785645, + "learning_rate": 2.9650176384422764e-06, + "loss": 0.5783, + "step": 9596 + }, + { + "epoch": 2.658448753462604, + "grad_norm": 0.7613131403923035, + "learning_rate": 2.96465970599491e-06, + "loss": 0.6016, + "step": 9597 + }, + { + "epoch": 2.6587257617728532, + "grad_norm": 0.7190057635307312, + "learning_rate": 2.964301763681632e-06, + "loss": 0.5851, + "step": 9598 + }, + { + "epoch": 2.6590027700831023, + "grad_norm": 0.7403254508972168, + "learning_rate": 2.9639438115100416e-06, + "loss": 0.577, + "step": 9599 + }, + { + "epoch": 2.6592797783933517, + "grad_norm": 0.7889108061790466, + "learning_rate": 2.9635858494877386e-06, + "loss": 0.6024, + "step": 9600 + }, + { + "epoch": 2.659556786703601, + "grad_norm": 0.750576913356781, + "learning_rate": 2.9632278776223253e-06, + "loss": 0.6004, + "step": 9601 + }, + { + "epoch": 2.65983379501385, + "grad_norm": 0.7538860440254211, + "learning_rate": 2.9628698959214e-06, + "loss": 0.5941, + "step": 9602 + }, + { + "epoch": 2.6601108033240997, + "grad_norm": 0.7576056122779846, + "learning_rate": 2.9625119043925655e-06, + "loss": 0.5905, + "step": 9603 + }, + { + "epoch": 2.660387811634349, + "grad_norm": 0.7413800954818726, + "learning_rate": 2.9621539030434223e-06, + "loss": 0.5935, + "step": 9604 + }, + { + "epoch": 2.660664819944598, + "grad_norm": 0.7557872533798218, + "learning_rate": 2.9617958918815713e-06, + "loss": 0.5932, + "step": 9605 + }, + { + "epoch": 2.6609418282548476, + "grad_norm": 0.7386396527290344, + "learning_rate": 2.9614378709146136e-06, + "loss": 0.5809, + "step": 9606 + }, + { + "epoch": 2.661218836565097, + "grad_norm": 0.7721570134162903, + "learning_rate": 2.9610798401501523e-06, + "loss": 0.6371, + "step": 9607 + }, + { + "epoch": 2.661495844875346, + "grad_norm": 0.7065140604972839, + "learning_rate": 2.960721799595788e-06, + "loss": 0.6045, + "step": 9608 + }, + { + "epoch": 2.6617728531855955, + "grad_norm": 0.7167521715164185, + "learning_rate": 2.960363749259124e-06, + "loss": 0.6045, + "step": 9609 + }, + { + "epoch": 2.662049861495845, + "grad_norm": 0.7369183301925659, + "learning_rate": 2.9600056891477613e-06, + "loss": 0.6044, + "step": 9610 + }, + { + "epoch": 2.662326869806094, + "grad_norm": 0.7165054082870483, + "learning_rate": 2.9596476192693036e-06, + "loss": 0.5877, + "step": 9611 + }, + { + "epoch": 2.6626038781163435, + "grad_norm": 0.733930230140686, + "learning_rate": 2.959289539631353e-06, + "loss": 0.5752, + "step": 9612 + }, + { + "epoch": 2.662880886426593, + "grad_norm": 0.7349205017089844, + "learning_rate": 2.9589314502415125e-06, + "loss": 0.5882, + "step": 9613 + }, + { + "epoch": 2.663157894736842, + "grad_norm": 0.7827674746513367, + "learning_rate": 2.9585733511073855e-06, + "loss": 0.5944, + "step": 9614 + }, + { + "epoch": 2.6634349030470914, + "grad_norm": 0.7822999954223633, + "learning_rate": 2.9582152422365745e-06, + "loss": 0.6061, + "step": 9615 + }, + { + "epoch": 2.663711911357341, + "grad_norm": 0.719627857208252, + "learning_rate": 2.9578571236366843e-06, + "loss": 0.6213, + "step": 9616 + }, + { + "epoch": 2.66398891966759, + "grad_norm": 0.7425439357757568, + "learning_rate": 2.957498995315318e-06, + "loss": 0.6105, + "step": 9617 + }, + { + "epoch": 2.6642659279778393, + "grad_norm": 0.7512136697769165, + "learning_rate": 2.9571408572800802e-06, + "loss": 0.5737, + "step": 9618 + }, + { + "epoch": 2.664542936288089, + "grad_norm": 0.7477335333824158, + "learning_rate": 2.9567827095385737e-06, + "loss": 0.6346, + "step": 9619 + }, + { + "epoch": 2.664819944598338, + "grad_norm": 0.7497116327285767, + "learning_rate": 2.956424552098405e-06, + "loss": 0.5831, + "step": 9620 + }, + { + "epoch": 2.6650969529085873, + "grad_norm": 0.7096310257911682, + "learning_rate": 2.956066384967176e-06, + "loss": 0.5804, + "step": 9621 + }, + { + "epoch": 2.6653739612188367, + "grad_norm": 0.7228270769119263, + "learning_rate": 2.955708208152494e-06, + "loss": 0.5566, + "step": 9622 + }, + { + "epoch": 2.6656509695290858, + "grad_norm": 0.7391560077667236, + "learning_rate": 2.9553500216619626e-06, + "loss": 0.5742, + "step": 9623 + }, + { + "epoch": 2.665927977839335, + "grad_norm": 0.7600433826446533, + "learning_rate": 2.954991825503188e-06, + "loss": 0.6008, + "step": 9624 + }, + { + "epoch": 2.6662049861495847, + "grad_norm": 0.7527140974998474, + "learning_rate": 2.9546336196837743e-06, + "loss": 0.5938, + "step": 9625 + }, + { + "epoch": 2.6664819944598337, + "grad_norm": 0.727566659450531, + "learning_rate": 2.954275404211328e-06, + "loss": 0.6062, + "step": 9626 + }, + { + "epoch": 2.666759002770083, + "grad_norm": 0.7642120122909546, + "learning_rate": 2.9539171790934552e-06, + "loss": 0.5261, + "step": 9627 + }, + { + "epoch": 2.6670360110803326, + "grad_norm": 0.7786373496055603, + "learning_rate": 2.9535589443377604e-06, + "loss": 0.6142, + "step": 9628 + }, + { + "epoch": 2.6673130193905816, + "grad_norm": 0.7548345327377319, + "learning_rate": 2.953200699951852e-06, + "loss": 0.6039, + "step": 9629 + }, + { + "epoch": 2.667590027700831, + "grad_norm": 0.7462404370307922, + "learning_rate": 2.952842445943335e-06, + "loss": 0.61, + "step": 9630 + }, + { + "epoch": 2.6678670360110806, + "grad_norm": 0.7438084483146667, + "learning_rate": 2.952484182319817e-06, + "loss": 0.6228, + "step": 9631 + }, + { + "epoch": 2.6681440443213296, + "grad_norm": 0.7126626968383789, + "learning_rate": 2.9521259090889032e-06, + "loss": 0.5762, + "step": 9632 + }, + { + "epoch": 2.668421052631579, + "grad_norm": 0.7101565003395081, + "learning_rate": 2.951767626258203e-06, + "loss": 0.5764, + "step": 9633 + }, + { + "epoch": 2.6686980609418285, + "grad_norm": 0.8123554587364197, + "learning_rate": 2.951409333835321e-06, + "loss": 0.5824, + "step": 9634 + }, + { + "epoch": 2.6689750692520775, + "grad_norm": 0.7582353353500366, + "learning_rate": 2.9510510318278675e-06, + "loss": 0.5428, + "step": 9635 + }, + { + "epoch": 2.669252077562327, + "grad_norm": 0.691589891910553, + "learning_rate": 2.950692720243448e-06, + "loss": 0.5842, + "step": 9636 + }, + { + "epoch": 2.6695290858725764, + "grad_norm": 0.7699835896492004, + "learning_rate": 2.9503343990896715e-06, + "loss": 0.599, + "step": 9637 + }, + { + "epoch": 2.6698060941828254, + "grad_norm": 0.753537118434906, + "learning_rate": 2.9499760683741448e-06, + "loss": 0.5947, + "step": 9638 + }, + { + "epoch": 2.670083102493075, + "grad_norm": 0.7336054444313049, + "learning_rate": 2.949617728104478e-06, + "loss": 0.5759, + "step": 9639 + }, + { + "epoch": 2.6703601108033244, + "grad_norm": 0.7584792375564575, + "learning_rate": 2.9492593782882785e-06, + "loss": 0.6344, + "step": 9640 + }, + { + "epoch": 2.6706371191135734, + "grad_norm": 0.7488454580307007, + "learning_rate": 2.948901018933154e-06, + "loss": 0.6385, + "step": 9641 + }, + { + "epoch": 2.670914127423823, + "grad_norm": 0.7217627167701721, + "learning_rate": 2.9485426500467157e-06, + "loss": 0.5351, + "step": 9642 + }, + { + "epoch": 2.6711911357340723, + "grad_norm": 0.716277003288269, + "learning_rate": 2.948184271636571e-06, + "loss": 0.5781, + "step": 9643 + }, + { + "epoch": 2.6714681440443213, + "grad_norm": 0.7645633220672607, + "learning_rate": 2.9478258837103302e-06, + "loss": 0.6118, + "step": 9644 + }, + { + "epoch": 2.6717451523545708, + "grad_norm": 0.7127770781517029, + "learning_rate": 2.9474674862756014e-06, + "loss": 0.6011, + "step": 9645 + }, + { + "epoch": 2.67202216066482, + "grad_norm": 0.7803115248680115, + "learning_rate": 2.9471090793399966e-06, + "loss": 0.6041, + "step": 9646 + }, + { + "epoch": 2.6722991689750693, + "grad_norm": 0.7439555525779724, + "learning_rate": 2.9467506629111227e-06, + "loss": 0.5994, + "step": 9647 + }, + { + "epoch": 2.6725761772853187, + "grad_norm": 0.7263895869255066, + "learning_rate": 2.946392236996592e-06, + "loss": 0.5833, + "step": 9648 + }, + { + "epoch": 2.6728531855955677, + "grad_norm": 0.7334993481636047, + "learning_rate": 2.946033801604014e-06, + "loss": 0.5808, + "step": 9649 + }, + { + "epoch": 2.673130193905817, + "grad_norm": 0.7821438312530518, + "learning_rate": 2.9456753567409997e-06, + "loss": 0.6225, + "step": 9650 + }, + { + "epoch": 2.6734072022160666, + "grad_norm": 0.7657595276832581, + "learning_rate": 2.945316902415159e-06, + "loss": 0.6224, + "step": 9651 + }, + { + "epoch": 2.6736842105263157, + "grad_norm": 0.7557787299156189, + "learning_rate": 2.9449584386341037e-06, + "loss": 0.6104, + "step": 9652 + }, + { + "epoch": 2.673961218836565, + "grad_norm": 0.7655667066574097, + "learning_rate": 2.944599965405444e-06, + "loss": 0.5885, + "step": 9653 + }, + { + "epoch": 2.674238227146814, + "grad_norm": 0.7425467371940613, + "learning_rate": 2.9442414827367916e-06, + "loss": 0.6347, + "step": 9654 + }, + { + "epoch": 2.6745152354570636, + "grad_norm": 0.7573320865631104, + "learning_rate": 2.943882990635759e-06, + "loss": 0.5746, + "step": 9655 + }, + { + "epoch": 2.674792243767313, + "grad_norm": 0.7560545802116394, + "learning_rate": 2.9435244891099555e-06, + "loss": 0.5833, + "step": 9656 + }, + { + "epoch": 2.675069252077562, + "grad_norm": 0.761762261390686, + "learning_rate": 2.943165978166995e-06, + "loss": 0.5782, + "step": 9657 + }, + { + "epoch": 2.6753462603878115, + "grad_norm": 0.7574995160102844, + "learning_rate": 2.94280745781449e-06, + "loss": 0.6128, + "step": 9658 + }, + { + "epoch": 2.675623268698061, + "grad_norm": 0.7315826416015625, + "learning_rate": 2.942448928060051e-06, + "loss": 0.5502, + "step": 9659 + }, + { + "epoch": 2.67590027700831, + "grad_norm": 0.7283254861831665, + "learning_rate": 2.942090388911291e-06, + "loss": 0.5925, + "step": 9660 + }, + { + "epoch": 2.6761772853185595, + "grad_norm": 0.7325502038002014, + "learning_rate": 2.941731840375824e-06, + "loss": 0.5873, + "step": 9661 + }, + { + "epoch": 2.676454293628809, + "grad_norm": 0.7269346714019775, + "learning_rate": 2.9413732824612617e-06, + "loss": 0.6004, + "step": 9662 + }, + { + "epoch": 2.676731301939058, + "grad_norm": 0.7535871863365173, + "learning_rate": 2.9410147151752172e-06, + "loss": 0.655, + "step": 9663 + }, + { + "epoch": 2.6770083102493074, + "grad_norm": 0.7521666884422302, + "learning_rate": 2.940656138525304e-06, + "loss": 0.6068, + "step": 9664 + }, + { + "epoch": 2.677285318559557, + "grad_norm": 0.7384536862373352, + "learning_rate": 2.9402975525191364e-06, + "loss": 0.6126, + "step": 9665 + }, + { + "epoch": 2.677562326869806, + "grad_norm": 0.6997430920600891, + "learning_rate": 2.9399389571643263e-06, + "loss": 0.5439, + "step": 9666 + }, + { + "epoch": 2.6778393351800553, + "grad_norm": 0.7200013399124146, + "learning_rate": 2.9395803524684885e-06, + "loss": 0.5884, + "step": 9667 + }, + { + "epoch": 2.678116343490305, + "grad_norm": 0.7792138457298279, + "learning_rate": 2.939221738439239e-06, + "loss": 0.5921, + "step": 9668 + }, + { + "epoch": 2.678393351800554, + "grad_norm": 0.7050516605377197, + "learning_rate": 2.9388631150841884e-06, + "loss": 0.59, + "step": 9669 + }, + { + "epoch": 2.6786703601108033, + "grad_norm": 0.7874724268913269, + "learning_rate": 2.9385044824109544e-06, + "loss": 0.615, + "step": 9670 + }, + { + "epoch": 2.6789473684210527, + "grad_norm": 0.7635147571563721, + "learning_rate": 2.93814584042715e-06, + "loss": 0.5562, + "step": 9671 + }, + { + "epoch": 2.6792243767313018, + "grad_norm": 0.7510080337524414, + "learning_rate": 2.9377871891403907e-06, + "loss": 0.6612, + "step": 9672 + }, + { + "epoch": 2.679501385041551, + "grad_norm": 0.7657120227813721, + "learning_rate": 2.937428528558291e-06, + "loss": 0.6431, + "step": 9673 + }, + { + "epoch": 2.6797783933518007, + "grad_norm": 0.7359679937362671, + "learning_rate": 2.9370698586884666e-06, + "loss": 0.5634, + "step": 9674 + }, + { + "epoch": 2.6800554016620497, + "grad_norm": 0.7201767563819885, + "learning_rate": 2.936711179538533e-06, + "loss": 0.5771, + "step": 9675 + }, + { + "epoch": 2.680332409972299, + "grad_norm": 0.7329818606376648, + "learning_rate": 2.9363524911161063e-06, + "loss": 0.6105, + "step": 9676 + }, + { + "epoch": 2.6806094182825486, + "grad_norm": 0.7106482982635498, + "learning_rate": 2.9359937934288013e-06, + "loss": 0.5773, + "step": 9677 + }, + { + "epoch": 2.6808864265927976, + "grad_norm": 0.705254852771759, + "learning_rate": 2.9356350864842355e-06, + "loss": 0.516, + "step": 9678 + }, + { + "epoch": 2.681163434903047, + "grad_norm": 0.7572067975997925, + "learning_rate": 2.9352763702900234e-06, + "loss": 0.6003, + "step": 9679 + }, + { + "epoch": 2.6814404432132966, + "grad_norm": 0.7427892088890076, + "learning_rate": 2.9349176448537826e-06, + "loss": 0.6282, + "step": 9680 + }, + { + "epoch": 2.6817174515235456, + "grad_norm": 0.7307045459747314, + "learning_rate": 2.9345589101831294e-06, + "loss": 0.5686, + "step": 9681 + }, + { + "epoch": 2.681994459833795, + "grad_norm": 0.7494450211524963, + "learning_rate": 2.934200166285681e-06, + "loss": 0.6109, + "step": 9682 + }, + { + "epoch": 2.6822714681440445, + "grad_norm": 0.7409537434577942, + "learning_rate": 2.9338414131690546e-06, + "loss": 0.6004, + "step": 9683 + }, + { + "epoch": 2.6825484764542935, + "grad_norm": 0.7529603242874146, + "learning_rate": 2.9334826508408665e-06, + "loss": 0.6336, + "step": 9684 + }, + { + "epoch": 2.682825484764543, + "grad_norm": 0.7535399198532104, + "learning_rate": 2.9331238793087353e-06, + "loss": 0.582, + "step": 9685 + }, + { + "epoch": 2.6831024930747924, + "grad_norm": 0.7496642470359802, + "learning_rate": 2.9327650985802776e-06, + "loss": 0.5911, + "step": 9686 + }, + { + "epoch": 2.6833795013850414, + "grad_norm": 0.7197352647781372, + "learning_rate": 2.932406308663112e-06, + "loss": 0.5887, + "step": 9687 + }, + { + "epoch": 2.683656509695291, + "grad_norm": 0.7457350492477417, + "learning_rate": 2.932047509564856e-06, + "loss": 0.5508, + "step": 9688 + }, + { + "epoch": 2.6839335180055404, + "grad_norm": 0.7327432632446289, + "learning_rate": 2.9316887012931283e-06, + "loss": 0.5757, + "step": 9689 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.6928367614746094, + "learning_rate": 2.931329883855547e-06, + "loss": 0.586, + "step": 9690 + }, + { + "epoch": 2.684487534626039, + "grad_norm": 0.7441751956939697, + "learning_rate": 2.9309710572597307e-06, + "loss": 0.5658, + "step": 9691 + }, + { + "epoch": 2.6847645429362883, + "grad_norm": 0.7539411783218384, + "learning_rate": 2.9306122215132974e-06, + "loss": 0.6482, + "step": 9692 + }, + { + "epoch": 2.6850415512465373, + "grad_norm": 0.7249196171760559, + "learning_rate": 2.930253376623868e-06, + "loss": 0.6555, + "step": 9693 + }, + { + "epoch": 2.6853185595567868, + "grad_norm": 0.7135176062583923, + "learning_rate": 2.9298945225990606e-06, + "loss": 0.5994, + "step": 9694 + }, + { + "epoch": 2.6855955678670362, + "grad_norm": 0.7672198414802551, + "learning_rate": 2.9295356594464943e-06, + "loss": 0.6298, + "step": 9695 + }, + { + "epoch": 2.6858725761772853, + "grad_norm": 0.746395468711853, + "learning_rate": 2.929176787173789e-06, + "loss": 0.6163, + "step": 9696 + }, + { + "epoch": 2.6861495844875347, + "grad_norm": 0.6857293844223022, + "learning_rate": 2.928817905788565e-06, + "loss": 0.5453, + "step": 9697 + }, + { + "epoch": 2.686426592797784, + "grad_norm": 0.7374250292778015, + "learning_rate": 2.9284590152984416e-06, + "loss": 0.5891, + "step": 9698 + }, + { + "epoch": 2.686703601108033, + "grad_norm": 0.7343308925628662, + "learning_rate": 2.928100115711039e-06, + "loss": 0.5837, + "step": 9699 + }, + { + "epoch": 2.6869806094182827, + "grad_norm": 0.7549000978469849, + "learning_rate": 2.9277412070339783e-06, + "loss": 0.5832, + "step": 9700 + }, + { + "epoch": 2.687257617728532, + "grad_norm": 0.7739076614379883, + "learning_rate": 2.927382289274878e-06, + "loss": 0.6404, + "step": 9701 + }, + { + "epoch": 2.687534626038781, + "grad_norm": 0.7524203062057495, + "learning_rate": 2.9270233624413617e-06, + "loss": 0.5872, + "step": 9702 + }, + { + "epoch": 2.6878116343490306, + "grad_norm": 0.7455158829689026, + "learning_rate": 2.926664426541049e-06, + "loss": 0.6154, + "step": 9703 + }, + { + "epoch": 2.68808864265928, + "grad_norm": 0.7223373055458069, + "learning_rate": 2.92630548158156e-06, + "loss": 0.6114, + "step": 9704 + }, + { + "epoch": 2.688365650969529, + "grad_norm": 0.738690972328186, + "learning_rate": 2.9259465275705166e-06, + "loss": 0.5711, + "step": 9705 + }, + { + "epoch": 2.6886426592797785, + "grad_norm": 0.7432278394699097, + "learning_rate": 2.9255875645155426e-06, + "loss": 0.5757, + "step": 9706 + }, + { + "epoch": 2.6889196675900275, + "grad_norm": 0.7743191123008728, + "learning_rate": 2.925228592424256e-06, + "loss": 0.6445, + "step": 9707 + }, + { + "epoch": 2.689196675900277, + "grad_norm": 0.7368574142456055, + "learning_rate": 2.9248696113042815e-06, + "loss": 0.5934, + "step": 9708 + }, + { + "epoch": 2.6894736842105265, + "grad_norm": 0.7550312876701355, + "learning_rate": 2.9245106211632398e-06, + "loss": 0.5848, + "step": 9709 + }, + { + "epoch": 2.6897506925207755, + "grad_norm": 0.7899466156959534, + "learning_rate": 2.9241516220087523e-06, + "loss": 0.5965, + "step": 9710 + }, + { + "epoch": 2.690027700831025, + "grad_norm": 0.7387313842773438, + "learning_rate": 2.923792613848444e-06, + "loss": 0.5383, + "step": 9711 + }, + { + "epoch": 2.6903047091412744, + "grad_norm": 0.7411960363388062, + "learning_rate": 2.9234335966899364e-06, + "loss": 0.5996, + "step": 9712 + }, + { + "epoch": 2.6905817174515234, + "grad_norm": 0.7664662003517151, + "learning_rate": 2.923074570540852e-06, + "loss": 0.6016, + "step": 9713 + }, + { + "epoch": 2.690858725761773, + "grad_norm": 0.7731584906578064, + "learning_rate": 2.9227155354088134e-06, + "loss": 0.616, + "step": 9714 + }, + { + "epoch": 2.691135734072022, + "grad_norm": 0.7891464829444885, + "learning_rate": 2.922356491301445e-06, + "loss": 0.6266, + "step": 9715 + }, + { + "epoch": 2.6914127423822714, + "grad_norm": 0.7034754157066345, + "learning_rate": 2.92199743822637e-06, + "loss": 0.5748, + "step": 9716 + }, + { + "epoch": 2.691689750692521, + "grad_norm": 0.716183066368103, + "learning_rate": 2.9216383761912113e-06, + "loss": 0.559, + "step": 9717 + }, + { + "epoch": 2.69196675900277, + "grad_norm": 0.7393279075622559, + "learning_rate": 2.9212793052035934e-06, + "loss": 0.5972, + "step": 9718 + }, + { + "epoch": 2.6922437673130193, + "grad_norm": 0.7390798330307007, + "learning_rate": 2.9209202252711406e-06, + "loss": 0.6116, + "step": 9719 + }, + { + "epoch": 2.6925207756232687, + "grad_norm": 0.7001668214797974, + "learning_rate": 2.920561136401475e-06, + "loss": 0.5541, + "step": 9720 + }, + { + "epoch": 2.6927977839335178, + "grad_norm": 0.7521222829818726, + "learning_rate": 2.9202020386022235e-06, + "loss": 0.5988, + "step": 9721 + }, + { + "epoch": 2.6930747922437672, + "grad_norm": 0.7328142523765564, + "learning_rate": 2.9198429318810095e-06, + "loss": 0.63, + "step": 9722 + }, + { + "epoch": 2.6933518005540167, + "grad_norm": 0.7463355660438538, + "learning_rate": 2.9194838162454573e-06, + "loss": 0.61, + "step": 9723 + }, + { + "epoch": 2.6936288088642657, + "grad_norm": 0.7568442225456238, + "learning_rate": 2.9191246917031936e-06, + "loss": 0.5795, + "step": 9724 + }, + { + "epoch": 2.693905817174515, + "grad_norm": 0.7941131591796875, + "learning_rate": 2.9187655582618413e-06, + "loss": 0.6286, + "step": 9725 + }, + { + "epoch": 2.6941828254847646, + "grad_norm": 0.7539636492729187, + "learning_rate": 2.918406415929028e-06, + "loss": 0.5493, + "step": 9726 + }, + { + "epoch": 2.6944598337950136, + "grad_norm": 0.7329300045967102, + "learning_rate": 2.9180472647123765e-06, + "loss": 0.6389, + "step": 9727 + }, + { + "epoch": 2.694736842105263, + "grad_norm": 0.7334020733833313, + "learning_rate": 2.9176881046195145e-06, + "loss": 0.6247, + "step": 9728 + }, + { + "epoch": 2.6950138504155126, + "grad_norm": 0.7126622796058655, + "learning_rate": 2.9173289356580676e-06, + "loss": 0.5517, + "step": 9729 + }, + { + "epoch": 2.6952908587257616, + "grad_norm": 0.7421119213104248, + "learning_rate": 2.916969757835662e-06, + "loss": 0.5809, + "step": 9730 + }, + { + "epoch": 2.695567867036011, + "grad_norm": 0.6701324582099915, + "learning_rate": 2.9166105711599237e-06, + "loss": 0.5741, + "step": 9731 + }, + { + "epoch": 2.6958448753462605, + "grad_norm": 0.7376656532287598, + "learning_rate": 2.916251375638478e-06, + "loss": 0.5799, + "step": 9732 + }, + { + "epoch": 2.6961218836565095, + "grad_norm": 0.7610093355178833, + "learning_rate": 2.915892171278953e-06, + "loss": 0.5926, + "step": 9733 + }, + { + "epoch": 2.696398891966759, + "grad_norm": 0.7553840279579163, + "learning_rate": 2.915532958088976e-06, + "loss": 0.5387, + "step": 9734 + }, + { + "epoch": 2.6966759002770084, + "grad_norm": 0.7403333783149719, + "learning_rate": 2.915173736076172e-06, + "loss": 0.5533, + "step": 9735 + }, + { + "epoch": 2.6969529085872574, + "grad_norm": 0.7409501671791077, + "learning_rate": 2.91481450524817e-06, + "loss": 0.6253, + "step": 9736 + }, + { + "epoch": 2.697229916897507, + "grad_norm": 0.7569580078125, + "learning_rate": 2.9144552656125963e-06, + "loss": 0.5649, + "step": 9737 + }, + { + "epoch": 2.6975069252077564, + "grad_norm": 0.7319048047065735, + "learning_rate": 2.9140960171770795e-06, + "loss": 0.6189, + "step": 9738 + }, + { + "epoch": 2.6977839335180054, + "grad_norm": 0.7332355976104736, + "learning_rate": 2.9137367599492463e-06, + "loss": 0.54, + "step": 9739 + }, + { + "epoch": 2.698060941828255, + "grad_norm": 0.7282864451408386, + "learning_rate": 2.9133774939367248e-06, + "loss": 0.5977, + "step": 9740 + }, + { + "epoch": 2.6983379501385043, + "grad_norm": 0.7558872699737549, + "learning_rate": 2.9130182191471445e-06, + "loss": 0.6258, + "step": 9741 + }, + { + "epoch": 2.6986149584487533, + "grad_norm": 0.7340047359466553, + "learning_rate": 2.9126589355881317e-06, + "loss": 0.6142, + "step": 9742 + }, + { + "epoch": 2.698891966759003, + "grad_norm": 0.7753874659538269, + "learning_rate": 2.9122996432673165e-06, + "loss": 0.5904, + "step": 9743 + }, + { + "epoch": 2.6991689750692522, + "grad_norm": 0.6965166926383972, + "learning_rate": 2.911940342192327e-06, + "loss": 0.5466, + "step": 9744 + }, + { + "epoch": 2.6994459833795013, + "grad_norm": 0.7482168674468994, + "learning_rate": 2.9115810323707917e-06, + "loss": 0.5935, + "step": 9745 + }, + { + "epoch": 2.6997229916897507, + "grad_norm": 0.6827434301376343, + "learning_rate": 2.91122171381034e-06, + "loss": 0.5701, + "step": 9746 + }, + { + "epoch": 2.7, + "grad_norm": 0.6957672834396362, + "learning_rate": 2.9108623865186015e-06, + "loss": 0.5836, + "step": 9747 + }, + { + "epoch": 2.700277008310249, + "grad_norm": 0.7049537897109985, + "learning_rate": 2.910503050503205e-06, + "loss": 0.5234, + "step": 9748 + }, + { + "epoch": 2.7005540166204987, + "grad_norm": 0.7790834307670593, + "learning_rate": 2.9101437057717808e-06, + "loss": 0.6205, + "step": 9749 + }, + { + "epoch": 2.700831024930748, + "grad_norm": 0.7092089653015137, + "learning_rate": 2.9097843523319574e-06, + "loss": 0.5931, + "step": 9750 + }, + { + "epoch": 2.701108033240997, + "grad_norm": 0.8164337277412415, + "learning_rate": 2.9094249901913666e-06, + "loss": 0.6217, + "step": 9751 + }, + { + "epoch": 2.7013850415512466, + "grad_norm": 0.7725958228111267, + "learning_rate": 2.9090656193576374e-06, + "loss": 0.5835, + "step": 9752 + }, + { + "epoch": 2.701662049861496, + "grad_norm": 0.7309401035308838, + "learning_rate": 2.9087062398384005e-06, + "loss": 0.5764, + "step": 9753 + }, + { + "epoch": 2.701939058171745, + "grad_norm": 0.7685689330101013, + "learning_rate": 2.9083468516412865e-06, + "loss": 0.5686, + "step": 9754 + }, + { + "epoch": 2.7022160664819945, + "grad_norm": 0.7559168338775635, + "learning_rate": 2.9079874547739255e-06, + "loss": 0.6157, + "step": 9755 + }, + { + "epoch": 2.702493074792244, + "grad_norm": 0.7864164113998413, + "learning_rate": 2.90762804924395e-06, + "loss": 0.6118, + "step": 9756 + }, + { + "epoch": 2.702770083102493, + "grad_norm": 0.7336371541023254, + "learning_rate": 2.9072686350589894e-06, + "loss": 0.5844, + "step": 9757 + }, + { + "epoch": 2.7030470914127425, + "grad_norm": 0.7496275901794434, + "learning_rate": 2.9069092122266758e-06, + "loss": 0.6102, + "step": 9758 + }, + { + "epoch": 2.703324099722992, + "grad_norm": 0.7452666163444519, + "learning_rate": 2.90654978075464e-06, + "loss": 0.6431, + "step": 9759 + }, + { + "epoch": 2.703601108033241, + "grad_norm": 0.777228593826294, + "learning_rate": 2.9061903406505153e-06, + "loss": 0.5739, + "step": 9760 + }, + { + "epoch": 2.7038781163434904, + "grad_norm": 0.7218241095542908, + "learning_rate": 2.9058308919219315e-06, + "loss": 0.5931, + "step": 9761 + }, + { + "epoch": 2.70415512465374, + "grad_norm": 0.7450538873672485, + "learning_rate": 2.905471434576521e-06, + "loss": 0.6052, + "step": 9762 + }, + { + "epoch": 2.704432132963989, + "grad_norm": 0.7024423480033875, + "learning_rate": 2.9051119686219176e-06, + "loss": 0.5941, + "step": 9763 + }, + { + "epoch": 2.7047091412742383, + "grad_norm": 0.736329972743988, + "learning_rate": 2.904752494065752e-06, + "loss": 0.5991, + "step": 9764 + }, + { + "epoch": 2.704986149584488, + "grad_norm": 0.6893607378005981, + "learning_rate": 2.904393010915657e-06, + "loss": 0.5577, + "step": 9765 + }, + { + "epoch": 2.705263157894737, + "grad_norm": 0.7273480892181396, + "learning_rate": 2.9040335191792663e-06, + "loss": 0.5537, + "step": 9766 + }, + { + "epoch": 2.7055401662049863, + "grad_norm": 0.6868672370910645, + "learning_rate": 2.9036740188642123e-06, + "loss": 0.5484, + "step": 9767 + }, + { + "epoch": 2.7058171745152357, + "grad_norm": 0.7351400852203369, + "learning_rate": 2.9033145099781268e-06, + "loss": 0.5935, + "step": 9768 + }, + { + "epoch": 2.7060941828254848, + "grad_norm": 0.7292349934577942, + "learning_rate": 2.9029549925286453e-06, + "loss": 0.5882, + "step": 9769 + }, + { + "epoch": 2.706371191135734, + "grad_norm": 0.7196292877197266, + "learning_rate": 2.9025954665234e-06, + "loss": 0.6021, + "step": 9770 + }, + { + "epoch": 2.7066481994459832, + "grad_norm": 0.751592755317688, + "learning_rate": 2.9022359319700254e-06, + "loss": 0.5931, + "step": 9771 + }, + { + "epoch": 2.7069252077562327, + "grad_norm": 0.7287519574165344, + "learning_rate": 2.9018763888761535e-06, + "loss": 0.5459, + "step": 9772 + }, + { + "epoch": 2.707202216066482, + "grad_norm": 0.7140438556671143, + "learning_rate": 2.901516837249421e-06, + "loss": 0.5956, + "step": 9773 + }, + { + "epoch": 2.707479224376731, + "grad_norm": 0.7360804080963135, + "learning_rate": 2.90115727709746e-06, + "loss": 0.6057, + "step": 9774 + }, + { + "epoch": 2.7077562326869806, + "grad_norm": 0.7283020615577698, + "learning_rate": 2.9007977084279058e-06, + "loss": 0.5403, + "step": 9775 + }, + { + "epoch": 2.7080332409972296, + "grad_norm": 0.7723913192749023, + "learning_rate": 2.9004381312483926e-06, + "loss": 0.5868, + "step": 9776 + }, + { + "epoch": 2.708310249307479, + "grad_norm": 0.7189229726791382, + "learning_rate": 2.9000785455665553e-06, + "loss": 0.6085, + "step": 9777 + }, + { + "epoch": 2.7085872576177286, + "grad_norm": 0.7812227010726929, + "learning_rate": 2.899718951390029e-06, + "loss": 0.5868, + "step": 9778 + }, + { + "epoch": 2.7088642659279776, + "grad_norm": 0.7589960098266602, + "learning_rate": 2.899359348726448e-06, + "loss": 0.6533, + "step": 9779 + }, + { + "epoch": 2.709141274238227, + "grad_norm": 0.7362611889839172, + "learning_rate": 2.8989997375834485e-06, + "loss": 0.576, + "step": 9780 + }, + { + "epoch": 2.7094182825484765, + "grad_norm": 0.7336854934692383, + "learning_rate": 2.898640117968665e-06, + "loss": 0.5834, + "step": 9781 + }, + { + "epoch": 2.7096952908587255, + "grad_norm": 0.7211445569992065, + "learning_rate": 2.898280489889735e-06, + "loss": 0.5809, + "step": 9782 + }, + { + "epoch": 2.709972299168975, + "grad_norm": 0.7566702365875244, + "learning_rate": 2.8979208533542923e-06, + "loss": 0.6097, + "step": 9783 + }, + { + "epoch": 2.7102493074792244, + "grad_norm": 0.7137244343757629, + "learning_rate": 2.8975612083699737e-06, + "loss": 0.6096, + "step": 9784 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.7393102049827576, + "learning_rate": 2.8972015549444162e-06, + "loss": 0.6004, + "step": 9785 + }, + { + "epoch": 2.710803324099723, + "grad_norm": 0.7391594648361206, + "learning_rate": 2.896841893085255e-06, + "loss": 0.5695, + "step": 9786 + }, + { + "epoch": 2.7110803324099724, + "grad_norm": 0.780170738697052, + "learning_rate": 2.896482222800126e-06, + "loss": 0.5785, + "step": 9787 + }, + { + "epoch": 2.7113573407202214, + "grad_norm": 0.7280192375183105, + "learning_rate": 2.8961225440966678e-06, + "loss": 0.5825, + "step": 9788 + }, + { + "epoch": 2.711634349030471, + "grad_norm": 0.7467705011367798, + "learning_rate": 2.8957628569825165e-06, + "loss": 0.6411, + "step": 9789 + }, + { + "epoch": 2.7119113573407203, + "grad_norm": 0.7193437814712524, + "learning_rate": 2.895403161465309e-06, + "loss": 0.5487, + "step": 9790 + }, + { + "epoch": 2.7121883656509693, + "grad_norm": 0.7068139910697937, + "learning_rate": 2.895043457552682e-06, + "loss": 0.5979, + "step": 9791 + }, + { + "epoch": 2.712465373961219, + "grad_norm": 0.7164737582206726, + "learning_rate": 2.8946837452522746e-06, + "loss": 0.5583, + "step": 9792 + }, + { + "epoch": 2.7127423822714682, + "grad_norm": 0.838596761226654, + "learning_rate": 2.8943240245717225e-06, + "loss": 0.6158, + "step": 9793 + }, + { + "epoch": 2.7130193905817173, + "grad_norm": 0.7566062808036804, + "learning_rate": 2.893964295518664e-06, + "loss": 0.5801, + "step": 9794 + }, + { + "epoch": 2.7132963988919667, + "grad_norm": 0.7423161864280701, + "learning_rate": 2.893604558100739e-06, + "loss": 0.6034, + "step": 9795 + }, + { + "epoch": 2.713573407202216, + "grad_norm": 0.7267448306083679, + "learning_rate": 2.8932448123255825e-06, + "loss": 0.6231, + "step": 9796 + }, + { + "epoch": 2.713850415512465, + "grad_norm": 0.7339398860931396, + "learning_rate": 2.892885058200835e-06, + "loss": 0.5997, + "step": 9797 + }, + { + "epoch": 2.7141274238227147, + "grad_norm": 0.7441668510437012, + "learning_rate": 2.8925252957341347e-06, + "loss": 0.6246, + "step": 9798 + }, + { + "epoch": 2.714404432132964, + "grad_norm": 0.7524739503860474, + "learning_rate": 2.8921655249331198e-06, + "loss": 0.6254, + "step": 9799 + }, + { + "epoch": 2.714681440443213, + "grad_norm": 0.7392253875732422, + "learning_rate": 2.8918057458054288e-06, + "loss": 0.5989, + "step": 9800 + }, + { + "epoch": 2.7149584487534626, + "grad_norm": 0.746084988117218, + "learning_rate": 2.891445958358702e-06, + "loss": 0.6185, + "step": 9801 + }, + { + "epoch": 2.715235457063712, + "grad_norm": 0.7321019768714905, + "learning_rate": 2.8910861626005774e-06, + "loss": 0.5733, + "step": 9802 + }, + { + "epoch": 2.715512465373961, + "grad_norm": 0.7405261397361755, + "learning_rate": 2.8907263585386954e-06, + "loss": 0.5753, + "step": 9803 + }, + { + "epoch": 2.7157894736842105, + "grad_norm": 0.7382570505142212, + "learning_rate": 2.890366546180694e-06, + "loss": 0.6047, + "step": 9804 + }, + { + "epoch": 2.71606648199446, + "grad_norm": 0.7403123378753662, + "learning_rate": 2.8900067255342152e-06, + "loss": 0.5944, + "step": 9805 + }, + { + "epoch": 2.716343490304709, + "grad_norm": 0.7846508622169495, + "learning_rate": 2.889646896606897e-06, + "loss": 0.6135, + "step": 9806 + }, + { + "epoch": 2.7166204986149585, + "grad_norm": 0.7530471682548523, + "learning_rate": 2.88928705940638e-06, + "loss": 0.5899, + "step": 9807 + }, + { + "epoch": 2.716897506925208, + "grad_norm": 0.6979814767837524, + "learning_rate": 2.8889272139403047e-06, + "loss": 0.6406, + "step": 9808 + }, + { + "epoch": 2.717174515235457, + "grad_norm": 0.7776252627372742, + "learning_rate": 2.888567360216311e-06, + "loss": 0.6207, + "step": 9809 + }, + { + "epoch": 2.7174515235457064, + "grad_norm": 0.7415891289710999, + "learning_rate": 2.8882074982420405e-06, + "loss": 0.5763, + "step": 9810 + }, + { + "epoch": 2.717728531855956, + "grad_norm": 0.7440627217292786, + "learning_rate": 2.8878476280251333e-06, + "loss": 0.5803, + "step": 9811 + }, + { + "epoch": 2.718005540166205, + "grad_norm": 0.744896411895752, + "learning_rate": 2.8874877495732306e-06, + "loss": 0.5769, + "step": 9812 + }, + { + "epoch": 2.7182825484764543, + "grad_norm": 0.7108137607574463, + "learning_rate": 2.887127862893973e-06, + "loss": 0.5794, + "step": 9813 + }, + { + "epoch": 2.718559556786704, + "grad_norm": 0.7342739701271057, + "learning_rate": 2.8867679679950032e-06, + "loss": 0.5807, + "step": 9814 + }, + { + "epoch": 2.718836565096953, + "grad_norm": 0.7428385615348816, + "learning_rate": 2.886408064883961e-06, + "loss": 0.5883, + "step": 9815 + }, + { + "epoch": 2.7191135734072023, + "grad_norm": 0.7807708978652954, + "learning_rate": 2.886048153568489e-06, + "loss": 0.6403, + "step": 9816 + }, + { + "epoch": 2.7193905817174517, + "grad_norm": 0.7568454146385193, + "learning_rate": 2.885688234056229e-06, + "loss": 0.657, + "step": 9817 + }, + { + "epoch": 2.7196675900277008, + "grad_norm": 0.7030062079429626, + "learning_rate": 2.885328306354823e-06, + "loss": 0.6298, + "step": 9818 + }, + { + "epoch": 2.71994459833795, + "grad_norm": 0.7581620216369629, + "learning_rate": 2.8849683704719123e-06, + "loss": 0.5754, + "step": 9819 + }, + { + "epoch": 2.7202216066481997, + "grad_norm": 0.727524995803833, + "learning_rate": 2.8846084264151403e-06, + "loss": 0.6101, + "step": 9820 + }, + { + "epoch": 2.7204986149584487, + "grad_norm": 0.7349806427955627, + "learning_rate": 2.8842484741921496e-06, + "loss": 0.6251, + "step": 9821 + }, + { + "epoch": 2.720775623268698, + "grad_norm": 0.754910945892334, + "learning_rate": 2.883888513810582e-06, + "loss": 0.5552, + "step": 9822 + }, + { + "epoch": 2.7210526315789476, + "grad_norm": 0.7700474858283997, + "learning_rate": 2.883528545278081e-06, + "loss": 0.615, + "step": 9823 + }, + { + "epoch": 2.7213296398891966, + "grad_norm": 0.7219333648681641, + "learning_rate": 2.8831685686022897e-06, + "loss": 0.598, + "step": 9824 + }, + { + "epoch": 2.721606648199446, + "grad_norm": 0.7458710074424744, + "learning_rate": 2.8828085837908514e-06, + "loss": 0.6061, + "step": 9825 + }, + { + "epoch": 2.7218836565096955, + "grad_norm": 0.7252250909805298, + "learning_rate": 2.8824485908514087e-06, + "loss": 0.5542, + "step": 9826 + }, + { + "epoch": 2.7221606648199446, + "grad_norm": 0.7319718599319458, + "learning_rate": 2.882088589791607e-06, + "loss": 0.5981, + "step": 9827 + }, + { + "epoch": 2.722437673130194, + "grad_norm": 0.7807872295379639, + "learning_rate": 2.8817285806190877e-06, + "loss": 0.6203, + "step": 9828 + }, + { + "epoch": 2.7227146814404435, + "grad_norm": 0.7514304518699646, + "learning_rate": 2.881368563341496e-06, + "loss": 0.5926, + "step": 9829 + }, + { + "epoch": 2.7229916897506925, + "grad_norm": 0.7369164824485779, + "learning_rate": 2.881008537966476e-06, + "loss": 0.601, + "step": 9830 + }, + { + "epoch": 2.723268698060942, + "grad_norm": 0.7181528210639954, + "learning_rate": 2.880648504501671e-06, + "loss": 0.5683, + "step": 9831 + }, + { + "epoch": 2.723545706371191, + "grad_norm": 0.730129599571228, + "learning_rate": 2.880288462954727e-06, + "loss": 0.6088, + "step": 9832 + }, + { + "epoch": 2.7238227146814404, + "grad_norm": 0.7116170525550842, + "learning_rate": 2.8799284133332878e-06, + "loss": 0.5718, + "step": 9833 + }, + { + "epoch": 2.72409972299169, + "grad_norm": 0.7043094038963318, + "learning_rate": 2.879568355644998e-06, + "loss": 0.5554, + "step": 9834 + }, + { + "epoch": 2.724376731301939, + "grad_norm": 0.7326838374137878, + "learning_rate": 2.8792082898975028e-06, + "loss": 0.5893, + "step": 9835 + }, + { + "epoch": 2.7246537396121884, + "grad_norm": 0.7812146544456482, + "learning_rate": 2.878848216098446e-06, + "loss": 0.6242, + "step": 9836 + }, + { + "epoch": 2.724930747922438, + "grad_norm": 0.7676199674606323, + "learning_rate": 2.8784881342554753e-06, + "loss": 0.6502, + "step": 9837 + }, + { + "epoch": 2.725207756232687, + "grad_norm": 0.7518264055252075, + "learning_rate": 2.8781280443762348e-06, + "loss": 0.5986, + "step": 9838 + }, + { + "epoch": 2.7254847645429363, + "grad_norm": 0.7407569885253906, + "learning_rate": 2.8777679464683704e-06, + "loss": 0.6142, + "step": 9839 + }, + { + "epoch": 2.7257617728531853, + "grad_norm": 0.7235451340675354, + "learning_rate": 2.8774078405395276e-06, + "loss": 0.6065, + "step": 9840 + }, + { + "epoch": 2.726038781163435, + "grad_norm": 0.714812159538269, + "learning_rate": 2.8770477265973525e-06, + "loss": 0.6125, + "step": 9841 + }, + { + "epoch": 2.7263157894736842, + "grad_norm": 0.8110000491142273, + "learning_rate": 2.876687604649491e-06, + "loss": 0.6032, + "step": 9842 + }, + { + "epoch": 2.7265927977839333, + "grad_norm": 0.7523626089096069, + "learning_rate": 2.8763274747035907e-06, + "loss": 0.6239, + "step": 9843 + }, + { + "epoch": 2.7268698060941827, + "grad_norm": 0.7214419841766357, + "learning_rate": 2.8759673367672964e-06, + "loss": 0.6413, + "step": 9844 + }, + { + "epoch": 2.727146814404432, + "grad_norm": 0.7556858658790588, + "learning_rate": 2.8756071908482553e-06, + "loss": 0.6243, + "step": 9845 + }, + { + "epoch": 2.727423822714681, + "grad_norm": 0.7446470856666565, + "learning_rate": 2.8752470369541152e-06, + "loss": 0.5993, + "step": 9846 + }, + { + "epoch": 2.7277008310249307, + "grad_norm": 0.7211043834686279, + "learning_rate": 2.8748868750925207e-06, + "loss": 0.594, + "step": 9847 + }, + { + "epoch": 2.72797783933518, + "grad_norm": 0.7198621034622192, + "learning_rate": 2.8745267052711217e-06, + "loss": 0.6002, + "step": 9848 + }, + { + "epoch": 2.728254847645429, + "grad_norm": 0.7595424652099609, + "learning_rate": 2.874166527497564e-06, + "loss": 0.5839, + "step": 9849 + }, + { + "epoch": 2.7285318559556786, + "grad_norm": 0.7390759587287903, + "learning_rate": 2.8738063417794947e-06, + "loss": 0.5863, + "step": 9850 + }, + { + "epoch": 2.728808864265928, + "grad_norm": 0.746052622795105, + "learning_rate": 2.873446148124563e-06, + "loss": 0.5581, + "step": 9851 + }, + { + "epoch": 2.729085872576177, + "grad_norm": 0.732109785079956, + "learning_rate": 2.873085946540416e-06, + "loss": 0.6028, + "step": 9852 + }, + { + "epoch": 2.7293628808864265, + "grad_norm": 0.7366575002670288, + "learning_rate": 2.872725737034701e-06, + "loss": 0.6197, + "step": 9853 + }, + { + "epoch": 2.729639889196676, + "grad_norm": 0.824176549911499, + "learning_rate": 2.872365519615067e-06, + "loss": 0.6023, + "step": 9854 + }, + { + "epoch": 2.729916897506925, + "grad_norm": 0.767004132270813, + "learning_rate": 2.8720052942891624e-06, + "loss": 0.6114, + "step": 9855 + }, + { + "epoch": 2.7301939058171745, + "grad_norm": 0.738639235496521, + "learning_rate": 2.871645061064635e-06, + "loss": 0.5904, + "step": 9856 + }, + { + "epoch": 2.730470914127424, + "grad_norm": 0.7889578342437744, + "learning_rate": 2.8712848199491343e-06, + "loss": 0.5975, + "step": 9857 + }, + { + "epoch": 2.730747922437673, + "grad_norm": 0.7595368027687073, + "learning_rate": 2.870924570950308e-06, + "loss": 0.5689, + "step": 9858 + }, + { + "epoch": 2.7310249307479224, + "grad_norm": 0.7748566269874573, + "learning_rate": 2.8705643140758065e-06, + "loss": 0.6055, + "step": 9859 + }, + { + "epoch": 2.731301939058172, + "grad_norm": 0.7329906821250916, + "learning_rate": 2.8702040493332776e-06, + "loss": 0.5957, + "step": 9860 + }, + { + "epoch": 2.731578947368421, + "grad_norm": 0.7348831295967102, + "learning_rate": 2.8698437767303724e-06, + "loss": 0.6031, + "step": 9861 + }, + { + "epoch": 2.7318559556786703, + "grad_norm": 0.7103914618492126, + "learning_rate": 2.8694834962747386e-06, + "loss": 0.5777, + "step": 9862 + }, + { + "epoch": 2.73213296398892, + "grad_norm": 0.8002786040306091, + "learning_rate": 2.869123207974027e-06, + "loss": 0.597, + "step": 9863 + }, + { + "epoch": 2.732409972299169, + "grad_norm": 0.7433820366859436, + "learning_rate": 2.868762911835886e-06, + "loss": 0.5898, + "step": 9864 + }, + { + "epoch": 2.7326869806094183, + "grad_norm": 0.7274856567382812, + "learning_rate": 2.8684026078679684e-06, + "loss": 0.5686, + "step": 9865 + }, + { + "epoch": 2.7329639889196677, + "grad_norm": 0.7254496216773987, + "learning_rate": 2.8680422960779215e-06, + "loss": 0.5935, + "step": 9866 + }, + { + "epoch": 2.7332409972299168, + "grad_norm": 0.7380785346031189, + "learning_rate": 2.8676819764733965e-06, + "loss": 0.5996, + "step": 9867 + }, + { + "epoch": 2.733518005540166, + "grad_norm": 0.7902967929840088, + "learning_rate": 2.8673216490620453e-06, + "loss": 0.6329, + "step": 9868 + }, + { + "epoch": 2.7337950138504157, + "grad_norm": 0.7636850476264954, + "learning_rate": 2.8669613138515163e-06, + "loss": 0.5976, + "step": 9869 + }, + { + "epoch": 2.7340720221606647, + "grad_norm": 0.6949307918548584, + "learning_rate": 2.8666009708494624e-06, + "loss": 0.5478, + "step": 9870 + }, + { + "epoch": 2.734349030470914, + "grad_norm": 0.7565574049949646, + "learning_rate": 2.8662406200635336e-06, + "loss": 0.5925, + "step": 9871 + }, + { + "epoch": 2.7346260387811636, + "grad_norm": 0.7426764369010925, + "learning_rate": 2.865880261501381e-06, + "loss": 0.5982, + "step": 9872 + }, + { + "epoch": 2.7349030470914126, + "grad_norm": 0.7588077783584595, + "learning_rate": 2.8655198951706553e-06, + "loss": 0.5879, + "step": 9873 + }, + { + "epoch": 2.735180055401662, + "grad_norm": 0.8109205365180969, + "learning_rate": 2.8651595210790097e-06, + "loss": 0.6319, + "step": 9874 + }, + { + "epoch": 2.7354570637119116, + "grad_norm": 0.7463670372962952, + "learning_rate": 2.8647991392340945e-06, + "loss": 0.5441, + "step": 9875 + }, + { + "epoch": 2.7357340720221606, + "grad_norm": 0.7290921807289124, + "learning_rate": 2.864438749643563e-06, + "loss": 0.557, + "step": 9876 + }, + { + "epoch": 2.73601108033241, + "grad_norm": 0.7342871427536011, + "learning_rate": 2.8640783523150646e-06, + "loss": 0.5994, + "step": 9877 + }, + { + "epoch": 2.7362880886426595, + "grad_norm": 0.7484715580940247, + "learning_rate": 2.8637179472562543e-06, + "loss": 0.6144, + "step": 9878 + }, + { + "epoch": 2.7365650969529085, + "grad_norm": 0.7376574873924255, + "learning_rate": 2.863357534474782e-06, + "loss": 0.6049, + "step": 9879 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7331152558326721, + "learning_rate": 2.862997113978302e-06, + "loss": 0.5784, + "step": 9880 + }, + { + "epoch": 2.7371191135734074, + "grad_norm": 0.7068518996238708, + "learning_rate": 2.8626366857744663e-06, + "loss": 0.6158, + "step": 9881 + }, + { + "epoch": 2.7373961218836564, + "grad_norm": 0.7011471390724182, + "learning_rate": 2.862276249870927e-06, + "loss": 0.5773, + "step": 9882 + }, + { + "epoch": 2.737673130193906, + "grad_norm": 0.7379846572875977, + "learning_rate": 2.8619158062753387e-06, + "loss": 0.5378, + "step": 9883 + }, + { + "epoch": 2.7379501385041554, + "grad_norm": 0.7986716032028198, + "learning_rate": 2.8615553549953534e-06, + "loss": 0.5816, + "step": 9884 + }, + { + "epoch": 2.7382271468144044, + "grad_norm": 0.7555115222930908, + "learning_rate": 2.861194896038625e-06, + "loss": 0.5616, + "step": 9885 + }, + { + "epoch": 2.738504155124654, + "grad_norm": 0.7393010854721069, + "learning_rate": 2.8608344294128054e-06, + "loss": 0.6302, + "step": 9886 + }, + { + "epoch": 2.7387811634349033, + "grad_norm": 0.7402096390724182, + "learning_rate": 2.8604739551255502e-06, + "loss": 0.5949, + "step": 9887 + }, + { + "epoch": 2.7390581717451523, + "grad_norm": 0.7098001837730408, + "learning_rate": 2.8601134731845126e-06, + "loss": 0.5708, + "step": 9888 + }, + { + "epoch": 2.7393351800554018, + "grad_norm": 0.7866057753562927, + "learning_rate": 2.8597529835973458e-06, + "loss": 0.5667, + "step": 9889 + }, + { + "epoch": 2.7396121883656512, + "grad_norm": 0.7264719605445862, + "learning_rate": 2.859392486371705e-06, + "loss": 0.5916, + "step": 9890 + }, + { + "epoch": 2.7398891966759003, + "grad_norm": 0.7185991406440735, + "learning_rate": 2.8590319815152433e-06, + "loss": 0.5991, + "step": 9891 + }, + { + "epoch": 2.7401662049861497, + "grad_norm": 0.7286472320556641, + "learning_rate": 2.8586714690356156e-06, + "loss": 0.6273, + "step": 9892 + }, + { + "epoch": 2.7404432132963987, + "grad_norm": 0.7181364297866821, + "learning_rate": 2.858310948940477e-06, + "loss": 0.6077, + "step": 9893 + }, + { + "epoch": 2.740720221606648, + "grad_norm": 0.739396870136261, + "learning_rate": 2.8579504212374827e-06, + "loss": 0.5982, + "step": 9894 + }, + { + "epoch": 2.7409972299168976, + "grad_norm": 0.7353379130363464, + "learning_rate": 2.8575898859342854e-06, + "loss": 0.5989, + "step": 9895 + }, + { + "epoch": 2.7412742382271467, + "grad_norm": 0.7130840420722961, + "learning_rate": 2.8572293430385423e-06, + "loss": 0.5807, + "step": 9896 + }, + { + "epoch": 2.741551246537396, + "grad_norm": 0.746138334274292, + "learning_rate": 2.8568687925579084e-06, + "loss": 0.6147, + "step": 9897 + }, + { + "epoch": 2.7418282548476456, + "grad_norm": 0.773597240447998, + "learning_rate": 2.8565082345000384e-06, + "loss": 0.6073, + "step": 9898 + }, + { + "epoch": 2.7421052631578946, + "grad_norm": 0.7569901943206787, + "learning_rate": 2.8561476688725877e-06, + "loss": 0.6017, + "step": 9899 + }, + { + "epoch": 2.742382271468144, + "grad_norm": 0.7603424787521362, + "learning_rate": 2.8557870956832135e-06, + "loss": 0.6067, + "step": 9900 + }, + { + "epoch": 2.742659279778393, + "grad_norm": 0.7068322896957397, + "learning_rate": 2.85542651493957e-06, + "loss": 0.6157, + "step": 9901 + }, + { + "epoch": 2.7429362880886425, + "grad_norm": 0.7092251777648926, + "learning_rate": 2.855065926649314e-06, + "loss": 0.5645, + "step": 9902 + }, + { + "epoch": 2.743213296398892, + "grad_norm": 0.7123865485191345, + "learning_rate": 2.8547053308201016e-06, + "loss": 0.5271, + "step": 9903 + }, + { + "epoch": 2.743490304709141, + "grad_norm": 0.7555539608001709, + "learning_rate": 2.854344727459589e-06, + "loss": 0.6047, + "step": 9904 + }, + { + "epoch": 2.7437673130193905, + "grad_norm": 0.7480126619338989, + "learning_rate": 2.853984116575433e-06, + "loss": 0.5918, + "step": 9905 + }, + { + "epoch": 2.74404432132964, + "grad_norm": 0.7401037216186523, + "learning_rate": 2.853623498175291e-06, + "loss": 0.5512, + "step": 9906 + }, + { + "epoch": 2.744321329639889, + "grad_norm": 0.7168396711349487, + "learning_rate": 2.853262872266818e-06, + "loss": 0.5981, + "step": 9907 + }, + { + "epoch": 2.7445983379501384, + "grad_norm": 0.7359672784805298, + "learning_rate": 2.8529022388576722e-06, + "loss": 0.6046, + "step": 9908 + }, + { + "epoch": 2.744875346260388, + "grad_norm": 0.7549652457237244, + "learning_rate": 2.8525415979555114e-06, + "loss": 0.56, + "step": 9909 + }, + { + "epoch": 2.745152354570637, + "grad_norm": 0.7408303022384644, + "learning_rate": 2.852180949567992e-06, + "loss": 0.6197, + "step": 9910 + }, + { + "epoch": 2.7454293628808863, + "grad_norm": 0.7500876784324646, + "learning_rate": 2.8518202937027716e-06, + "loss": 0.5886, + "step": 9911 + }, + { + "epoch": 2.745706371191136, + "grad_norm": 0.7531888484954834, + "learning_rate": 2.8514596303675073e-06, + "loss": 0.6038, + "step": 9912 + }, + { + "epoch": 2.745983379501385, + "grad_norm": 0.7636232376098633, + "learning_rate": 2.8510989595698586e-06, + "loss": 0.593, + "step": 9913 + }, + { + "epoch": 2.7462603878116343, + "grad_norm": 0.8050282001495361, + "learning_rate": 2.850738281317482e-06, + "loss": 0.614, + "step": 9914 + }, + { + "epoch": 2.7465373961218837, + "grad_norm": 0.7462037205696106, + "learning_rate": 2.850377595618035e-06, + "loss": 0.5723, + "step": 9915 + }, + { + "epoch": 2.7468144044321328, + "grad_norm": 0.7207900285720825, + "learning_rate": 2.850016902479178e-06, + "loss": 0.6055, + "step": 9916 + }, + { + "epoch": 2.747091412742382, + "grad_norm": 0.7226089835166931, + "learning_rate": 2.8496562019085685e-06, + "loss": 0.6262, + "step": 9917 + }, + { + "epoch": 2.7473684210526317, + "grad_norm": 0.7438740730285645, + "learning_rate": 2.849295493913864e-06, + "loss": 0.6241, + "step": 9918 + }, + { + "epoch": 2.7476454293628807, + "grad_norm": 0.7763872146606445, + "learning_rate": 2.8489347785027248e-06, + "loss": 0.6049, + "step": 9919 + }, + { + "epoch": 2.74792243767313, + "grad_norm": 0.7234786152839661, + "learning_rate": 2.8485740556828082e-06, + "loss": 0.582, + "step": 9920 + }, + { + "epoch": 2.7481994459833796, + "grad_norm": 0.7248966097831726, + "learning_rate": 2.848213325461775e-06, + "loss": 0.5763, + "step": 9921 + }, + { + "epoch": 2.7484764542936286, + "grad_norm": 0.7010312080383301, + "learning_rate": 2.847852587847284e-06, + "loss": 0.5621, + "step": 9922 + }, + { + "epoch": 2.748753462603878, + "grad_norm": 0.6976631879806519, + "learning_rate": 2.8474918428469937e-06, + "loss": 0.6227, + "step": 9923 + }, + { + "epoch": 2.7490304709141276, + "grad_norm": 0.7105882167816162, + "learning_rate": 2.8471310904685644e-06, + "loss": 0.597, + "step": 9924 + }, + { + "epoch": 2.7493074792243766, + "grad_norm": 0.7569697499275208, + "learning_rate": 2.846770330719656e-06, + "loss": 0.6418, + "step": 9925 + }, + { + "epoch": 2.749584487534626, + "grad_norm": 0.716084361076355, + "learning_rate": 2.846409563607927e-06, + "loss": 0.6109, + "step": 9926 + }, + { + "epoch": 2.7498614958448755, + "grad_norm": 0.7867611646652222, + "learning_rate": 2.846048789141039e-06, + "loss": 0.5647, + "step": 9927 + }, + { + "epoch": 2.7501385041551245, + "grad_norm": 0.7521023154258728, + "learning_rate": 2.8456880073266514e-06, + "loss": 0.5844, + "step": 9928 + }, + { + "epoch": 2.750415512465374, + "grad_norm": 0.7604575157165527, + "learning_rate": 2.845327218172424e-06, + "loss": 0.6146, + "step": 9929 + }, + { + "epoch": 2.7506925207756234, + "grad_norm": 0.7507315278053284, + "learning_rate": 2.8449664216860183e-06, + "loss": 0.6499, + "step": 9930 + }, + { + "epoch": 2.7509695290858724, + "grad_norm": 0.7635160088539124, + "learning_rate": 2.8446056178750945e-06, + "loss": 0.6021, + "step": 9931 + }, + { + "epoch": 2.751246537396122, + "grad_norm": 0.7149273157119751, + "learning_rate": 2.8442448067473142e-06, + "loss": 0.5803, + "step": 9932 + }, + { + "epoch": 2.7515235457063714, + "grad_norm": 0.7017562389373779, + "learning_rate": 2.8438839883103363e-06, + "loss": 0.5707, + "step": 9933 + }, + { + "epoch": 2.7518005540166204, + "grad_norm": 0.760026216506958, + "learning_rate": 2.8435231625718242e-06, + "loss": 0.5867, + "step": 9934 + }, + { + "epoch": 2.75207756232687, + "grad_norm": 0.7673696875572205, + "learning_rate": 2.8431623295394374e-06, + "loss": 0.58, + "step": 9935 + }, + { + "epoch": 2.7523545706371193, + "grad_norm": 0.8185964822769165, + "learning_rate": 2.842801489220838e-06, + "loss": 0.5845, + "step": 9936 + }, + { + "epoch": 2.7526315789473683, + "grad_norm": 0.718859851360321, + "learning_rate": 2.8424406416236878e-06, + "loss": 0.5931, + "step": 9937 + }, + { + "epoch": 2.752908587257618, + "grad_norm": 0.743505597114563, + "learning_rate": 2.8420797867556483e-06, + "loss": 0.5825, + "step": 9938 + }, + { + "epoch": 2.7531855955678672, + "grad_norm": 0.7636753916740417, + "learning_rate": 2.8417189246243813e-06, + "loss": 0.6018, + "step": 9939 + }, + { + "epoch": 2.7534626038781163, + "grad_norm": 0.7697381973266602, + "learning_rate": 2.8413580552375487e-06, + "loss": 0.6298, + "step": 9940 + }, + { + "epoch": 2.7537396121883657, + "grad_norm": 0.7020732760429382, + "learning_rate": 2.8409971786028127e-06, + "loss": 0.5888, + "step": 9941 + }, + { + "epoch": 2.754016620498615, + "grad_norm": 0.7665590643882751, + "learning_rate": 2.8406362947278365e-06, + "loss": 0.6058, + "step": 9942 + }, + { + "epoch": 2.754293628808864, + "grad_norm": 0.7251317501068115, + "learning_rate": 2.8402754036202813e-06, + "loss": 0.5865, + "step": 9943 + }, + { + "epoch": 2.7545706371191137, + "grad_norm": 0.736558735370636, + "learning_rate": 2.839914505287811e-06, + "loss": 0.5718, + "step": 9944 + }, + { + "epoch": 2.754847645429363, + "grad_norm": 0.7454231381416321, + "learning_rate": 2.839553599738087e-06, + "loss": 0.5996, + "step": 9945 + }, + { + "epoch": 2.755124653739612, + "grad_norm": 0.7132341861724854, + "learning_rate": 2.8391926869787726e-06, + "loss": 0.5469, + "step": 9946 + }, + { + "epoch": 2.7554016620498616, + "grad_norm": 0.7635960578918457, + "learning_rate": 2.838831767017532e-06, + "loss": 0.6274, + "step": 9947 + }, + { + "epoch": 2.755678670360111, + "grad_norm": 0.7447330355644226, + "learning_rate": 2.8384708398620273e-06, + "loss": 0.6008, + "step": 9948 + }, + { + "epoch": 2.75595567867036, + "grad_norm": 0.7645748257637024, + "learning_rate": 2.838109905519922e-06, + "loss": 0.5936, + "step": 9949 + }, + { + "epoch": 2.7562326869806095, + "grad_norm": 0.758327305316925, + "learning_rate": 2.8377489639988808e-06, + "loss": 0.5703, + "step": 9950 + }, + { + "epoch": 2.756509695290859, + "grad_norm": 0.7296598553657532, + "learning_rate": 2.837388015306566e-06, + "loss": 0.6457, + "step": 9951 + }, + { + "epoch": 2.756786703601108, + "grad_norm": 0.7532568573951721, + "learning_rate": 2.8370270594506432e-06, + "loss": 0.5712, + "step": 9952 + }, + { + "epoch": 2.7570637119113575, + "grad_norm": 0.7405272722244263, + "learning_rate": 2.836666096438774e-06, + "loss": 0.5734, + "step": 9953 + }, + { + "epoch": 2.757340720221607, + "grad_norm": 0.7674778699874878, + "learning_rate": 2.8363051262786247e-06, + "loss": 0.5953, + "step": 9954 + }, + { + "epoch": 2.757617728531856, + "grad_norm": 0.754295825958252, + "learning_rate": 2.835944148977858e-06, + "loss": 0.5753, + "step": 9955 + }, + { + "epoch": 2.7578947368421054, + "grad_norm": 0.8103058338165283, + "learning_rate": 2.835583164544139e-06, + "loss": 0.5539, + "step": 9956 + }, + { + "epoch": 2.7581717451523544, + "grad_norm": 0.7378125786781311, + "learning_rate": 2.835222172985133e-06, + "loss": 0.5761, + "step": 9957 + }, + { + "epoch": 2.758448753462604, + "grad_norm": 0.7553445100784302, + "learning_rate": 2.8348611743085044e-06, + "loss": 0.6038, + "step": 9958 + }, + { + "epoch": 2.7587257617728533, + "grad_norm": 0.7586629986763, + "learning_rate": 2.8345001685219166e-06, + "loss": 0.5723, + "step": 9959 + }, + { + "epoch": 2.7590027700831024, + "grad_norm": 0.7592271566390991, + "learning_rate": 2.8341391556330372e-06, + "loss": 0.5991, + "step": 9960 + }, + { + "epoch": 2.759279778393352, + "grad_norm": 0.7449199557304382, + "learning_rate": 2.8337781356495297e-06, + "loss": 0.6285, + "step": 9961 + }, + { + "epoch": 2.7595567867036013, + "grad_norm": 0.8248133659362793, + "learning_rate": 2.8334171085790606e-06, + "loss": 0.5723, + "step": 9962 + }, + { + "epoch": 2.7598337950138503, + "grad_norm": 0.754346489906311, + "learning_rate": 2.833056074429294e-06, + "loss": 0.6417, + "step": 9963 + }, + { + "epoch": 2.7601108033240997, + "grad_norm": 0.7287446856498718, + "learning_rate": 2.8326950332078967e-06, + "loss": 0.6107, + "step": 9964 + }, + { + "epoch": 2.7603878116343488, + "grad_norm": 0.7717081308364868, + "learning_rate": 2.8323339849225345e-06, + "loss": 0.6411, + "step": 9965 + }, + { + "epoch": 2.7606648199445982, + "grad_norm": 0.7579557299613953, + "learning_rate": 2.831972929580873e-06, + "loss": 0.6529, + "step": 9966 + }, + { + "epoch": 2.7609418282548477, + "grad_norm": 0.7521576285362244, + "learning_rate": 2.831611867190579e-06, + "loss": 0.5969, + "step": 9967 + }, + { + "epoch": 2.7612188365650967, + "grad_norm": 0.7382582426071167, + "learning_rate": 2.831250797759317e-06, + "loss": 0.5763, + "step": 9968 + }, + { + "epoch": 2.761495844875346, + "grad_norm": 0.7770794630050659, + "learning_rate": 2.8308897212947558e-06, + "loss": 0.6108, + "step": 9969 + }, + { + "epoch": 2.7617728531855956, + "grad_norm": 0.7501707673072815, + "learning_rate": 2.83052863780456e-06, + "loss": 0.6017, + "step": 9970 + }, + { + "epoch": 2.7620498614958446, + "grad_norm": 0.7477511763572693, + "learning_rate": 2.8301675472963973e-06, + "loss": 0.5902, + "step": 9971 + }, + { + "epoch": 2.762326869806094, + "grad_norm": 0.7257628440856934, + "learning_rate": 2.829806449777934e-06, + "loss": 0.6117, + "step": 9972 + }, + { + "epoch": 2.7626038781163436, + "grad_norm": 0.7329751253128052, + "learning_rate": 2.8294453452568395e-06, + "loss": 0.6202, + "step": 9973 + }, + { + "epoch": 2.7628808864265926, + "grad_norm": 0.7268105149269104, + "learning_rate": 2.829084233740777e-06, + "loss": 0.5372, + "step": 9974 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.7518357038497925, + "learning_rate": 2.8287231152374164e-06, + "loss": 0.592, + "step": 9975 + }, + { + "epoch": 2.7634349030470915, + "grad_norm": 0.7339359521865845, + "learning_rate": 2.8283619897544247e-06, + "loss": 0.6167, + "step": 9976 + }, + { + "epoch": 2.7637119113573405, + "grad_norm": 0.7902821898460388, + "learning_rate": 2.828000857299469e-06, + "loss": 0.5682, + "step": 9977 + }, + { + "epoch": 2.76398891966759, + "grad_norm": 0.7443130016326904, + "learning_rate": 2.827639717880218e-06, + "loss": 0.6265, + "step": 9978 + }, + { + "epoch": 2.7642659279778394, + "grad_norm": 0.7677735686302185, + "learning_rate": 2.827278571504339e-06, + "loss": 0.5631, + "step": 9979 + }, + { + "epoch": 2.7645429362880884, + "grad_norm": 0.742179811000824, + "learning_rate": 2.8269174181795e-06, + "loss": 0.5674, + "step": 9980 + }, + { + "epoch": 2.764819944598338, + "grad_norm": 0.744736909866333, + "learning_rate": 2.8265562579133694e-06, + "loss": 0.6166, + "step": 9981 + }, + { + "epoch": 2.7650969529085874, + "grad_norm": 0.7325068712234497, + "learning_rate": 2.8261950907136154e-06, + "loss": 0.5825, + "step": 9982 + }, + { + "epoch": 2.7653739612188364, + "grad_norm": 0.7419037818908691, + "learning_rate": 2.825833916587907e-06, + "loss": 0.567, + "step": 9983 + }, + { + "epoch": 2.765650969529086, + "grad_norm": 0.7596971988677979, + "learning_rate": 2.8254727355439122e-06, + "loss": 0.6059, + "step": 9984 + }, + { + "epoch": 2.7659279778393353, + "grad_norm": 0.7159470319747925, + "learning_rate": 2.8251115475893e-06, + "loss": 0.5671, + "step": 9985 + }, + { + "epoch": 2.7662049861495843, + "grad_norm": 0.7521191239356995, + "learning_rate": 2.82475035273174e-06, + "loss": 0.5968, + "step": 9986 + }, + { + "epoch": 2.766481994459834, + "grad_norm": 0.7235638499259949, + "learning_rate": 2.8243891509788997e-06, + "loss": 0.6249, + "step": 9987 + }, + { + "epoch": 2.7667590027700832, + "grad_norm": 0.7232115268707275, + "learning_rate": 2.8240279423384497e-06, + "loss": 0.52, + "step": 9988 + }, + { + "epoch": 2.7670360110803323, + "grad_norm": 0.7460992336273193, + "learning_rate": 2.823666726818059e-06, + "loss": 0.6231, + "step": 9989 + }, + { + "epoch": 2.7673130193905817, + "grad_norm": 0.7331829071044922, + "learning_rate": 2.8233055044253975e-06, + "loss": 0.5745, + "step": 9990 + }, + { + "epoch": 2.767590027700831, + "grad_norm": 0.7375158667564392, + "learning_rate": 2.8229442751681334e-06, + "loss": 0.5944, + "step": 9991 + }, + { + "epoch": 2.76786703601108, + "grad_norm": 0.7081493139266968, + "learning_rate": 2.822583039053939e-06, + "loss": 0.6253, + "step": 9992 + }, + { + "epoch": 2.7681440443213297, + "grad_norm": 0.7295372486114502, + "learning_rate": 2.8222217960904823e-06, + "loss": 0.5776, + "step": 9993 + }, + { + "epoch": 2.768421052631579, + "grad_norm": 0.7875064015388489, + "learning_rate": 2.821860546285434e-06, + "loss": 0.5834, + "step": 9994 + }, + { + "epoch": 2.768698060941828, + "grad_norm": 0.7445015907287598, + "learning_rate": 2.8214992896464643e-06, + "loss": 0.6145, + "step": 9995 + }, + { + "epoch": 2.7689750692520776, + "grad_norm": 0.7500941157341003, + "learning_rate": 2.821138026181244e-06, + "loss": 0.5726, + "step": 9996 + }, + { + "epoch": 2.769252077562327, + "grad_norm": 0.7612580060958862, + "learning_rate": 2.820776755897443e-06, + "loss": 0.6014, + "step": 9997 + }, + { + "epoch": 2.769529085872576, + "grad_norm": 0.7436586022377014, + "learning_rate": 2.820415478802733e-06, + "loss": 0.5925, + "step": 9998 + }, + { + "epoch": 2.7698060941828255, + "grad_norm": 0.7628955245018005, + "learning_rate": 2.820054194904783e-06, + "loss": 0.6223, + "step": 9999 + }, + { + "epoch": 2.770083102493075, + "grad_norm": 0.7731196880340576, + "learning_rate": 2.8196929042112652e-06, + "loss": 0.5961, + "step": 10000 + }, + { + "epoch": 2.770360110803324, + "grad_norm": 0.6865490078926086, + "learning_rate": 2.8193316067298514e-06, + "loss": 0.6111, + "step": 10001 + }, + { + "epoch": 2.7706371191135735, + "grad_norm": 0.7498258352279663, + "learning_rate": 2.818970302468212e-06, + "loss": 0.6077, + "step": 10002 + }, + { + "epoch": 2.770914127423823, + "grad_norm": 0.7517375349998474, + "learning_rate": 2.818608991434018e-06, + "loss": 0.5781, + "step": 10003 + }, + { + "epoch": 2.771191135734072, + "grad_norm": 0.7093813419342041, + "learning_rate": 2.8182476736349414e-06, + "loss": 0.5823, + "step": 10004 + }, + { + "epoch": 2.7714681440443214, + "grad_norm": 0.7525933384895325, + "learning_rate": 2.817886349078655e-06, + "loss": 0.5924, + "step": 10005 + }, + { + "epoch": 2.771745152354571, + "grad_norm": 0.7898561954498291, + "learning_rate": 2.8175250177728288e-06, + "loss": 0.5583, + "step": 10006 + }, + { + "epoch": 2.77202216066482, + "grad_norm": 0.698114275932312, + "learning_rate": 2.817163679725135e-06, + "loss": 0.5231, + "step": 10007 + }, + { + "epoch": 2.7722991689750693, + "grad_norm": 0.7242182493209839, + "learning_rate": 2.816802334943248e-06, + "loss": 0.5472, + "step": 10008 + }, + { + "epoch": 2.772576177285319, + "grad_norm": 0.7450344562530518, + "learning_rate": 2.8164409834348366e-06, + "loss": 0.6198, + "step": 10009 + }, + { + "epoch": 2.772853185595568, + "grad_norm": 0.7549633383750916, + "learning_rate": 2.816079625207576e-06, + "loss": 0.5971, + "step": 10010 + }, + { + "epoch": 2.7731301939058173, + "grad_norm": 0.7310160398483276, + "learning_rate": 2.815718260269138e-06, + "loss": 0.5808, + "step": 10011 + }, + { + "epoch": 2.7734072022160667, + "grad_norm": 0.6927647590637207, + "learning_rate": 2.815356888627195e-06, + "loss": 0.5657, + "step": 10012 + }, + { + "epoch": 2.7736842105263158, + "grad_norm": 0.7121282815933228, + "learning_rate": 2.814995510289419e-06, + "loss": 0.5943, + "step": 10013 + }, + { + "epoch": 2.773961218836565, + "grad_norm": 0.7226163148880005, + "learning_rate": 2.8146341252634846e-06, + "loss": 0.6026, + "step": 10014 + }, + { + "epoch": 2.7742382271468147, + "grad_norm": 0.746878981590271, + "learning_rate": 2.8142727335570647e-06, + "loss": 0.5921, + "step": 10015 + }, + { + "epoch": 2.7745152354570637, + "grad_norm": 0.7342948913574219, + "learning_rate": 2.813911335177831e-06, + "loss": 0.5626, + "step": 10016 + }, + { + "epoch": 2.774792243767313, + "grad_norm": 0.7332433462142944, + "learning_rate": 2.813549930133459e-06, + "loss": 0.6213, + "step": 10017 + }, + { + "epoch": 2.775069252077562, + "grad_norm": 0.7966634035110474, + "learning_rate": 2.8131885184316214e-06, + "loss": 0.6069, + "step": 10018 + }, + { + "epoch": 2.7753462603878116, + "grad_norm": 0.7166701555252075, + "learning_rate": 2.8128271000799904e-06, + "loss": 0.5755, + "step": 10019 + }, + { + "epoch": 2.775623268698061, + "grad_norm": 0.7254489660263062, + "learning_rate": 2.8124656750862422e-06, + "loss": 0.5584, + "step": 10020 + }, + { + "epoch": 2.77590027700831, + "grad_norm": 0.73366779088974, + "learning_rate": 2.8121042434580504e-06, + "loss": 0.6032, + "step": 10021 + }, + { + "epoch": 2.7761772853185596, + "grad_norm": 0.8047178983688354, + "learning_rate": 2.811742805203087e-06, + "loss": 0.6413, + "step": 10022 + }, + { + "epoch": 2.776454293628809, + "grad_norm": 0.7692306041717529, + "learning_rate": 2.8113813603290287e-06, + "loss": 0.5748, + "step": 10023 + }, + { + "epoch": 2.776731301939058, + "grad_norm": 0.7090165019035339, + "learning_rate": 2.8110199088435487e-06, + "loss": 0.5303, + "step": 10024 + }, + { + "epoch": 2.7770083102493075, + "grad_norm": 0.7563058137893677, + "learning_rate": 2.8106584507543217e-06, + "loss": 0.5874, + "step": 10025 + }, + { + "epoch": 2.7772853185595565, + "grad_norm": 0.7437519431114197, + "learning_rate": 2.810296986069022e-06, + "loss": 0.5511, + "step": 10026 + }, + { + "epoch": 2.777562326869806, + "grad_norm": 0.750501811504364, + "learning_rate": 2.809935514795326e-06, + "loss": 0.5746, + "step": 10027 + }, + { + "epoch": 2.7778393351800554, + "grad_norm": 0.7103888392448425, + "learning_rate": 2.8095740369409062e-06, + "loss": 0.5923, + "step": 10028 + }, + { + "epoch": 2.7781163434903045, + "grad_norm": 0.7357938289642334, + "learning_rate": 2.80921255251344e-06, + "loss": 0.6062, + "step": 10029 + }, + { + "epoch": 2.778393351800554, + "grad_norm": 0.7615418434143066, + "learning_rate": 2.8088510615206014e-06, + "loss": 0.5656, + "step": 10030 + }, + { + "epoch": 2.7786703601108034, + "grad_norm": 0.7450594902038574, + "learning_rate": 2.8084895639700664e-06, + "loss": 0.595, + "step": 10031 + }, + { + "epoch": 2.7789473684210524, + "grad_norm": 0.7462698817253113, + "learning_rate": 2.8081280598695095e-06, + "loss": 0.5784, + "step": 10032 + }, + { + "epoch": 2.779224376731302, + "grad_norm": 0.7743009328842163, + "learning_rate": 2.8077665492266077e-06, + "loss": 0.6279, + "step": 10033 + }, + { + "epoch": 2.7795013850415513, + "grad_norm": 0.6979047060012817, + "learning_rate": 2.807405032049035e-06, + "loss": 0.5675, + "step": 10034 + }, + { + "epoch": 2.7797783933518003, + "grad_norm": 0.7872177958488464, + "learning_rate": 2.807043508344469e-06, + "loss": 0.5971, + "step": 10035 + }, + { + "epoch": 2.78005540166205, + "grad_norm": 0.7440598607063293, + "learning_rate": 2.8066819781205855e-06, + "loss": 0.5577, + "step": 10036 + }, + { + "epoch": 2.7803324099722992, + "grad_norm": 0.7891249060630798, + "learning_rate": 2.806320441385061e-06, + "loss": 0.6543, + "step": 10037 + }, + { + "epoch": 2.7806094182825483, + "grad_norm": 0.7209693789482117, + "learning_rate": 2.8059588981455703e-06, + "loss": 0.567, + "step": 10038 + }, + { + "epoch": 2.7808864265927977, + "grad_norm": 0.7257687449455261, + "learning_rate": 2.8055973484097907e-06, + "loss": 0.5811, + "step": 10039 + }, + { + "epoch": 2.781163434903047, + "grad_norm": 0.730828046798706, + "learning_rate": 2.8052357921854e-06, + "loss": 0.5483, + "step": 10040 + }, + { + "epoch": 2.781440443213296, + "grad_norm": 0.7517199516296387, + "learning_rate": 2.8048742294800733e-06, + "loss": 0.6337, + "step": 10041 + }, + { + "epoch": 2.7817174515235457, + "grad_norm": 0.704637348651886, + "learning_rate": 2.8045126603014884e-06, + "loss": 0.587, + "step": 10042 + }, + { + "epoch": 2.781994459833795, + "grad_norm": 0.7503658533096313, + "learning_rate": 2.8041510846573215e-06, + "loss": 0.6262, + "step": 10043 + }, + { + "epoch": 2.782271468144044, + "grad_norm": 0.7343692183494568, + "learning_rate": 2.8037895025552513e-06, + "loss": 0.5447, + "step": 10044 + }, + { + "epoch": 2.7825484764542936, + "grad_norm": 0.7240074872970581, + "learning_rate": 2.803427914002953e-06, + "loss": 0.6007, + "step": 10045 + }, + { + "epoch": 2.782825484764543, + "grad_norm": 0.7470293045043945, + "learning_rate": 2.8030663190081063e-06, + "loss": 0.5813, + "step": 10046 + }, + { + "epoch": 2.783102493074792, + "grad_norm": 0.7610907554626465, + "learning_rate": 2.802704717578387e-06, + "loss": 0.5667, + "step": 10047 + }, + { + "epoch": 2.7833795013850415, + "grad_norm": 0.7097828388214111, + "learning_rate": 2.8023431097214737e-06, + "loss": 0.5882, + "step": 10048 + }, + { + "epoch": 2.783656509695291, + "grad_norm": 0.7703149914741516, + "learning_rate": 2.8019814954450446e-06, + "loss": 0.6014, + "step": 10049 + }, + { + "epoch": 2.78393351800554, + "grad_norm": 0.7168884873390198, + "learning_rate": 2.8016198747567764e-06, + "loss": 0.589, + "step": 10050 + }, + { + "epoch": 2.7842105263157895, + "grad_norm": 0.7583420276641846, + "learning_rate": 2.8012582476643487e-06, + "loss": 0.599, + "step": 10051 + }, + { + "epoch": 2.784487534626039, + "grad_norm": 0.7327255010604858, + "learning_rate": 2.8008966141754383e-06, + "loss": 0.6251, + "step": 10052 + }, + { + "epoch": 2.784764542936288, + "grad_norm": 0.7265594005584717, + "learning_rate": 2.800534974297725e-06, + "loss": 0.5959, + "step": 10053 + }, + { + "epoch": 2.7850415512465374, + "grad_norm": 0.7101613879203796, + "learning_rate": 2.800173328038886e-06, + "loss": 0.594, + "step": 10054 + }, + { + "epoch": 2.785318559556787, + "grad_norm": 0.7457520365715027, + "learning_rate": 2.799811675406601e-06, + "loss": 0.5728, + "step": 10055 + }, + { + "epoch": 2.785595567867036, + "grad_norm": 0.7473955750465393, + "learning_rate": 2.7994500164085494e-06, + "loss": 0.5967, + "step": 10056 + }, + { + "epoch": 2.7858725761772853, + "grad_norm": 0.7834071516990662, + "learning_rate": 2.7990883510524087e-06, + "loss": 0.6262, + "step": 10057 + }, + { + "epoch": 2.786149584487535, + "grad_norm": 0.7517677545547485, + "learning_rate": 2.798726679345858e-06, + "loss": 0.5976, + "step": 10058 + }, + { + "epoch": 2.786426592797784, + "grad_norm": 0.717934787273407, + "learning_rate": 2.7983650012965773e-06, + "loss": 0.5945, + "step": 10059 + }, + { + "epoch": 2.7867036011080333, + "grad_norm": 0.7563323974609375, + "learning_rate": 2.7980033169122455e-06, + "loss": 0.5763, + "step": 10060 + }, + { + "epoch": 2.7869806094182827, + "grad_norm": 0.7239896059036255, + "learning_rate": 2.797641626200543e-06, + "loss": 0.5837, + "step": 10061 + }, + { + "epoch": 2.7872576177285318, + "grad_norm": 0.7858687043190002, + "learning_rate": 2.797279929169148e-06, + "loss": 0.6311, + "step": 10062 + }, + { + "epoch": 2.787534626038781, + "grad_norm": 0.7397139668464661, + "learning_rate": 2.796918225825741e-06, + "loss": 0.5855, + "step": 10063 + }, + { + "epoch": 2.7878116343490307, + "grad_norm": 0.7133927941322327, + "learning_rate": 2.7965565161780017e-06, + "loss": 0.6059, + "step": 10064 + }, + { + "epoch": 2.7880886426592797, + "grad_norm": 0.7488803267478943, + "learning_rate": 2.796194800233611e-06, + "loss": 0.5584, + "step": 10065 + }, + { + "epoch": 2.788365650969529, + "grad_norm": 0.7250664234161377, + "learning_rate": 2.7958330780002472e-06, + "loss": 0.5363, + "step": 10066 + }, + { + "epoch": 2.7886426592797786, + "grad_norm": 0.7320556640625, + "learning_rate": 2.7954713494855924e-06, + "loss": 0.5771, + "step": 10067 + }, + { + "epoch": 2.7889196675900276, + "grad_norm": 0.7644149661064148, + "learning_rate": 2.7951096146973263e-06, + "loss": 0.5977, + "step": 10068 + }, + { + "epoch": 2.789196675900277, + "grad_norm": 0.6871392726898193, + "learning_rate": 2.7947478736431293e-06, + "loss": 0.5614, + "step": 10069 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.731040358543396, + "learning_rate": 2.7943861263306822e-06, + "loss": 0.5968, + "step": 10070 + }, + { + "epoch": 2.7897506925207756, + "grad_norm": 0.7525355815887451, + "learning_rate": 2.7940243727676653e-06, + "loss": 0.5914, + "step": 10071 + }, + { + "epoch": 2.790027700831025, + "grad_norm": 0.7682839632034302, + "learning_rate": 2.793662612961762e-06, + "loss": 0.6167, + "step": 10072 + }, + { + "epoch": 2.7903047091412745, + "grad_norm": 0.750568151473999, + "learning_rate": 2.793300846920649e-06, + "loss": 0.5705, + "step": 10073 + }, + { + "epoch": 2.7905817174515235, + "grad_norm": 0.771504282951355, + "learning_rate": 2.7929390746520117e-06, + "loss": 0.5881, + "step": 10074 + }, + { + "epoch": 2.790858725761773, + "grad_norm": 0.753092885017395, + "learning_rate": 2.7925772961635294e-06, + "loss": 0.6019, + "step": 10075 + }, + { + "epoch": 2.7911357340720224, + "grad_norm": 0.7207103371620178, + "learning_rate": 2.7922155114628834e-06, + "loss": 0.5513, + "step": 10076 + }, + { + "epoch": 2.7914127423822714, + "grad_norm": 0.80680251121521, + "learning_rate": 2.791853720557756e-06, + "loss": 0.6031, + "step": 10077 + }, + { + "epoch": 2.791689750692521, + "grad_norm": 0.7253208160400391, + "learning_rate": 2.79149192345583e-06, + "loss": 0.5972, + "step": 10078 + }, + { + "epoch": 2.7919667590027704, + "grad_norm": 0.738048791885376, + "learning_rate": 2.7911301201647846e-06, + "loss": 0.5545, + "step": 10079 + }, + { + "epoch": 2.7922437673130194, + "grad_norm": 0.798290491104126, + "learning_rate": 2.7907683106923034e-06, + "loss": 0.5873, + "step": 10080 + }, + { + "epoch": 2.792520775623269, + "grad_norm": 0.753075122833252, + "learning_rate": 2.7904064950460696e-06, + "loss": 0.5438, + "step": 10081 + }, + { + "epoch": 2.792797783933518, + "grad_norm": 0.7435222268104553, + "learning_rate": 2.7900446732337634e-06, + "loss": 0.5823, + "step": 10082 + }, + { + "epoch": 2.7930747922437673, + "grad_norm": 0.7120912671089172, + "learning_rate": 2.789682845263068e-06, + "loss": 0.5491, + "step": 10083 + }, + { + "epoch": 2.7933518005540168, + "grad_norm": 0.7340899109840393, + "learning_rate": 2.7893210111416665e-06, + "loss": 0.6199, + "step": 10084 + }, + { + "epoch": 2.793628808864266, + "grad_norm": 0.720433235168457, + "learning_rate": 2.7889591708772405e-06, + "loss": 0.5926, + "step": 10085 + }, + { + "epoch": 2.7939058171745152, + "grad_norm": 0.7632058262825012, + "learning_rate": 2.788597324477474e-06, + "loss": 0.5928, + "step": 10086 + }, + { + "epoch": 2.7941828254847643, + "grad_norm": 0.7659016847610474, + "learning_rate": 2.7882354719500485e-06, + "loss": 0.6098, + "step": 10087 + }, + { + "epoch": 2.7944598337950137, + "grad_norm": 0.7464661002159119, + "learning_rate": 2.787873613302649e-06, + "loss": 0.615, + "step": 10088 + }, + { + "epoch": 2.794736842105263, + "grad_norm": 0.7274541854858398, + "learning_rate": 2.787511748542957e-06, + "loss": 0.5968, + "step": 10089 + }, + { + "epoch": 2.795013850415512, + "grad_norm": 0.7382057905197144, + "learning_rate": 2.787149877678656e-06, + "loss": 0.607, + "step": 10090 + }, + { + "epoch": 2.7952908587257617, + "grad_norm": 0.7254559397697449, + "learning_rate": 2.7867880007174295e-06, + "loss": 0.5698, + "step": 10091 + }, + { + "epoch": 2.795567867036011, + "grad_norm": 0.7147872447967529, + "learning_rate": 2.7864261176669625e-06, + "loss": 0.5256, + "step": 10092 + }, + { + "epoch": 2.79584487534626, + "grad_norm": 0.7616999745368958, + "learning_rate": 2.7860642285349366e-06, + "loss": 0.6358, + "step": 10093 + }, + { + "epoch": 2.7961218836565096, + "grad_norm": 0.7379427552223206, + "learning_rate": 2.7857023333290377e-06, + "loss": 0.6253, + "step": 10094 + }, + { + "epoch": 2.796398891966759, + "grad_norm": 0.74202960729599, + "learning_rate": 2.7853404320569477e-06, + "loss": 0.5957, + "step": 10095 + }, + { + "epoch": 2.796675900277008, + "grad_norm": 0.7566871643066406, + "learning_rate": 2.7849785247263515e-06, + "loss": 0.5535, + "step": 10096 + }, + { + "epoch": 2.7969529085872575, + "grad_norm": 0.7378758788108826, + "learning_rate": 2.7846166113449345e-06, + "loss": 0.5844, + "step": 10097 + }, + { + "epoch": 2.797229916897507, + "grad_norm": 0.696129322052002, + "learning_rate": 2.7842546919203795e-06, + "loss": 0.5659, + "step": 10098 + }, + { + "epoch": 2.797506925207756, + "grad_norm": 0.7553945779800415, + "learning_rate": 2.783892766460371e-06, + "loss": 0.6302, + "step": 10099 + }, + { + "epoch": 2.7977839335180055, + "grad_norm": 0.7305325865745544, + "learning_rate": 2.783530834972594e-06, + "loss": 0.5851, + "step": 10100 + }, + { + "epoch": 2.798060941828255, + "grad_norm": 0.754202127456665, + "learning_rate": 2.783168897464734e-06, + "loss": 0.6049, + "step": 10101 + }, + { + "epoch": 2.798337950138504, + "grad_norm": 0.7164472341537476, + "learning_rate": 2.782806953944475e-06, + "loss": 0.591, + "step": 10102 + }, + { + "epoch": 2.7986149584487534, + "grad_norm": 0.7178779244422913, + "learning_rate": 2.782445004419503e-06, + "loss": 0.5708, + "step": 10103 + }, + { + "epoch": 2.798891966759003, + "grad_norm": 0.727104663848877, + "learning_rate": 2.7820830488975004e-06, + "loss": 0.5752, + "step": 10104 + }, + { + "epoch": 2.799168975069252, + "grad_norm": 0.7736073136329651, + "learning_rate": 2.7817210873861555e-06, + "loss": 0.5803, + "step": 10105 + }, + { + "epoch": 2.7994459833795013, + "grad_norm": 0.7395794987678528, + "learning_rate": 2.7813591198931524e-06, + "loss": 0.5339, + "step": 10106 + }, + { + "epoch": 2.799722991689751, + "grad_norm": 0.7755144834518433, + "learning_rate": 2.7809971464261763e-06, + "loss": 0.6031, + "step": 10107 + }, + { + "epoch": 2.8, + "grad_norm": 0.7793182134628296, + "learning_rate": 2.780635166992913e-06, + "loss": 0.5834, + "step": 10108 + }, + { + "epoch": 2.8002770083102493, + "grad_norm": 0.7401881814002991, + "learning_rate": 2.7802731816010494e-06, + "loss": 0.5644, + "step": 10109 + }, + { + "epoch": 2.8005540166204987, + "grad_norm": 0.7597428560256958, + "learning_rate": 2.7799111902582697e-06, + "loss": 0.6007, + "step": 10110 + }, + { + "epoch": 2.8008310249307478, + "grad_norm": 0.7522040605545044, + "learning_rate": 2.7795491929722612e-06, + "loss": 0.5958, + "step": 10111 + }, + { + "epoch": 2.801108033240997, + "grad_norm": 0.7653687596321106, + "learning_rate": 2.779187189750709e-06, + "loss": 0.6162, + "step": 10112 + }, + { + "epoch": 2.8013850415512467, + "grad_norm": 0.7406167387962341, + "learning_rate": 2.7788251806013e-06, + "loss": 0.5886, + "step": 10113 + }, + { + "epoch": 2.8016620498614957, + "grad_norm": 0.7772331833839417, + "learning_rate": 2.77846316553172e-06, + "loss": 0.5792, + "step": 10114 + }, + { + "epoch": 2.801939058171745, + "grad_norm": 0.7543328404426575, + "learning_rate": 2.7781011445496562e-06, + "loss": 0.6009, + "step": 10115 + }, + { + "epoch": 2.8022160664819946, + "grad_norm": 0.7309496998786926, + "learning_rate": 2.777739117662795e-06, + "loss": 0.5857, + "step": 10116 + }, + { + "epoch": 2.8024930747922436, + "grad_norm": 0.7182099223136902, + "learning_rate": 2.7773770848788227e-06, + "loss": 0.6056, + "step": 10117 + }, + { + "epoch": 2.802770083102493, + "grad_norm": 0.8034688830375671, + "learning_rate": 2.7770150462054262e-06, + "loss": 0.5207, + "step": 10118 + }, + { + "epoch": 2.8030470914127426, + "grad_norm": 0.7384365200996399, + "learning_rate": 2.7766530016502936e-06, + "loss": 0.5801, + "step": 10119 + }, + { + "epoch": 2.8033240997229916, + "grad_norm": 0.7842321991920471, + "learning_rate": 2.776290951221111e-06, + "loss": 0.6051, + "step": 10120 + }, + { + "epoch": 2.803601108033241, + "grad_norm": 0.7223963737487793, + "learning_rate": 2.775928894925566e-06, + "loss": 0.6145, + "step": 10121 + }, + { + "epoch": 2.8038781163434905, + "grad_norm": 0.7191910147666931, + "learning_rate": 2.775566832771346e-06, + "loss": 0.5643, + "step": 10122 + }, + { + "epoch": 2.8041551246537395, + "grad_norm": 0.6896896958351135, + "learning_rate": 2.7752047647661384e-06, + "loss": 0.5521, + "step": 10123 + }, + { + "epoch": 2.804432132963989, + "grad_norm": 0.7741313576698303, + "learning_rate": 2.7748426909176308e-06, + "loss": 0.5841, + "step": 10124 + }, + { + "epoch": 2.8047091412742384, + "grad_norm": 0.7306196689605713, + "learning_rate": 2.7744806112335104e-06, + "loss": 0.5889, + "step": 10125 + }, + { + "epoch": 2.8049861495844874, + "grad_norm": 0.7337019443511963, + "learning_rate": 2.774118525721467e-06, + "loss": 0.559, + "step": 10126 + }, + { + "epoch": 2.805263157894737, + "grad_norm": 0.7309577465057373, + "learning_rate": 2.773756434389186e-06, + "loss": 0.5988, + "step": 10127 + }, + { + "epoch": 2.8055401662049864, + "grad_norm": 0.7765200734138489, + "learning_rate": 2.7733943372443572e-06, + "loss": 0.6217, + "step": 10128 + }, + { + "epoch": 2.8058171745152354, + "grad_norm": 0.7576752305030823, + "learning_rate": 2.773032234294669e-06, + "loss": 0.5931, + "step": 10129 + }, + { + "epoch": 2.806094182825485, + "grad_norm": 0.7484496235847473, + "learning_rate": 2.772670125547809e-06, + "loss": 0.5675, + "step": 10130 + }, + { + "epoch": 2.8063711911357343, + "grad_norm": 0.7473483085632324, + "learning_rate": 2.7723080110114655e-06, + "loss": 0.6268, + "step": 10131 + }, + { + "epoch": 2.8066481994459833, + "grad_norm": 0.7621675729751587, + "learning_rate": 2.7719458906933277e-06, + "loss": 0.5431, + "step": 10132 + }, + { + "epoch": 2.8069252077562328, + "grad_norm": 0.7543573379516602, + "learning_rate": 2.771583764601084e-06, + "loss": 0.5843, + "step": 10133 + }, + { + "epoch": 2.8072022160664822, + "grad_norm": 0.7247941493988037, + "learning_rate": 2.771221632742423e-06, + "loss": 0.6032, + "step": 10134 + }, + { + "epoch": 2.8074792243767313, + "grad_norm": 0.7635888457298279, + "learning_rate": 2.7708594951250358e-06, + "loss": 0.5738, + "step": 10135 + }, + { + "epoch": 2.8077562326869807, + "grad_norm": 0.7076152563095093, + "learning_rate": 2.770497351756608e-06, + "loss": 0.5828, + "step": 10136 + }, + { + "epoch": 2.80803324099723, + "grad_norm": 0.7250659465789795, + "learning_rate": 2.7701352026448313e-06, + "loss": 0.6096, + "step": 10137 + }, + { + "epoch": 2.808310249307479, + "grad_norm": 0.7596719264984131, + "learning_rate": 2.769773047797395e-06, + "loss": 0.6277, + "step": 10138 + }, + { + "epoch": 2.8085872576177286, + "grad_norm": 0.7516382336616516, + "learning_rate": 2.7694108872219873e-06, + "loss": 0.5905, + "step": 10139 + }, + { + "epoch": 2.808864265927978, + "grad_norm": 0.7437866926193237, + "learning_rate": 2.7690487209262985e-06, + "loss": 0.5777, + "step": 10140 + }, + { + "epoch": 2.809141274238227, + "grad_norm": 0.7432383298873901, + "learning_rate": 2.7686865489180183e-06, + "loss": 0.5942, + "step": 10141 + }, + { + "epoch": 2.8094182825484766, + "grad_norm": 0.7514076828956604, + "learning_rate": 2.768324371204837e-06, + "loss": 0.6108, + "step": 10142 + }, + { + "epoch": 2.8096952908587256, + "grad_norm": 0.7362116575241089, + "learning_rate": 2.7679621877944434e-06, + "loss": 0.5703, + "step": 10143 + }, + { + "epoch": 2.809972299168975, + "grad_norm": 0.7133225202560425, + "learning_rate": 2.7675999986945282e-06, + "loss": 0.5773, + "step": 10144 + }, + { + "epoch": 2.8102493074792245, + "grad_norm": 0.7191652059555054, + "learning_rate": 2.767237803912783e-06, + "loss": 0.595, + "step": 10145 + }, + { + "epoch": 2.8105263157894735, + "grad_norm": 0.7200401425361633, + "learning_rate": 2.766875603456895e-06, + "loss": 0.5711, + "step": 10146 + }, + { + "epoch": 2.810803324099723, + "grad_norm": 0.7562271356582642, + "learning_rate": 2.766513397334557e-06, + "loss": 0.6237, + "step": 10147 + }, + { + "epoch": 2.8110803324099725, + "grad_norm": 0.735927402973175, + "learning_rate": 2.7661511855534595e-06, + "loss": 0.5886, + "step": 10148 + }, + { + "epoch": 2.8113573407202215, + "grad_norm": 0.7801263928413391, + "learning_rate": 2.765788968121292e-06, + "loss": 0.5874, + "step": 10149 + }, + { + "epoch": 2.811634349030471, + "grad_norm": 0.6956904530525208, + "learning_rate": 2.7654267450457463e-06, + "loss": 0.5136, + "step": 10150 + }, + { + "epoch": 2.81191135734072, + "grad_norm": 0.7640595436096191, + "learning_rate": 2.765064516334513e-06, + "loss": 0.6038, + "step": 10151 + }, + { + "epoch": 2.8121883656509694, + "grad_norm": 0.7327202558517456, + "learning_rate": 2.7647022819952833e-06, + "loss": 0.5634, + "step": 10152 + }, + { + "epoch": 2.812465373961219, + "grad_norm": 0.7656522989273071, + "learning_rate": 2.7643400420357475e-06, + "loss": 0.6169, + "step": 10153 + }, + { + "epoch": 2.812742382271468, + "grad_norm": 0.7299674153327942, + "learning_rate": 2.763977796463599e-06, + "loss": 0.6227, + "step": 10154 + }, + { + "epoch": 2.8130193905817173, + "grad_norm": 0.7286268472671509, + "learning_rate": 2.763615545286526e-06, + "loss": 0.6531, + "step": 10155 + }, + { + "epoch": 2.813296398891967, + "grad_norm": 0.7793086171150208, + "learning_rate": 2.7632532885122227e-06, + "loss": 0.5584, + "step": 10156 + }, + { + "epoch": 2.813573407202216, + "grad_norm": 0.7282178997993469, + "learning_rate": 2.76289102614838e-06, + "loss": 0.6217, + "step": 10157 + }, + { + "epoch": 2.8138504155124653, + "grad_norm": 0.7371731400489807, + "learning_rate": 2.7625287582026888e-06, + "loss": 0.6266, + "step": 10158 + }, + { + "epoch": 2.8141274238227147, + "grad_norm": 0.7733290195465088, + "learning_rate": 2.7621664846828415e-06, + "loss": 0.6445, + "step": 10159 + }, + { + "epoch": 2.8144044321329638, + "grad_norm": 0.7712571024894714, + "learning_rate": 2.761804205596531e-06, + "loss": 0.6249, + "step": 10160 + }, + { + "epoch": 2.8146814404432132, + "grad_norm": 0.7427979111671448, + "learning_rate": 2.761441920951448e-06, + "loss": 0.6307, + "step": 10161 + }, + { + "epoch": 2.8149584487534627, + "grad_norm": 0.7213217616081238, + "learning_rate": 2.7610796307552854e-06, + "loss": 0.5625, + "step": 10162 + }, + { + "epoch": 2.8152354570637117, + "grad_norm": 0.7439704537391663, + "learning_rate": 2.7607173350157357e-06, + "loss": 0.5922, + "step": 10163 + }, + { + "epoch": 2.815512465373961, + "grad_norm": 0.7900652289390564, + "learning_rate": 2.7603550337404916e-06, + "loss": 0.6083, + "step": 10164 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.7682667970657349, + "learning_rate": 2.7599927269372453e-06, + "loss": 0.5536, + "step": 10165 + }, + { + "epoch": 2.8160664819944596, + "grad_norm": 0.7357151508331299, + "learning_rate": 2.7596304146136883e-06, + "loss": 0.6209, + "step": 10166 + }, + { + "epoch": 2.816343490304709, + "grad_norm": 0.7456122636795044, + "learning_rate": 2.759268096777516e-06, + "loss": 0.5957, + "step": 10167 + }, + { + "epoch": 2.8166204986149586, + "grad_norm": 0.7296216487884521, + "learning_rate": 2.758905773436419e-06, + "loss": 0.6071, + "step": 10168 + }, + { + "epoch": 2.8168975069252076, + "grad_norm": 0.7860580682754517, + "learning_rate": 2.758543444598091e-06, + "loss": 0.6382, + "step": 10169 + }, + { + "epoch": 2.817174515235457, + "grad_norm": 0.7770671844482422, + "learning_rate": 2.758181110270227e-06, + "loss": 0.6017, + "step": 10170 + }, + { + "epoch": 2.8174515235457065, + "grad_norm": 0.6933687329292297, + "learning_rate": 2.7578187704605176e-06, + "loss": 0.5789, + "step": 10171 + }, + { + "epoch": 2.8177285318559555, + "grad_norm": 0.722878098487854, + "learning_rate": 2.757456425176657e-06, + "loss": 0.6016, + "step": 10172 + }, + { + "epoch": 2.818005540166205, + "grad_norm": 0.7481485605239868, + "learning_rate": 2.757094074426339e-06, + "loss": 0.6036, + "step": 10173 + }, + { + "epoch": 2.8182825484764544, + "grad_norm": 0.7612590193748474, + "learning_rate": 2.7567317182172583e-06, + "loss": 0.5457, + "step": 10174 + }, + { + "epoch": 2.8185595567867034, + "grad_norm": 0.7551214694976807, + "learning_rate": 2.756369356557106e-06, + "loss": 0.6061, + "step": 10175 + }, + { + "epoch": 2.818836565096953, + "grad_norm": 0.7312377691268921, + "learning_rate": 2.7560069894535783e-06, + "loss": 0.5956, + "step": 10176 + }, + { + "epoch": 2.8191135734072024, + "grad_norm": 0.7991149425506592, + "learning_rate": 2.755644616914369e-06, + "loss": 0.5719, + "step": 10177 + }, + { + "epoch": 2.8193905817174514, + "grad_norm": 0.741660475730896, + "learning_rate": 2.7552822389471716e-06, + "loss": 0.5659, + "step": 10178 + }, + { + "epoch": 2.819667590027701, + "grad_norm": 0.7375673055648804, + "learning_rate": 2.7549198555596794e-06, + "loss": 0.5978, + "step": 10179 + }, + { + "epoch": 2.8199445983379503, + "grad_norm": 0.7576779723167419, + "learning_rate": 2.7545574667595894e-06, + "loss": 0.5936, + "step": 10180 + }, + { + "epoch": 2.8202216066481993, + "grad_norm": 0.7339960932731628, + "learning_rate": 2.7541950725545924e-06, + "loss": 0.6356, + "step": 10181 + }, + { + "epoch": 2.820498614958449, + "grad_norm": 0.722320556640625, + "learning_rate": 2.7538326729523858e-06, + "loss": 0.57, + "step": 10182 + }, + { + "epoch": 2.8207756232686982, + "grad_norm": 0.7453545928001404, + "learning_rate": 2.7534702679606635e-06, + "loss": 0.6029, + "step": 10183 + }, + { + "epoch": 2.8210526315789473, + "grad_norm": 0.7736693620681763, + "learning_rate": 2.7531078575871196e-06, + "loss": 0.5681, + "step": 10184 + }, + { + "epoch": 2.8213296398891967, + "grad_norm": 0.734969437122345, + "learning_rate": 2.7527454418394494e-06, + "loss": 0.5816, + "step": 10185 + }, + { + "epoch": 2.821606648199446, + "grad_norm": 0.7231078743934631, + "learning_rate": 2.7523830207253494e-06, + "loss": 0.5659, + "step": 10186 + }, + { + "epoch": 2.821883656509695, + "grad_norm": 0.7357196807861328, + "learning_rate": 2.752020594252511e-06, + "loss": 0.6238, + "step": 10187 + }, + { + "epoch": 2.8221606648199447, + "grad_norm": 0.7271388173103333, + "learning_rate": 2.7516581624286337e-06, + "loss": 0.5628, + "step": 10188 + }, + { + "epoch": 2.822437673130194, + "grad_norm": 0.7925208806991577, + "learning_rate": 2.7512957252614108e-06, + "loss": 0.5855, + "step": 10189 + }, + { + "epoch": 2.822714681440443, + "grad_norm": 0.7478787899017334, + "learning_rate": 2.7509332827585367e-06, + "loss": 0.5793, + "step": 10190 + }, + { + "epoch": 2.8229916897506926, + "grad_norm": 0.7537767887115479, + "learning_rate": 2.7505708349277093e-06, + "loss": 0.6236, + "step": 10191 + }, + { + "epoch": 2.823268698060942, + "grad_norm": 0.775567889213562, + "learning_rate": 2.7502083817766227e-06, + "loss": 0.6143, + "step": 10192 + }, + { + "epoch": 2.823545706371191, + "grad_norm": 0.744401752948761, + "learning_rate": 2.7498459233129738e-06, + "loss": 0.6268, + "step": 10193 + }, + { + "epoch": 2.8238227146814405, + "grad_norm": 0.762998640537262, + "learning_rate": 2.749483459544457e-06, + "loss": 0.5823, + "step": 10194 + }, + { + "epoch": 2.82409972299169, + "grad_norm": 0.7275715470314026, + "learning_rate": 2.7491209904787695e-06, + "loss": 0.57, + "step": 10195 + }, + { + "epoch": 2.824376731301939, + "grad_norm": 0.6960054039955139, + "learning_rate": 2.7487585161236076e-06, + "loss": 0.5769, + "step": 10196 + }, + { + "epoch": 2.8246537396121885, + "grad_norm": 0.7539833188056946, + "learning_rate": 2.748396036486667e-06, + "loss": 0.5861, + "step": 10197 + }, + { + "epoch": 2.824930747922438, + "grad_norm": 0.7374334931373596, + "learning_rate": 2.748033551575644e-06, + "loss": 0.584, + "step": 10198 + }, + { + "epoch": 2.825207756232687, + "grad_norm": 0.7197996973991394, + "learning_rate": 2.747671061398236e-06, + "loss": 0.6054, + "step": 10199 + }, + { + "epoch": 2.8254847645429364, + "grad_norm": 0.7191854119300842, + "learning_rate": 2.747308565962138e-06, + "loss": 0.5984, + "step": 10200 + }, + { + "epoch": 2.825761772853186, + "grad_norm": 0.7698421478271484, + "learning_rate": 2.7469460652750477e-06, + "loss": 0.5802, + "step": 10201 + }, + { + "epoch": 2.826038781163435, + "grad_norm": 0.7502759099006653, + "learning_rate": 2.7465835593446622e-06, + "loss": 0.6245, + "step": 10202 + }, + { + "epoch": 2.8263157894736843, + "grad_norm": 0.7236973643302917, + "learning_rate": 2.7462210481786776e-06, + "loss": 0.5862, + "step": 10203 + }, + { + "epoch": 2.8265927977839334, + "grad_norm": 0.676241397857666, + "learning_rate": 2.745858531784791e-06, + "loss": 0.556, + "step": 10204 + }, + { + "epoch": 2.826869806094183, + "grad_norm": 0.7392049431800842, + "learning_rate": 2.7454960101707012e-06, + "loss": 0.5983, + "step": 10205 + }, + { + "epoch": 2.8271468144044323, + "grad_norm": 0.7158267498016357, + "learning_rate": 2.7451334833441035e-06, + "loss": 0.5992, + "step": 10206 + }, + { + "epoch": 2.8274238227146813, + "grad_norm": 0.7335191369056702, + "learning_rate": 2.744770951312696e-06, + "loss": 0.6643, + "step": 10207 + }, + { + "epoch": 2.8277008310249307, + "grad_norm": 0.707643985748291, + "learning_rate": 2.7444084140841766e-06, + "loss": 0.5584, + "step": 10208 + }, + { + "epoch": 2.82797783933518, + "grad_norm": 0.7364636063575745, + "learning_rate": 2.744045871666241e-06, + "loss": 0.5815, + "step": 10209 + }, + { + "epoch": 2.8282548476454292, + "grad_norm": 0.7579964995384216, + "learning_rate": 2.7436833240665892e-06, + "loss": 0.5676, + "step": 10210 + }, + { + "epoch": 2.8285318559556787, + "grad_norm": 0.773270845413208, + "learning_rate": 2.743320771292919e-06, + "loss": 0.6214, + "step": 10211 + }, + { + "epoch": 2.8288088642659277, + "grad_norm": 0.7789325714111328, + "learning_rate": 2.7429582133529266e-06, + "loss": 0.5636, + "step": 10212 + }, + { + "epoch": 2.829085872576177, + "grad_norm": 0.719110369682312, + "learning_rate": 2.74259565025431e-06, + "loss": 0.5672, + "step": 10213 + }, + { + "epoch": 2.8293628808864266, + "grad_norm": 0.7776400446891785, + "learning_rate": 2.7422330820047693e-06, + "loss": 0.5938, + "step": 10214 + }, + { + "epoch": 2.8296398891966756, + "grad_norm": 0.7782211899757385, + "learning_rate": 2.7418705086120018e-06, + "loss": 0.6078, + "step": 10215 + }, + { + "epoch": 2.829916897506925, + "grad_norm": 0.7914127111434937, + "learning_rate": 2.7415079300837056e-06, + "loss": 0.6053, + "step": 10216 + }, + { + "epoch": 2.8301939058171746, + "grad_norm": 0.7330632209777832, + "learning_rate": 2.7411453464275783e-06, + "loss": 0.5704, + "step": 10217 + }, + { + "epoch": 2.8304709141274236, + "grad_norm": 0.7698874473571777, + "learning_rate": 2.74078275765132e-06, + "loss": 0.5881, + "step": 10218 + }, + { + "epoch": 2.830747922437673, + "grad_norm": 0.7458370923995972, + "learning_rate": 2.74042016376263e-06, + "loss": 0.6182, + "step": 10219 + }, + { + "epoch": 2.8310249307479225, + "grad_norm": 0.7117974758148193, + "learning_rate": 2.7400575647692046e-06, + "loss": 0.5814, + "step": 10220 + }, + { + "epoch": 2.8313019390581715, + "grad_norm": 0.7336068153381348, + "learning_rate": 2.739694960678745e-06, + "loss": 0.5504, + "step": 10221 + }, + { + "epoch": 2.831578947368421, + "grad_norm": 0.7284829020500183, + "learning_rate": 2.7393323514989485e-06, + "loss": 0.5956, + "step": 10222 + }, + { + "epoch": 2.8318559556786704, + "grad_norm": 0.7642632722854614, + "learning_rate": 2.738969737237515e-06, + "loss": 0.6281, + "step": 10223 + }, + { + "epoch": 2.8321329639889194, + "grad_norm": 0.754257321357727, + "learning_rate": 2.7386071179021445e-06, + "loss": 0.569, + "step": 10224 + }, + { + "epoch": 2.832409972299169, + "grad_norm": 0.7193533182144165, + "learning_rate": 2.7382444935005353e-06, + "loss": 0.5377, + "step": 10225 + }, + { + "epoch": 2.8326869806094184, + "grad_norm": 0.784979522228241, + "learning_rate": 2.7378818640403865e-06, + "loss": 0.6296, + "step": 10226 + }, + { + "epoch": 2.8329639889196674, + "grad_norm": 0.75473552942276, + "learning_rate": 2.7375192295293984e-06, + "loss": 0.6027, + "step": 10227 + }, + { + "epoch": 2.833240997229917, + "grad_norm": 0.7351518869400024, + "learning_rate": 2.737156589975271e-06, + "loss": 0.5944, + "step": 10228 + }, + { + "epoch": 2.8335180055401663, + "grad_norm": 0.7493429780006409, + "learning_rate": 2.7367939453857034e-06, + "loss": 0.5478, + "step": 10229 + }, + { + "epoch": 2.8337950138504153, + "grad_norm": 0.735837996006012, + "learning_rate": 2.736431295768395e-06, + "loss": 0.5491, + "step": 10230 + }, + { + "epoch": 2.834072022160665, + "grad_norm": 0.7217791676521301, + "learning_rate": 2.736068641131047e-06, + "loss": 0.5784, + "step": 10231 + }, + { + "epoch": 2.8343490304709142, + "grad_norm": 0.730108380317688, + "learning_rate": 2.735705981481359e-06, + "loss": 0.5939, + "step": 10232 + }, + { + "epoch": 2.8346260387811633, + "grad_norm": 0.7806057929992676, + "learning_rate": 2.735343316827031e-06, + "loss": 0.5877, + "step": 10233 + }, + { + "epoch": 2.8349030470914127, + "grad_norm": 0.7238426208496094, + "learning_rate": 2.7349806471757634e-06, + "loss": 0.6003, + "step": 10234 + }, + { + "epoch": 2.835180055401662, + "grad_norm": 0.7456830143928528, + "learning_rate": 2.7346179725352562e-06, + "loss": 0.5845, + "step": 10235 + }, + { + "epoch": 2.835457063711911, + "grad_norm": 0.7526271939277649, + "learning_rate": 2.7342552929132106e-06, + "loss": 0.5942, + "step": 10236 + }, + { + "epoch": 2.8357340720221607, + "grad_norm": 0.7471960186958313, + "learning_rate": 2.7338926083173266e-06, + "loss": 0.6783, + "step": 10237 + }, + { + "epoch": 2.83601108033241, + "grad_norm": 0.7458435297012329, + "learning_rate": 2.733529918755306e-06, + "loss": 0.6002, + "step": 10238 + }, + { + "epoch": 2.836288088642659, + "grad_norm": 0.7718545794487, + "learning_rate": 2.733167224234848e-06, + "loss": 0.5943, + "step": 10239 + }, + { + "epoch": 2.8365650969529086, + "grad_norm": 0.7507277131080627, + "learning_rate": 2.7328045247636555e-06, + "loss": 0.615, + "step": 10240 + }, + { + "epoch": 2.836842105263158, + "grad_norm": 0.7238956689834595, + "learning_rate": 2.732441820349427e-06, + "loss": 0.602, + "step": 10241 + }, + { + "epoch": 2.837119113573407, + "grad_norm": 0.6814334392547607, + "learning_rate": 2.7320791109998655e-06, + "loss": 0.5912, + "step": 10242 + }, + { + "epoch": 2.8373961218836565, + "grad_norm": 0.7484681010246277, + "learning_rate": 2.7317163967226722e-06, + "loss": 0.5704, + "step": 10243 + }, + { + "epoch": 2.837673130193906, + "grad_norm": 0.710078775882721, + "learning_rate": 2.731353677525548e-06, + "loss": 0.5733, + "step": 10244 + }, + { + "epoch": 2.837950138504155, + "grad_norm": 0.7513867616653442, + "learning_rate": 2.730990953416194e-06, + "loss": 0.5329, + "step": 10245 + }, + { + "epoch": 2.8382271468144045, + "grad_norm": 0.7298266887664795, + "learning_rate": 2.730628224402313e-06, + "loss": 0.563, + "step": 10246 + }, + { + "epoch": 2.838504155124654, + "grad_norm": 0.7499540448188782, + "learning_rate": 2.7302654904916055e-06, + "loss": 0.5934, + "step": 10247 + }, + { + "epoch": 2.838781163434903, + "grad_norm": 0.7679840922355652, + "learning_rate": 2.7299027516917725e-06, + "loss": 0.5884, + "step": 10248 + }, + { + "epoch": 2.8390581717451524, + "grad_norm": 0.7211563587188721, + "learning_rate": 2.729540008010518e-06, + "loss": 0.5442, + "step": 10249 + }, + { + "epoch": 2.839335180055402, + "grad_norm": 0.7048432230949402, + "learning_rate": 2.729177259455543e-06, + "loss": 0.5749, + "step": 10250 + }, + { + "epoch": 2.839612188365651, + "grad_norm": 0.7010490298271179, + "learning_rate": 2.7288145060345495e-06, + "loss": 0.5917, + "step": 10251 + }, + { + "epoch": 2.8398891966759003, + "grad_norm": 0.7572813034057617, + "learning_rate": 2.7284517477552393e-06, + "loss": 0.5683, + "step": 10252 + }, + { + "epoch": 2.84016620498615, + "grad_norm": 0.7326565980911255, + "learning_rate": 2.728088984625316e-06, + "loss": 0.6065, + "step": 10253 + }, + { + "epoch": 2.840443213296399, + "grad_norm": 0.7696539163589478, + "learning_rate": 2.72772621665248e-06, + "loss": 0.6366, + "step": 10254 + }, + { + "epoch": 2.8407202216066483, + "grad_norm": 0.711655855178833, + "learning_rate": 2.727363443844435e-06, + "loss": 0.6076, + "step": 10255 + }, + { + "epoch": 2.8409972299168977, + "grad_norm": 0.7610486149787903, + "learning_rate": 2.7270006662088842e-06, + "loss": 0.5638, + "step": 10256 + }, + { + "epoch": 2.8412742382271468, + "grad_norm": 0.7857367992401123, + "learning_rate": 2.7266378837535296e-06, + "loss": 0.601, + "step": 10257 + }, + { + "epoch": 2.841551246537396, + "grad_norm": 0.7206484079360962, + "learning_rate": 2.7262750964860727e-06, + "loss": 0.5781, + "step": 10258 + }, + { + "epoch": 2.8418282548476457, + "grad_norm": 0.7355260848999023, + "learning_rate": 2.7259123044142188e-06, + "loss": 0.5942, + "step": 10259 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.7465602159500122, + "learning_rate": 2.7255495075456693e-06, + "loss": 0.6028, + "step": 10260 + }, + { + "epoch": 2.842382271468144, + "grad_norm": 0.7275111079216003, + "learning_rate": 2.7251867058881277e-06, + "loss": 0.6069, + "step": 10261 + }, + { + "epoch": 2.8426592797783936, + "grad_norm": 0.7320525050163269, + "learning_rate": 2.724823899449298e-06, + "loss": 0.6253, + "step": 10262 + }, + { + "epoch": 2.8429362880886426, + "grad_norm": 0.7653960585594177, + "learning_rate": 2.7244610882368815e-06, + "loss": 0.5776, + "step": 10263 + }, + { + "epoch": 2.843213296398892, + "grad_norm": 0.7047906517982483, + "learning_rate": 2.724098272258584e-06, + "loss": 0.5906, + "step": 10264 + }, + { + "epoch": 2.8434903047091415, + "grad_norm": 0.7334323525428772, + "learning_rate": 2.7237354515221075e-06, + "loss": 0.5913, + "step": 10265 + }, + { + "epoch": 2.8437673130193906, + "grad_norm": 0.7125048041343689, + "learning_rate": 2.723372626035156e-06, + "loss": 0.5951, + "step": 10266 + }, + { + "epoch": 2.84404432132964, + "grad_norm": 0.7758623957633972, + "learning_rate": 2.723009795805433e-06, + "loss": 0.6011, + "step": 10267 + }, + { + "epoch": 2.844321329639889, + "grad_norm": 0.6995490193367004, + "learning_rate": 2.722646960840643e-06, + "loss": 0.5852, + "step": 10268 + }, + { + "epoch": 2.8445983379501385, + "grad_norm": 0.6949390172958374, + "learning_rate": 2.7222841211484896e-06, + "loss": 0.5442, + "step": 10269 + }, + { + "epoch": 2.844875346260388, + "grad_norm": 0.7149237990379333, + "learning_rate": 2.721921276736676e-06, + "loss": 0.6074, + "step": 10270 + }, + { + "epoch": 2.845152354570637, + "grad_norm": 0.6994629502296448, + "learning_rate": 2.721558427612907e-06, + "loss": 0.6086, + "step": 10271 + }, + { + "epoch": 2.8454293628808864, + "grad_norm": 0.7139860987663269, + "learning_rate": 2.7211955737848876e-06, + "loss": 0.5656, + "step": 10272 + }, + { + "epoch": 2.845706371191136, + "grad_norm": 0.7294262647628784, + "learning_rate": 2.7208327152603204e-06, + "loss": 0.6139, + "step": 10273 + }, + { + "epoch": 2.845983379501385, + "grad_norm": 0.7386606335639954, + "learning_rate": 2.7204698520469103e-06, + "loss": 0.5715, + "step": 10274 + }, + { + "epoch": 2.8462603878116344, + "grad_norm": 0.8146941065788269, + "learning_rate": 2.720106984152364e-06, + "loss": 0.6356, + "step": 10275 + }, + { + "epoch": 2.8465373961218834, + "grad_norm": 0.7665870189666748, + "learning_rate": 2.7197441115843824e-06, + "loss": 0.6195, + "step": 10276 + }, + { + "epoch": 2.846814404432133, + "grad_norm": 0.7315971255302429, + "learning_rate": 2.719381234350673e-06, + "loss": 0.5145, + "step": 10277 + }, + { + "epoch": 2.8470914127423823, + "grad_norm": 0.7454190850257874, + "learning_rate": 2.7190183524589405e-06, + "loss": 0.6395, + "step": 10278 + }, + { + "epoch": 2.8473684210526313, + "grad_norm": 0.7426220178604126, + "learning_rate": 2.7186554659168886e-06, + "loss": 0.5366, + "step": 10279 + }, + { + "epoch": 2.847645429362881, + "grad_norm": 0.7703450918197632, + "learning_rate": 2.718292574732222e-06, + "loss": 0.5854, + "step": 10280 + }, + { + "epoch": 2.8479224376731302, + "grad_norm": 0.7275317907333374, + "learning_rate": 2.717929678912647e-06, + "loss": 0.5968, + "step": 10281 + }, + { + "epoch": 2.8481994459833793, + "grad_norm": 0.72344970703125, + "learning_rate": 2.717566778465869e-06, + "loss": 0.6176, + "step": 10282 + }, + { + "epoch": 2.8484764542936287, + "grad_norm": 0.7199895977973938, + "learning_rate": 2.717203873399592e-06, + "loss": 0.5797, + "step": 10283 + }, + { + "epoch": 2.848753462603878, + "grad_norm": 0.725170373916626, + "learning_rate": 2.7168409637215213e-06, + "loss": 0.6077, + "step": 10284 + }, + { + "epoch": 2.849030470914127, + "grad_norm": 0.7106685638427734, + "learning_rate": 2.716478049439365e-06, + "loss": 0.5501, + "step": 10285 + }, + { + "epoch": 2.8493074792243767, + "grad_norm": 0.7496122717857361, + "learning_rate": 2.7161151305608258e-06, + "loss": 0.6609, + "step": 10286 + }, + { + "epoch": 2.849584487534626, + "grad_norm": 0.7406869530677795, + "learning_rate": 2.715752207093611e-06, + "loss": 0.5644, + "step": 10287 + }, + { + "epoch": 2.849861495844875, + "grad_norm": 0.7530012726783752, + "learning_rate": 2.7153892790454256e-06, + "loss": 0.6045, + "step": 10288 + }, + { + "epoch": 2.8501385041551246, + "grad_norm": 0.7549383044242859, + "learning_rate": 2.715026346423975e-06, + "loss": 0.6313, + "step": 10289 + }, + { + "epoch": 2.850415512465374, + "grad_norm": 0.7471486330032349, + "learning_rate": 2.7146634092369674e-06, + "loss": 0.5472, + "step": 10290 + }, + { + "epoch": 2.850692520775623, + "grad_norm": 0.7477738857269287, + "learning_rate": 2.7143004674921063e-06, + "loss": 0.583, + "step": 10291 + }, + { + "epoch": 2.8509695290858725, + "grad_norm": 0.7241801023483276, + "learning_rate": 2.7139375211971e-06, + "loss": 0.5983, + "step": 10292 + }, + { + "epoch": 2.851246537396122, + "grad_norm": 0.7376339435577393, + "learning_rate": 2.7135745703596528e-06, + "loss": 0.5867, + "step": 10293 + }, + { + "epoch": 2.851523545706371, + "grad_norm": 0.7515271306037903, + "learning_rate": 2.7132116149874732e-06, + "loss": 0.6049, + "step": 10294 + }, + { + "epoch": 2.8518005540166205, + "grad_norm": 0.7021607756614685, + "learning_rate": 2.7128486550882647e-06, + "loss": 0.5832, + "step": 10295 + }, + { + "epoch": 2.85207756232687, + "grad_norm": 0.7405130863189697, + "learning_rate": 2.7124856906697374e-06, + "loss": 0.6111, + "step": 10296 + }, + { + "epoch": 2.852354570637119, + "grad_norm": 0.7491231560707092, + "learning_rate": 2.7121227217395957e-06, + "loss": 0.647, + "step": 10297 + }, + { + "epoch": 2.8526315789473684, + "grad_norm": 0.7277181148529053, + "learning_rate": 2.711759748305547e-06, + "loss": 0.5674, + "step": 10298 + }, + { + "epoch": 2.852908587257618, + "grad_norm": 0.6759470701217651, + "learning_rate": 2.7113967703752974e-06, + "loss": 0.5795, + "step": 10299 + }, + { + "epoch": 2.853185595567867, + "grad_norm": 0.7326931953430176, + "learning_rate": 2.711033787956555e-06, + "loss": 0.6122, + "step": 10300 + }, + { + "epoch": 2.8534626038781163, + "grad_norm": 0.7393088340759277, + "learning_rate": 2.710670801057027e-06, + "loss": 0.5768, + "step": 10301 + }, + { + "epoch": 2.853739612188366, + "grad_norm": 0.7498008012771606, + "learning_rate": 2.7103078096844187e-06, + "loss": 0.5535, + "step": 10302 + }, + { + "epoch": 2.854016620498615, + "grad_norm": 0.765217125415802, + "learning_rate": 2.7099448138464395e-06, + "loss": 0.5641, + "step": 10303 + }, + { + "epoch": 2.8542936288088643, + "grad_norm": 0.7652571201324463, + "learning_rate": 2.7095818135507957e-06, + "loss": 0.5632, + "step": 10304 + }, + { + "epoch": 2.8545706371191137, + "grad_norm": 0.7151368856430054, + "learning_rate": 2.7092188088051945e-06, + "loss": 0.5924, + "step": 10305 + }, + { + "epoch": 2.8548476454293628, + "grad_norm": 0.7446417808532715, + "learning_rate": 2.7088557996173433e-06, + "loss": 0.5704, + "step": 10306 + }, + { + "epoch": 2.855124653739612, + "grad_norm": 0.74827641248703, + "learning_rate": 2.708492785994951e-06, + "loss": 0.6094, + "step": 10307 + }, + { + "epoch": 2.8554016620498617, + "grad_norm": 0.7772006392478943, + "learning_rate": 2.7081297679457238e-06, + "loss": 0.6026, + "step": 10308 + }, + { + "epoch": 2.8556786703601107, + "grad_norm": 0.6965993046760559, + "learning_rate": 2.7077667454773705e-06, + "loss": 0.5866, + "step": 10309 + }, + { + "epoch": 2.85595567867036, + "grad_norm": 0.7219796776771545, + "learning_rate": 2.707403718597598e-06, + "loss": 0.5489, + "step": 10310 + }, + { + "epoch": 2.8562326869806096, + "grad_norm": 0.712982714176178, + "learning_rate": 2.707040687314116e-06, + "loss": 0.595, + "step": 10311 + }, + { + "epoch": 2.8565096952908586, + "grad_norm": 0.7534233927726746, + "learning_rate": 2.70667765163463e-06, + "loss": 0.5422, + "step": 10312 + }, + { + "epoch": 2.856786703601108, + "grad_norm": 0.7488660216331482, + "learning_rate": 2.706314611566851e-06, + "loss": 0.5522, + "step": 10313 + }, + { + "epoch": 2.8570637119113576, + "grad_norm": 0.7368558645248413, + "learning_rate": 2.705951567118485e-06, + "loss": 0.6267, + "step": 10314 + }, + { + "epoch": 2.8573407202216066, + "grad_norm": 0.7692279815673828, + "learning_rate": 2.7055885182972417e-06, + "loss": 0.5865, + "step": 10315 + }, + { + "epoch": 2.857617728531856, + "grad_norm": 0.7746422290802002, + "learning_rate": 2.7052254651108295e-06, + "loss": 0.5904, + "step": 10316 + }, + { + "epoch": 2.8578947368421055, + "grad_norm": 0.7562121152877808, + "learning_rate": 2.7048624075669554e-06, + "loss": 0.6338, + "step": 10317 + }, + { + "epoch": 2.8581717451523545, + "grad_norm": 0.7386156320571899, + "learning_rate": 2.7044993456733304e-06, + "loss": 0.5977, + "step": 10318 + }, + { + "epoch": 2.858448753462604, + "grad_norm": 0.7650381326675415, + "learning_rate": 2.704136279437662e-06, + "loss": 0.5834, + "step": 10319 + }, + { + "epoch": 2.8587257617728534, + "grad_norm": 0.7712715864181519, + "learning_rate": 2.7037732088676583e-06, + "loss": 0.6099, + "step": 10320 + }, + { + "epoch": 2.8590027700831024, + "grad_norm": 0.7141051888465881, + "learning_rate": 2.703410133971029e-06, + "loss": 0.5966, + "step": 10321 + }, + { + "epoch": 2.859279778393352, + "grad_norm": 0.7260600328445435, + "learning_rate": 2.703047054755484e-06, + "loss": 0.5698, + "step": 10322 + }, + { + "epoch": 2.8595567867036014, + "grad_norm": 0.7182386517524719, + "learning_rate": 2.702683971228731e-06, + "loss": 0.6064, + "step": 10323 + }, + { + "epoch": 2.8598337950138504, + "grad_norm": 0.7457290291786194, + "learning_rate": 2.70232088339848e-06, + "loss": 0.6439, + "step": 10324 + }, + { + "epoch": 2.8601108033241, + "grad_norm": 0.7517281770706177, + "learning_rate": 2.7019577912724393e-06, + "loss": 0.5871, + "step": 10325 + }, + { + "epoch": 2.8603878116343493, + "grad_norm": 0.7766019701957703, + "learning_rate": 2.7015946948583198e-06, + "loss": 0.5691, + "step": 10326 + }, + { + "epoch": 2.8606648199445983, + "grad_norm": 0.77918541431427, + "learning_rate": 2.701231594163829e-06, + "loss": 0.6303, + "step": 10327 + }, + { + "epoch": 2.8609418282548478, + "grad_norm": 0.7339177131652832, + "learning_rate": 2.7008684891966787e-06, + "loss": 0.5974, + "step": 10328 + }, + { + "epoch": 2.861218836565097, + "grad_norm": 0.7705397009849548, + "learning_rate": 2.700505379964577e-06, + "loss": 0.5466, + "step": 10329 + }, + { + "epoch": 2.8614958448753463, + "grad_norm": 0.7495179772377014, + "learning_rate": 2.7001422664752338e-06, + "loss": 0.5933, + "step": 10330 + }, + { + "epoch": 2.8617728531855957, + "grad_norm": 0.7036161422729492, + "learning_rate": 2.699779148736359e-06, + "loss": 0.5784, + "step": 10331 + }, + { + "epoch": 2.8620498614958447, + "grad_norm": 0.8170588612556458, + "learning_rate": 2.6994160267556633e-06, + "loss": 0.5748, + "step": 10332 + }, + { + "epoch": 2.862326869806094, + "grad_norm": 0.7510451674461365, + "learning_rate": 2.6990529005408563e-06, + "loss": 0.6104, + "step": 10333 + }, + { + "epoch": 2.8626038781163436, + "grad_norm": 0.7337208390235901, + "learning_rate": 2.6986897700996467e-06, + "loss": 0.6053, + "step": 10334 + }, + { + "epoch": 2.8628808864265927, + "grad_norm": 0.7081796526908875, + "learning_rate": 2.698326635439747e-06, + "loss": 0.5888, + "step": 10335 + }, + { + "epoch": 2.863157894736842, + "grad_norm": 0.7299284338951111, + "learning_rate": 2.697963496568866e-06, + "loss": 0.573, + "step": 10336 + }, + { + "epoch": 2.863434903047091, + "grad_norm": 0.757738471031189, + "learning_rate": 2.6976003534947144e-06, + "loss": 0.6199, + "step": 10337 + }, + { + "epoch": 2.8637119113573406, + "grad_norm": 0.7651471495628357, + "learning_rate": 2.697237206225003e-06, + "loss": 0.5917, + "step": 10338 + }, + { + "epoch": 2.86398891966759, + "grad_norm": 0.7392405867576599, + "learning_rate": 2.6968740547674416e-06, + "loss": 0.5752, + "step": 10339 + }, + { + "epoch": 2.864265927977839, + "grad_norm": 0.761151909828186, + "learning_rate": 2.696510899129741e-06, + "loss": 0.6254, + "step": 10340 + }, + { + "epoch": 2.8645429362880885, + "grad_norm": 0.74574214220047, + "learning_rate": 2.696147739319613e-06, + "loss": 0.5659, + "step": 10341 + }, + { + "epoch": 2.864819944598338, + "grad_norm": 0.743224024772644, + "learning_rate": 2.695784575344767e-06, + "loss": 0.6149, + "step": 10342 + }, + { + "epoch": 2.865096952908587, + "grad_norm": 0.7006165981292725, + "learning_rate": 2.6954214072129148e-06, + "loss": 0.5829, + "step": 10343 + }, + { + "epoch": 2.8653739612188365, + "grad_norm": 0.7185438871383667, + "learning_rate": 2.6950582349317665e-06, + "loss": 0.6488, + "step": 10344 + }, + { + "epoch": 2.865650969529086, + "grad_norm": 0.7694297432899475, + "learning_rate": 2.694695058509034e-06, + "loss": 0.6311, + "step": 10345 + }, + { + "epoch": 2.865927977839335, + "grad_norm": 0.7526674270629883, + "learning_rate": 2.6943318779524285e-06, + "loss": 0.5898, + "step": 10346 + }, + { + "epoch": 2.8662049861495844, + "grad_norm": 0.7574858069419861, + "learning_rate": 2.6939686932696608e-06, + "loss": 0.5964, + "step": 10347 + }, + { + "epoch": 2.866481994459834, + "grad_norm": 0.690632164478302, + "learning_rate": 2.6936055044684427e-06, + "loss": 0.5699, + "step": 10348 + }, + { + "epoch": 2.866759002770083, + "grad_norm": 0.7249476909637451, + "learning_rate": 2.6932423115564847e-06, + "loss": 0.6352, + "step": 10349 + }, + { + "epoch": 2.8670360110803323, + "grad_norm": 0.6996097564697266, + "learning_rate": 2.6928791145414996e-06, + "loss": 0.5521, + "step": 10350 + }, + { + "epoch": 2.867313019390582, + "grad_norm": 0.7583314776420593, + "learning_rate": 2.692515913431198e-06, + "loss": 0.6453, + "step": 10351 + }, + { + "epoch": 2.867590027700831, + "grad_norm": 0.7682088613510132, + "learning_rate": 2.692152708233292e-06, + "loss": 0.6159, + "step": 10352 + }, + { + "epoch": 2.8678670360110803, + "grad_norm": 0.7417542934417725, + "learning_rate": 2.6917894989554926e-06, + "loss": 0.5728, + "step": 10353 + }, + { + "epoch": 2.8681440443213297, + "grad_norm": 0.7185600399971008, + "learning_rate": 2.6914262856055135e-06, + "loss": 0.561, + "step": 10354 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.7589703798294067, + "learning_rate": 2.691063068191065e-06, + "loss": 0.5948, + "step": 10355 + }, + { + "epoch": 2.868698060941828, + "grad_norm": 0.7474316358566284, + "learning_rate": 2.69069984671986e-06, + "loss": 0.6212, + "step": 10356 + }, + { + "epoch": 2.8689750692520777, + "grad_norm": 0.7184518575668335, + "learning_rate": 2.690336621199609e-06, + "loss": 0.5574, + "step": 10357 + }, + { + "epoch": 2.8692520775623267, + "grad_norm": 0.7420065999031067, + "learning_rate": 2.6899733916380265e-06, + "loss": 0.5569, + "step": 10358 + }, + { + "epoch": 2.869529085872576, + "grad_norm": 0.754462718963623, + "learning_rate": 2.689610158042824e-06, + "loss": 0.5758, + "step": 10359 + }, + { + "epoch": 2.8698060941828256, + "grad_norm": 0.7530493140220642, + "learning_rate": 2.689246920421713e-06, + "loss": 0.5947, + "step": 10360 + }, + { + "epoch": 2.8700831024930746, + "grad_norm": 0.7274234294891357, + "learning_rate": 2.6888836787824074e-06, + "loss": 0.5789, + "step": 10361 + }, + { + "epoch": 2.870360110803324, + "grad_norm": 0.7387732863426208, + "learning_rate": 2.6885204331326183e-06, + "loss": 0.5774, + "step": 10362 + }, + { + "epoch": 2.8706371191135736, + "grad_norm": 0.6835230588912964, + "learning_rate": 2.688157183480059e-06, + "loss": 0.5154, + "step": 10363 + }, + { + "epoch": 2.8709141274238226, + "grad_norm": 0.7362700700759888, + "learning_rate": 2.687793929832442e-06, + "loss": 0.6004, + "step": 10364 + }, + { + "epoch": 2.871191135734072, + "grad_norm": 0.7476580142974854, + "learning_rate": 2.687430672197481e-06, + "loss": 0.584, + "step": 10365 + }, + { + "epoch": 2.8714681440443215, + "grad_norm": 0.7573707699775696, + "learning_rate": 2.687067410582887e-06, + "loss": 0.6167, + "step": 10366 + }, + { + "epoch": 2.8717451523545705, + "grad_norm": 0.7462565898895264, + "learning_rate": 2.686704144996376e-06, + "loss": 0.5862, + "step": 10367 + }, + { + "epoch": 2.87202216066482, + "grad_norm": 0.743422269821167, + "learning_rate": 2.6863408754456577e-06, + "loss": 0.5973, + "step": 10368 + }, + { + "epoch": 2.8722991689750694, + "grad_norm": 0.7598124742507935, + "learning_rate": 2.685977601938447e-06, + "loss": 0.5604, + "step": 10369 + }, + { + "epoch": 2.8725761772853184, + "grad_norm": 0.767044723033905, + "learning_rate": 2.6856143244824573e-06, + "loss": 0.579, + "step": 10370 + }, + { + "epoch": 2.872853185595568, + "grad_norm": 0.764921247959137, + "learning_rate": 2.685251043085401e-06, + "loss": 0.6125, + "step": 10371 + }, + { + "epoch": 2.8731301939058174, + "grad_norm": 0.7420265674591064, + "learning_rate": 2.684887757754992e-06, + "loss": 0.6123, + "step": 10372 + }, + { + "epoch": 2.8734072022160664, + "grad_norm": 0.7900787591934204, + "learning_rate": 2.684524468498944e-06, + "loss": 0.5781, + "step": 10373 + }, + { + "epoch": 2.873684210526316, + "grad_norm": 0.7371800541877747, + "learning_rate": 2.684161175324971e-06, + "loss": 0.5815, + "step": 10374 + }, + { + "epoch": 2.8739612188365653, + "grad_norm": 0.7698126435279846, + "learning_rate": 2.683797878240784e-06, + "loss": 0.5723, + "step": 10375 + }, + { + "epoch": 2.8742382271468143, + "grad_norm": 0.729429304599762, + "learning_rate": 2.6834345772541005e-06, + "loss": 0.5586, + "step": 10376 + }, + { + "epoch": 2.8745152354570638, + "grad_norm": 0.7678404450416565, + "learning_rate": 2.683071272372632e-06, + "loss": 0.6127, + "step": 10377 + }, + { + "epoch": 2.8747922437673132, + "grad_norm": 0.7486993670463562, + "learning_rate": 2.6827079636040924e-06, + "loss": 0.5885, + "step": 10378 + }, + { + "epoch": 2.8750692520775623, + "grad_norm": 0.7203310132026672, + "learning_rate": 2.682344650956196e-06, + "loss": 0.58, + "step": 10379 + }, + { + "epoch": 2.8753462603878117, + "grad_norm": 0.7622033953666687, + "learning_rate": 2.6819813344366584e-06, + "loss": 0.568, + "step": 10380 + }, + { + "epoch": 2.875623268698061, + "grad_norm": 0.7309375405311584, + "learning_rate": 2.681618014053191e-06, + "loss": 0.5834, + "step": 10381 + }, + { + "epoch": 2.87590027700831, + "grad_norm": 0.7362342476844788, + "learning_rate": 2.68125468981351e-06, + "loss": 0.5626, + "step": 10382 + }, + { + "epoch": 2.8761772853185597, + "grad_norm": 0.7526897192001343, + "learning_rate": 2.680891361725329e-06, + "loss": 0.5786, + "step": 10383 + }, + { + "epoch": 2.876454293628809, + "grad_norm": 0.7369986772537231, + "learning_rate": 2.6805280297963625e-06, + "loss": 0.5598, + "step": 10384 + }, + { + "epoch": 2.876731301939058, + "grad_norm": 0.7252307534217834, + "learning_rate": 2.6801646940343245e-06, + "loss": 0.6002, + "step": 10385 + }, + { + "epoch": 2.8770083102493076, + "grad_norm": 0.7209944128990173, + "learning_rate": 2.6798013544469304e-06, + "loss": 0.5885, + "step": 10386 + }, + { + "epoch": 2.877285318559557, + "grad_norm": 0.7498239874839783, + "learning_rate": 2.679438011041894e-06, + "loss": 0.5409, + "step": 10387 + }, + { + "epoch": 2.877562326869806, + "grad_norm": 0.7315983772277832, + "learning_rate": 2.67907466382693e-06, + "loss": 0.5746, + "step": 10388 + }, + { + "epoch": 2.8778393351800555, + "grad_norm": 0.7259673476219177, + "learning_rate": 2.678711312809754e-06, + "loss": 0.5633, + "step": 10389 + }, + { + "epoch": 2.878116343490305, + "grad_norm": 0.7267625331878662, + "learning_rate": 2.678347957998081e-06, + "loss": 0.5913, + "step": 10390 + }, + { + "epoch": 2.878393351800554, + "grad_norm": 0.7919610738754272, + "learning_rate": 2.6779845993996252e-06, + "loss": 0.6664, + "step": 10391 + }, + { + "epoch": 2.8786703601108035, + "grad_norm": 0.7108157277107239, + "learning_rate": 2.6776212370221015e-06, + "loss": 0.5555, + "step": 10392 + }, + { + "epoch": 2.8789473684210525, + "grad_norm": 0.7125835418701172, + "learning_rate": 2.6772578708732254e-06, + "loss": 0.6006, + "step": 10393 + }, + { + "epoch": 2.879224376731302, + "grad_norm": 0.6881554126739502, + "learning_rate": 2.676894500960712e-06, + "loss": 0.5513, + "step": 10394 + }, + { + "epoch": 2.8795013850415514, + "grad_norm": 0.7503734230995178, + "learning_rate": 2.676531127292277e-06, + "loss": 0.6088, + "step": 10395 + }, + { + "epoch": 2.8797783933518004, + "grad_norm": 0.7577073574066162, + "learning_rate": 2.676167749875635e-06, + "loss": 0.5944, + "step": 10396 + }, + { + "epoch": 2.88005540166205, + "grad_norm": 0.7305821776390076, + "learning_rate": 2.675804368718502e-06, + "loss": 0.5907, + "step": 10397 + }, + { + "epoch": 2.8803324099722993, + "grad_norm": 0.7082803845405579, + "learning_rate": 2.6754409838285926e-06, + "loss": 0.5824, + "step": 10398 + }, + { + "epoch": 2.8806094182825484, + "grad_norm": 0.7570558190345764, + "learning_rate": 2.6750775952136244e-06, + "loss": 0.5769, + "step": 10399 + }, + { + "epoch": 2.880886426592798, + "grad_norm": 0.7124000191688538, + "learning_rate": 2.6747142028813107e-06, + "loss": 0.5276, + "step": 10400 + }, + { + "epoch": 2.881163434903047, + "grad_norm": 0.7314373850822449, + "learning_rate": 2.674350806839368e-06, + "loss": 0.5916, + "step": 10401 + }, + { + "epoch": 2.8814404432132963, + "grad_norm": 0.714793860912323, + "learning_rate": 2.6739874070955134e-06, + "loss": 0.5756, + "step": 10402 + }, + { + "epoch": 2.8817174515235457, + "grad_norm": 0.7305763959884644, + "learning_rate": 2.673624003657461e-06, + "loss": 0.6034, + "step": 10403 + }, + { + "epoch": 2.8819944598337948, + "grad_norm": 0.7191005349159241, + "learning_rate": 2.6732605965329282e-06, + "loss": 0.6368, + "step": 10404 + }, + { + "epoch": 2.8822714681440442, + "grad_norm": 0.7448530793190002, + "learning_rate": 2.6728971857296305e-06, + "loss": 0.5453, + "step": 10405 + }, + { + "epoch": 2.8825484764542937, + "grad_norm": 0.6936946511268616, + "learning_rate": 2.672533771255284e-06, + "loss": 0.5296, + "step": 10406 + }, + { + "epoch": 2.8828254847645427, + "grad_norm": 0.7031975388526917, + "learning_rate": 2.6721703531176045e-06, + "loss": 0.5641, + "step": 10407 + }, + { + "epoch": 2.883102493074792, + "grad_norm": 0.7554457187652588, + "learning_rate": 2.671806931324309e-06, + "loss": 0.5301, + "step": 10408 + }, + { + "epoch": 2.8833795013850416, + "grad_norm": 0.7462570071220398, + "learning_rate": 2.671443505883114e-06, + "loss": 0.5864, + "step": 10409 + }, + { + "epoch": 2.8836565096952906, + "grad_norm": 0.7643255591392517, + "learning_rate": 2.671080076801735e-06, + "loss": 0.6312, + "step": 10410 + }, + { + "epoch": 2.88393351800554, + "grad_norm": 0.7761854529380798, + "learning_rate": 2.670716644087889e-06, + "loss": 0.6093, + "step": 10411 + }, + { + "epoch": 2.8842105263157896, + "grad_norm": 0.7329185605049133, + "learning_rate": 2.6703532077492935e-06, + "loss": 0.5356, + "step": 10412 + }, + { + "epoch": 2.8844875346260386, + "grad_norm": 0.7831817269325256, + "learning_rate": 2.669989767793663e-06, + "loss": 0.5695, + "step": 10413 + }, + { + "epoch": 2.884764542936288, + "grad_norm": 0.7784004211425781, + "learning_rate": 2.6696263242287167e-06, + "loss": 0.6021, + "step": 10414 + }, + { + "epoch": 2.8850415512465375, + "grad_norm": 0.7950989007949829, + "learning_rate": 2.6692628770621704e-06, + "loss": 0.575, + "step": 10415 + }, + { + "epoch": 2.8853185595567865, + "grad_norm": 0.7388087511062622, + "learning_rate": 2.6688994263017405e-06, + "loss": 0.5906, + "step": 10416 + }, + { + "epoch": 2.885595567867036, + "grad_norm": 0.6924887299537659, + "learning_rate": 2.6685359719551445e-06, + "loss": 0.542, + "step": 10417 + }, + { + "epoch": 2.8858725761772854, + "grad_norm": 0.7233446836471558, + "learning_rate": 2.6681725140300995e-06, + "loss": 0.5887, + "step": 10418 + }, + { + "epoch": 2.8861495844875344, + "grad_norm": 0.7647252678871155, + "learning_rate": 2.6678090525343227e-06, + "loss": 0.6204, + "step": 10419 + }, + { + "epoch": 2.886426592797784, + "grad_norm": 0.7015892863273621, + "learning_rate": 2.6674455874755306e-06, + "loss": 0.5807, + "step": 10420 + }, + { + "epoch": 2.8867036011080334, + "grad_norm": 0.7895192503929138, + "learning_rate": 2.6670821188614422e-06, + "loss": 0.5693, + "step": 10421 + }, + { + "epoch": 2.8869806094182824, + "grad_norm": 0.7190335988998413, + "learning_rate": 2.6667186466997724e-06, + "loss": 0.5839, + "step": 10422 + }, + { + "epoch": 2.887257617728532, + "grad_norm": 0.7219985127449036, + "learning_rate": 2.666355170998241e-06, + "loss": 0.5896, + "step": 10423 + }, + { + "epoch": 2.8875346260387813, + "grad_norm": 0.7790842056274414, + "learning_rate": 2.665991691764564e-06, + "loss": 0.614, + "step": 10424 + }, + { + "epoch": 2.8878116343490303, + "grad_norm": 1.0126031637191772, + "learning_rate": 2.66562820900646e-06, + "loss": 0.6181, + "step": 10425 + }, + { + "epoch": 2.88808864265928, + "grad_norm": 0.7423431873321533, + "learning_rate": 2.6652647227316453e-06, + "loss": 0.627, + "step": 10426 + }, + { + "epoch": 2.8883656509695292, + "grad_norm": 0.7445407509803772, + "learning_rate": 2.664901232947839e-06, + "loss": 0.6103, + "step": 10427 + }, + { + "epoch": 2.8886426592797783, + "grad_norm": 0.7246869206428528, + "learning_rate": 2.6645377396627587e-06, + "loss": 0.5792, + "step": 10428 + }, + { + "epoch": 2.8889196675900277, + "grad_norm": 0.7190065383911133, + "learning_rate": 2.664174242884121e-06, + "loss": 0.5731, + "step": 10429 + }, + { + "epoch": 2.889196675900277, + "grad_norm": 0.7758619785308838, + "learning_rate": 2.6638107426196457e-06, + "loss": 0.5841, + "step": 10430 + }, + { + "epoch": 2.889473684210526, + "grad_norm": 0.790145754814148, + "learning_rate": 2.66344723887705e-06, + "loss": 0.6128, + "step": 10431 + }, + { + "epoch": 2.8897506925207757, + "grad_norm": 0.7663121223449707, + "learning_rate": 2.663083731664052e-06, + "loss": 0.5924, + "step": 10432 + }, + { + "epoch": 2.890027700831025, + "grad_norm": 0.7356205582618713, + "learning_rate": 2.6627202209883697e-06, + "loss": 0.6058, + "step": 10433 + }, + { + "epoch": 2.890304709141274, + "grad_norm": 0.7383139133453369, + "learning_rate": 2.662356706857722e-06, + "loss": 0.553, + "step": 10434 + }, + { + "epoch": 2.8905817174515236, + "grad_norm": 0.7479643821716309, + "learning_rate": 2.661993189279826e-06, + "loss": 0.5983, + "step": 10435 + }, + { + "epoch": 2.890858725761773, + "grad_norm": 0.7774149179458618, + "learning_rate": 2.6616296682624016e-06, + "loss": 0.6211, + "step": 10436 + }, + { + "epoch": 2.891135734072022, + "grad_norm": 0.7735824584960938, + "learning_rate": 2.6612661438131664e-06, + "loss": 0.6096, + "step": 10437 + }, + { + "epoch": 2.8914127423822715, + "grad_norm": 0.7517207860946655, + "learning_rate": 2.6609026159398395e-06, + "loss": 0.5858, + "step": 10438 + }, + { + "epoch": 2.891689750692521, + "grad_norm": 0.750087320804596, + "learning_rate": 2.6605390846501377e-06, + "loss": 0.5987, + "step": 10439 + }, + { + "epoch": 2.89196675900277, + "grad_norm": 0.7316486239433289, + "learning_rate": 2.6601755499517826e-06, + "loss": 0.5654, + "step": 10440 + }, + { + "epoch": 2.8922437673130195, + "grad_norm": 0.7494932413101196, + "learning_rate": 2.659812011852491e-06, + "loss": 0.6034, + "step": 10441 + }, + { + "epoch": 2.892520775623269, + "grad_norm": 0.7179362177848816, + "learning_rate": 2.659448470359983e-06, + "loss": 0.5445, + "step": 10442 + }, + { + "epoch": 2.892797783933518, + "grad_norm": 0.7596146464347839, + "learning_rate": 2.6590849254819757e-06, + "loss": 0.6145, + "step": 10443 + }, + { + "epoch": 2.8930747922437674, + "grad_norm": 0.7503767013549805, + "learning_rate": 2.6587213772261896e-06, + "loss": 0.5562, + "step": 10444 + }, + { + "epoch": 2.893351800554017, + "grad_norm": 0.7222331762313843, + "learning_rate": 2.6583578256003435e-06, + "loss": 0.6068, + "step": 10445 + }, + { + "epoch": 2.893628808864266, + "grad_norm": 0.735457181930542, + "learning_rate": 2.6579942706121566e-06, + "loss": 0.6146, + "step": 10446 + }, + { + "epoch": 2.8939058171745153, + "grad_norm": 0.7325124144554138, + "learning_rate": 2.6576307122693474e-06, + "loss": 0.5624, + "step": 10447 + }, + { + "epoch": 2.894182825484765, + "grad_norm": 0.7304285764694214, + "learning_rate": 2.6572671505796357e-06, + "loss": 0.5689, + "step": 10448 + }, + { + "epoch": 2.894459833795014, + "grad_norm": 0.7730352878570557, + "learning_rate": 2.656903585550741e-06, + "loss": 0.5711, + "step": 10449 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.7142786383628845, + "learning_rate": 2.656540017190382e-06, + "loss": 0.5513, + "step": 10450 + }, + { + "epoch": 2.8950138504155127, + "grad_norm": 0.7653011083602905, + "learning_rate": 2.6561764455062793e-06, + "loss": 0.5771, + "step": 10451 + }, + { + "epoch": 2.8952908587257618, + "grad_norm": 0.7704102993011475, + "learning_rate": 2.655812870506151e-06, + "loss": 0.6245, + "step": 10452 + }, + { + "epoch": 2.895567867036011, + "grad_norm": 0.703545093536377, + "learning_rate": 2.6554492921977188e-06, + "loss": 0.5881, + "step": 10453 + }, + { + "epoch": 2.8958448753462602, + "grad_norm": 0.7527859807014465, + "learning_rate": 2.6550857105887e-06, + "loss": 0.6229, + "step": 10454 + }, + { + "epoch": 2.8961218836565097, + "grad_norm": 0.7578487992286682, + "learning_rate": 2.654722125686816e-06, + "loss": 0.6109, + "step": 10455 + }, + { + "epoch": 2.896398891966759, + "grad_norm": 0.7411022782325745, + "learning_rate": 2.654358537499786e-06, + "loss": 0.6115, + "step": 10456 + }, + { + "epoch": 2.896675900277008, + "grad_norm": 0.712331235408783, + "learning_rate": 2.6539949460353297e-06, + "loss": 0.6168, + "step": 10457 + }, + { + "epoch": 2.8969529085872576, + "grad_norm": 0.7301256060600281, + "learning_rate": 2.653631351301168e-06, + "loss": 0.5976, + "step": 10458 + }, + { + "epoch": 2.897229916897507, + "grad_norm": 0.7509595155715942, + "learning_rate": 2.6532677533050207e-06, + "loss": 0.5968, + "step": 10459 + }, + { + "epoch": 2.897506925207756, + "grad_norm": 0.7287399172782898, + "learning_rate": 2.6529041520546072e-06, + "loss": 0.5736, + "step": 10460 + }, + { + "epoch": 2.8977839335180056, + "grad_norm": 0.7702319025993347, + "learning_rate": 2.6525405475576478e-06, + "loss": 0.5706, + "step": 10461 + }, + { + "epoch": 2.8980609418282546, + "grad_norm": 0.749108076095581, + "learning_rate": 2.6521769398218635e-06, + "loss": 0.5925, + "step": 10462 + }, + { + "epoch": 2.898337950138504, + "grad_norm": 0.757893979549408, + "learning_rate": 2.651813328854974e-06, + "loss": 0.5766, + "step": 10463 + }, + { + "epoch": 2.8986149584487535, + "grad_norm": 0.7170742750167847, + "learning_rate": 2.6514497146647e-06, + "loss": 0.5713, + "step": 10464 + }, + { + "epoch": 2.8988919667590025, + "grad_norm": 0.7580187916755676, + "learning_rate": 2.651086097258761e-06, + "loss": 0.5913, + "step": 10465 + }, + { + "epoch": 2.899168975069252, + "grad_norm": 0.7491013407707214, + "learning_rate": 2.6507224766448797e-06, + "loss": 0.5965, + "step": 10466 + }, + { + "epoch": 2.8994459833795014, + "grad_norm": 0.7685227990150452, + "learning_rate": 2.650358852830774e-06, + "loss": 0.5856, + "step": 10467 + }, + { + "epoch": 2.8997229916897505, + "grad_norm": 0.7391490340232849, + "learning_rate": 2.6499952258241664e-06, + "loss": 0.6273, + "step": 10468 + }, + { + "epoch": 2.9, + "grad_norm": 0.7236970067024231, + "learning_rate": 2.649631595632777e-06, + "loss": 0.6031, + "step": 10469 + }, + { + "epoch": 2.9002770083102494, + "grad_norm": 0.7581817507743835, + "learning_rate": 2.649267962264327e-06, + "loss": 0.6149, + "step": 10470 + }, + { + "epoch": 2.9005540166204984, + "grad_norm": 0.7591217756271362, + "learning_rate": 2.6489043257265362e-06, + "loss": 0.6211, + "step": 10471 + }, + { + "epoch": 2.900831024930748, + "grad_norm": 0.7190767526626587, + "learning_rate": 2.648540686027127e-06, + "loss": 0.618, + "step": 10472 + }, + { + "epoch": 2.9011080332409973, + "grad_norm": 0.6588174700737, + "learning_rate": 2.6481770431738202e-06, + "loss": 0.5314, + "step": 10473 + }, + { + "epoch": 2.9013850415512463, + "grad_norm": 0.7454791069030762, + "learning_rate": 2.647813397174335e-06, + "loss": 0.5785, + "step": 10474 + }, + { + "epoch": 2.901662049861496, + "grad_norm": 0.7779990434646606, + "learning_rate": 2.647449748036395e-06, + "loss": 0.5702, + "step": 10475 + }, + { + "epoch": 2.9019390581717452, + "grad_norm": 0.725070595741272, + "learning_rate": 2.6470860957677203e-06, + "loss": 0.5901, + "step": 10476 + }, + { + "epoch": 2.9022160664819943, + "grad_norm": 0.766870379447937, + "learning_rate": 2.6467224403760315e-06, + "loss": 0.5372, + "step": 10477 + }, + { + "epoch": 2.9024930747922437, + "grad_norm": 0.6898723840713501, + "learning_rate": 2.646358781869051e-06, + "loss": 0.5443, + "step": 10478 + }, + { + "epoch": 2.902770083102493, + "grad_norm": 0.7398709654808044, + "learning_rate": 2.6459951202544998e-06, + "loss": 0.5762, + "step": 10479 + }, + { + "epoch": 2.903047091412742, + "grad_norm": 0.7504476308822632, + "learning_rate": 2.645631455540099e-06, + "loss": 0.6037, + "step": 10480 + }, + { + "epoch": 2.9033240997229917, + "grad_norm": 0.7410624623298645, + "learning_rate": 2.645267787733571e-06, + "loss": 0.593, + "step": 10481 + }, + { + "epoch": 2.903601108033241, + "grad_norm": 0.694694459438324, + "learning_rate": 2.6449041168426372e-06, + "loss": 0.5549, + "step": 10482 + }, + { + "epoch": 2.90387811634349, + "grad_norm": 0.8081617951393127, + "learning_rate": 2.6445404428750183e-06, + "loss": 0.6227, + "step": 10483 + }, + { + "epoch": 2.9041551246537396, + "grad_norm": 0.7287876009941101, + "learning_rate": 2.6441767658384363e-06, + "loss": 0.576, + "step": 10484 + }, + { + "epoch": 2.904432132963989, + "grad_norm": 0.7433827519416809, + "learning_rate": 2.643813085740614e-06, + "loss": 0.6118, + "step": 10485 + }, + { + "epoch": 2.904709141274238, + "grad_norm": 0.7551594376564026, + "learning_rate": 2.6434494025892726e-06, + "loss": 0.5844, + "step": 10486 + }, + { + "epoch": 2.9049861495844875, + "grad_norm": 0.7610915303230286, + "learning_rate": 2.6430857163921335e-06, + "loss": 0.5876, + "step": 10487 + }, + { + "epoch": 2.905263157894737, + "grad_norm": 0.6847673654556274, + "learning_rate": 2.6427220271569206e-06, + "loss": 0.6123, + "step": 10488 + }, + { + "epoch": 2.905540166204986, + "grad_norm": 0.7411179542541504, + "learning_rate": 2.6423583348913533e-06, + "loss": 0.5867, + "step": 10489 + }, + { + "epoch": 2.9058171745152355, + "grad_norm": 0.7440828084945679, + "learning_rate": 2.6419946396031552e-06, + "loss": 0.5836, + "step": 10490 + }, + { + "epoch": 2.906094182825485, + "grad_norm": 0.7707085609436035, + "learning_rate": 2.641630941300049e-06, + "loss": 0.5885, + "step": 10491 + }, + { + "epoch": 2.906371191135734, + "grad_norm": 0.75754314661026, + "learning_rate": 2.641267239989756e-06, + "loss": 0.596, + "step": 10492 + }, + { + "epoch": 2.9066481994459834, + "grad_norm": 0.7997472882270813, + "learning_rate": 2.640903535679998e-06, + "loss": 0.6185, + "step": 10493 + }, + { + "epoch": 2.906925207756233, + "grad_norm": 0.754387617111206, + "learning_rate": 2.6405398283784984e-06, + "loss": 0.5758, + "step": 10494 + }, + { + "epoch": 2.907202216066482, + "grad_norm": 0.7376065254211426, + "learning_rate": 2.6401761180929798e-06, + "loss": 0.5902, + "step": 10495 + }, + { + "epoch": 2.9074792243767313, + "grad_norm": 0.7251534461975098, + "learning_rate": 2.639812404831164e-06, + "loss": 0.6071, + "step": 10496 + }, + { + "epoch": 2.907756232686981, + "grad_norm": 0.8182569742202759, + "learning_rate": 2.6394486886007743e-06, + "loss": 0.5888, + "step": 10497 + }, + { + "epoch": 2.90803324099723, + "grad_norm": 0.7495296001434326, + "learning_rate": 2.6390849694095322e-06, + "loss": 0.6519, + "step": 10498 + }, + { + "epoch": 2.9083102493074793, + "grad_norm": 0.7202491164207458, + "learning_rate": 2.6387212472651606e-06, + "loss": 0.607, + "step": 10499 + }, + { + "epoch": 2.9085872576177287, + "grad_norm": 0.7387695908546448, + "learning_rate": 2.638357522175383e-06, + "loss": 0.6281, + "step": 10500 + }, + { + "epoch": 2.9088642659279778, + "grad_norm": 0.7193158268928528, + "learning_rate": 2.6379937941479226e-06, + "loss": 0.6095, + "step": 10501 + }, + { + "epoch": 2.909141274238227, + "grad_norm": 0.7286669015884399, + "learning_rate": 2.6376300631905005e-06, + "loss": 0.5486, + "step": 10502 + }, + { + "epoch": 2.9094182825484767, + "grad_norm": 0.7364821434020996, + "learning_rate": 2.6372663293108415e-06, + "loss": 0.586, + "step": 10503 + }, + { + "epoch": 2.9096952908587257, + "grad_norm": 0.7616895437240601, + "learning_rate": 2.6369025925166674e-06, + "loss": 0.5801, + "step": 10504 + }, + { + "epoch": 2.909972299168975, + "grad_norm": 0.7166840434074402, + "learning_rate": 2.636538852815702e-06, + "loss": 0.6128, + "step": 10505 + }, + { + "epoch": 2.9102493074792246, + "grad_norm": 0.7373641729354858, + "learning_rate": 2.6361751102156673e-06, + "loss": 0.5976, + "step": 10506 + }, + { + "epoch": 2.9105263157894736, + "grad_norm": 0.7657269239425659, + "learning_rate": 2.6358113647242885e-06, + "loss": 0.5798, + "step": 10507 + }, + { + "epoch": 2.910803324099723, + "grad_norm": 0.775659441947937, + "learning_rate": 2.6354476163492865e-06, + "loss": 0.5781, + "step": 10508 + }, + { + "epoch": 2.9110803324099725, + "grad_norm": 0.7320591807365417, + "learning_rate": 2.635083865098386e-06, + "loss": 0.5592, + "step": 10509 + }, + { + "epoch": 2.9113573407202216, + "grad_norm": 0.7523388266563416, + "learning_rate": 2.63472011097931e-06, + "loss": 0.6334, + "step": 10510 + }, + { + "epoch": 2.911634349030471, + "grad_norm": 0.7414600253105164, + "learning_rate": 2.634356353999783e-06, + "loss": 0.5845, + "step": 10511 + }, + { + "epoch": 2.9119113573407205, + "grad_norm": 0.7154373526573181, + "learning_rate": 2.633992594167526e-06, + "loss": 0.5643, + "step": 10512 + }, + { + "epoch": 2.9121883656509695, + "grad_norm": 0.6937295198440552, + "learning_rate": 2.6336288314902645e-06, + "loss": 0.581, + "step": 10513 + }, + { + "epoch": 2.912465373961219, + "grad_norm": 0.7316800951957703, + "learning_rate": 2.6332650659757223e-06, + "loss": 0.5846, + "step": 10514 + }, + { + "epoch": 2.912742382271468, + "grad_norm": 0.7038574814796448, + "learning_rate": 2.6329012976316216e-06, + "loss": 0.5028, + "step": 10515 + }, + { + "epoch": 2.9130193905817174, + "grad_norm": 0.7551959156990051, + "learning_rate": 2.632537526465687e-06, + "loss": 0.5651, + "step": 10516 + }, + { + "epoch": 2.913296398891967, + "grad_norm": 0.762638509273529, + "learning_rate": 2.6321737524856434e-06, + "loss": 0.6465, + "step": 10517 + }, + { + "epoch": 2.913573407202216, + "grad_norm": 0.7476597428321838, + "learning_rate": 2.631809975699213e-06, + "loss": 0.5952, + "step": 10518 + }, + { + "epoch": 2.9138504155124654, + "grad_norm": 0.7609908580780029, + "learning_rate": 2.6314461961141193e-06, + "loss": 0.5852, + "step": 10519 + }, + { + "epoch": 2.914127423822715, + "grad_norm": 0.7875975966453552, + "learning_rate": 2.631082413738088e-06, + "loss": 0.5712, + "step": 10520 + }, + { + "epoch": 2.914404432132964, + "grad_norm": 0.7203830480575562, + "learning_rate": 2.6307186285788423e-06, + "loss": 0.5757, + "step": 10521 + }, + { + "epoch": 2.9146814404432133, + "grad_norm": 0.7585301399230957, + "learning_rate": 2.630354840644106e-06, + "loss": 0.5814, + "step": 10522 + }, + { + "epoch": 2.9149584487534623, + "grad_norm": 0.74942946434021, + "learning_rate": 2.6299910499416036e-06, + "loss": 0.5819, + "step": 10523 + }, + { + "epoch": 2.915235457063712, + "grad_norm": 0.7211716771125793, + "learning_rate": 2.6296272564790594e-06, + "loss": 0.5855, + "step": 10524 + }, + { + "epoch": 2.9155124653739612, + "grad_norm": 0.7344101071357727, + "learning_rate": 2.629263460264197e-06, + "loss": 0.6082, + "step": 10525 + }, + { + "epoch": 2.9157894736842103, + "grad_norm": 0.7525090575218201, + "learning_rate": 2.6288996613047424e-06, + "loss": 0.5673, + "step": 10526 + }, + { + "epoch": 2.9160664819944597, + "grad_norm": 0.7705866694450378, + "learning_rate": 2.6285358596084175e-06, + "loss": 0.5953, + "step": 10527 + }, + { + "epoch": 2.916343490304709, + "grad_norm": 0.7172234654426575, + "learning_rate": 2.628172055182948e-06, + "loss": 0.6043, + "step": 10528 + }, + { + "epoch": 2.916620498614958, + "grad_norm": 0.7324336171150208, + "learning_rate": 2.6278082480360602e-06, + "loss": 0.5722, + "step": 10529 + }, + { + "epoch": 2.9168975069252077, + "grad_norm": 0.7875596880912781, + "learning_rate": 2.627444438175475e-06, + "loss": 0.6104, + "step": 10530 + }, + { + "epoch": 2.917174515235457, + "grad_norm": 0.7457826733589172, + "learning_rate": 2.62708062560892e-06, + "loss": 0.6176, + "step": 10531 + }, + { + "epoch": 2.917451523545706, + "grad_norm": 0.7246255874633789, + "learning_rate": 2.6267168103441185e-06, + "loss": 0.5951, + "step": 10532 + }, + { + "epoch": 2.9177285318559556, + "grad_norm": 0.7618518471717834, + "learning_rate": 2.6263529923887954e-06, + "loss": 0.6172, + "step": 10533 + }, + { + "epoch": 2.918005540166205, + "grad_norm": 0.715311586856842, + "learning_rate": 2.625989171750675e-06, + "loss": 0.5528, + "step": 10534 + }, + { + "epoch": 2.918282548476454, + "grad_norm": 0.7357887029647827, + "learning_rate": 2.625625348437484e-06, + "loss": 0.6057, + "step": 10535 + }, + { + "epoch": 2.9185595567867035, + "grad_norm": 0.7042737007141113, + "learning_rate": 2.6252615224569455e-06, + "loss": 0.5605, + "step": 10536 + }, + { + "epoch": 2.918836565096953, + "grad_norm": 0.7708760499954224, + "learning_rate": 2.6248976938167855e-06, + "loss": 0.5948, + "step": 10537 + }, + { + "epoch": 2.919113573407202, + "grad_norm": 0.7178238034248352, + "learning_rate": 2.6245338625247274e-06, + "loss": 0.569, + "step": 10538 + }, + { + "epoch": 2.9193905817174515, + "grad_norm": 0.7370444536209106, + "learning_rate": 2.6241700285884985e-06, + "loss": 0.5457, + "step": 10539 + }, + { + "epoch": 2.919667590027701, + "grad_norm": 0.7676613926887512, + "learning_rate": 2.623806192015822e-06, + "loss": 0.5601, + "step": 10540 + }, + { + "epoch": 2.91994459833795, + "grad_norm": 0.7429258823394775, + "learning_rate": 2.6234423528144243e-06, + "loss": 0.6178, + "step": 10541 + }, + { + "epoch": 2.9202216066481994, + "grad_norm": 0.71916264295578, + "learning_rate": 2.62307851099203e-06, + "loss": 0.5349, + "step": 10542 + }, + { + "epoch": 2.920498614958449, + "grad_norm": 0.7575047612190247, + "learning_rate": 2.6227146665563642e-06, + "loss": 0.6412, + "step": 10543 + }, + { + "epoch": 2.920775623268698, + "grad_norm": 0.7574516534805298, + "learning_rate": 2.6223508195151532e-06, + "loss": 0.5718, + "step": 10544 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.7422577142715454, + "learning_rate": 2.621986969876122e-06, + "loss": 0.5846, + "step": 10545 + }, + { + "epoch": 2.921329639889197, + "grad_norm": 0.7510814666748047, + "learning_rate": 2.621623117646996e-06, + "loss": 0.5693, + "step": 10546 + }, + { + "epoch": 2.921606648199446, + "grad_norm": 0.7736993432044983, + "learning_rate": 2.6212592628354994e-06, + "loss": 0.6027, + "step": 10547 + }, + { + "epoch": 2.9218836565096953, + "grad_norm": 0.7615543007850647, + "learning_rate": 2.6208954054493602e-06, + "loss": 0.6346, + "step": 10548 + }, + { + "epoch": 2.9221606648199447, + "grad_norm": 0.7570958137512207, + "learning_rate": 2.6205315454963026e-06, + "loss": 0.6041, + "step": 10549 + }, + { + "epoch": 2.9224376731301938, + "grad_norm": 0.7129548192024231, + "learning_rate": 2.620167682984052e-06, + "loss": 0.5973, + "step": 10550 + }, + { + "epoch": 2.922714681440443, + "grad_norm": 0.7714894413948059, + "learning_rate": 2.6198038179203357e-06, + "loss": 0.6143, + "step": 10551 + }, + { + "epoch": 2.9229916897506927, + "grad_norm": 0.7106122374534607, + "learning_rate": 2.6194399503128776e-06, + "loss": 0.5939, + "step": 10552 + }, + { + "epoch": 2.9232686980609417, + "grad_norm": 0.7235246300697327, + "learning_rate": 2.6190760801694037e-06, + "loss": 0.61, + "step": 10553 + }, + { + "epoch": 2.923545706371191, + "grad_norm": 0.7699087262153625, + "learning_rate": 2.618712207497642e-06, + "loss": 0.608, + "step": 10554 + }, + { + "epoch": 2.9238227146814406, + "grad_norm": 0.7375195622444153, + "learning_rate": 2.618348332305316e-06, + "loss": 0.5718, + "step": 10555 + }, + { + "epoch": 2.9240997229916896, + "grad_norm": 0.7720207571983337, + "learning_rate": 2.617984454600153e-06, + "loss": 0.6399, + "step": 10556 + }, + { + "epoch": 2.924376731301939, + "grad_norm": 0.806556224822998, + "learning_rate": 2.617620574389878e-06, + "loss": 0.6269, + "step": 10557 + }, + { + "epoch": 2.9246537396121886, + "grad_norm": 0.811886727809906, + "learning_rate": 2.617256691682219e-06, + "loss": 0.6067, + "step": 10558 + }, + { + "epoch": 2.9249307479224376, + "grad_norm": 0.7293379306793213, + "learning_rate": 2.6168928064849004e-06, + "loss": 0.5955, + "step": 10559 + }, + { + "epoch": 2.925207756232687, + "grad_norm": 0.7429826259613037, + "learning_rate": 2.6165289188056485e-06, + "loss": 0.539, + "step": 10560 + }, + { + "epoch": 2.9254847645429365, + "grad_norm": 0.763191819190979, + "learning_rate": 2.616165028652192e-06, + "loss": 0.5943, + "step": 10561 + }, + { + "epoch": 2.9257617728531855, + "grad_norm": 0.7543225288391113, + "learning_rate": 2.615801136032253e-06, + "loss": 0.5751, + "step": 10562 + }, + { + "epoch": 2.926038781163435, + "grad_norm": 0.7704012989997864, + "learning_rate": 2.6154372409535613e-06, + "loss": 0.5902, + "step": 10563 + }, + { + "epoch": 2.9263157894736844, + "grad_norm": 0.8033764958381653, + "learning_rate": 2.615073343423842e-06, + "loss": 0.5975, + "step": 10564 + }, + { + "epoch": 2.9265927977839334, + "grad_norm": 0.7130028009414673, + "learning_rate": 2.614709443450822e-06, + "loss": 0.5654, + "step": 10565 + }, + { + "epoch": 2.926869806094183, + "grad_norm": 0.778659462928772, + "learning_rate": 2.6143455410422263e-06, + "loss": 0.6033, + "step": 10566 + }, + { + "epoch": 2.9271468144044324, + "grad_norm": 0.7242159247398376, + "learning_rate": 2.613981636205784e-06, + "loss": 0.5871, + "step": 10567 + }, + { + "epoch": 2.9274238227146814, + "grad_norm": 0.7225329279899597, + "learning_rate": 2.6136177289492205e-06, + "loss": 0.6044, + "step": 10568 + }, + { + "epoch": 2.927700831024931, + "grad_norm": 0.7506078481674194, + "learning_rate": 2.6132538192802624e-06, + "loss": 0.5996, + "step": 10569 + }, + { + "epoch": 2.9279778393351803, + "grad_norm": 0.7476010322570801, + "learning_rate": 2.612889907206636e-06, + "loss": 0.6208, + "step": 10570 + }, + { + "epoch": 2.9282548476454293, + "grad_norm": 0.7440158128738403, + "learning_rate": 2.612525992736069e-06, + "loss": 0.5913, + "step": 10571 + }, + { + "epoch": 2.9285318559556788, + "grad_norm": 0.7561903595924377, + "learning_rate": 2.6121620758762877e-06, + "loss": 0.5772, + "step": 10572 + }, + { + "epoch": 2.9288088642659282, + "grad_norm": 0.8481956124305725, + "learning_rate": 2.6117981566350187e-06, + "loss": 0.6188, + "step": 10573 + }, + { + "epoch": 2.9290858725761773, + "grad_norm": 0.7656933069229126, + "learning_rate": 2.6114342350199907e-06, + "loss": 0.6179, + "step": 10574 + }, + { + "epoch": 2.9293628808864267, + "grad_norm": 0.7416088581085205, + "learning_rate": 2.6110703110389274e-06, + "loss": 0.5994, + "step": 10575 + }, + { + "epoch": 2.929639889196676, + "grad_norm": 0.7594820857048035, + "learning_rate": 2.610706384699559e-06, + "loss": 0.6221, + "step": 10576 + }, + { + "epoch": 2.929916897506925, + "grad_norm": 0.7453796863555908, + "learning_rate": 2.6103424560096114e-06, + "loss": 0.5513, + "step": 10577 + }, + { + "epoch": 2.9301939058171746, + "grad_norm": 0.7599956393241882, + "learning_rate": 2.609978524976811e-06, + "loss": 0.61, + "step": 10578 + }, + { + "epoch": 2.9304709141274237, + "grad_norm": 0.7576059103012085, + "learning_rate": 2.6096145916088855e-06, + "loss": 0.5663, + "step": 10579 + }, + { + "epoch": 2.930747922437673, + "grad_norm": 0.711409330368042, + "learning_rate": 2.609250655913564e-06, + "loss": 0.5377, + "step": 10580 + }, + { + "epoch": 2.9310249307479226, + "grad_norm": 0.7231387495994568, + "learning_rate": 2.60888671789857e-06, + "loss": 0.6061, + "step": 10581 + }, + { + "epoch": 2.9313019390581716, + "grad_norm": 0.7678942084312439, + "learning_rate": 2.6085227775716338e-06, + "loss": 0.6152, + "step": 10582 + }, + { + "epoch": 2.931578947368421, + "grad_norm": 0.7268967032432556, + "learning_rate": 2.6081588349404825e-06, + "loss": 0.5442, + "step": 10583 + }, + { + "epoch": 2.9318559556786705, + "grad_norm": 0.7112714648246765, + "learning_rate": 2.607794890012841e-06, + "loss": 0.6094, + "step": 10584 + }, + { + "epoch": 2.9321329639889195, + "grad_norm": 0.7436681985855103, + "learning_rate": 2.6074309427964405e-06, + "loss": 0.5888, + "step": 10585 + }, + { + "epoch": 2.932409972299169, + "grad_norm": 0.7648245692253113, + "learning_rate": 2.607066993299007e-06, + "loss": 0.5992, + "step": 10586 + }, + { + "epoch": 2.932686980609418, + "grad_norm": 0.7910863161087036, + "learning_rate": 2.606703041528267e-06, + "loss": 0.6206, + "step": 10587 + }, + { + "epoch": 2.9329639889196675, + "grad_norm": 0.7461734414100647, + "learning_rate": 2.6063390874919482e-06, + "loss": 0.6609, + "step": 10588 + }, + { + "epoch": 2.933240997229917, + "grad_norm": 0.7565977573394775, + "learning_rate": 2.60597513119778e-06, + "loss": 0.5607, + "step": 10589 + }, + { + "epoch": 2.933518005540166, + "grad_norm": 0.7210617065429688, + "learning_rate": 2.605611172653489e-06, + "loss": 0.6282, + "step": 10590 + }, + { + "epoch": 2.9337950138504154, + "grad_norm": 0.765498697757721, + "learning_rate": 2.605247211866803e-06, + "loss": 0.5997, + "step": 10591 + }, + { + "epoch": 2.934072022160665, + "grad_norm": 0.7618787288665771, + "learning_rate": 2.6048832488454495e-06, + "loss": 0.5885, + "step": 10592 + }, + { + "epoch": 2.934349030470914, + "grad_norm": 0.7326313853263855, + "learning_rate": 2.6045192835971578e-06, + "loss": 0.5532, + "step": 10593 + }, + { + "epoch": 2.9346260387811633, + "grad_norm": 0.7318726181983948, + "learning_rate": 2.604155316129654e-06, + "loss": 0.6062, + "step": 10594 + }, + { + "epoch": 2.934903047091413, + "grad_norm": 0.729059100151062, + "learning_rate": 2.603791346450667e-06, + "loss": 0.5883, + "step": 10595 + }, + { + "epoch": 2.935180055401662, + "grad_norm": 0.7532776594161987, + "learning_rate": 2.603427374567925e-06, + "loss": 0.582, + "step": 10596 + }, + { + "epoch": 2.9354570637119113, + "grad_norm": 0.7572320103645325, + "learning_rate": 2.603063400489156e-06, + "loss": 0.5723, + "step": 10597 + }, + { + "epoch": 2.9357340720221607, + "grad_norm": 0.7214776873588562, + "learning_rate": 2.6026994242220866e-06, + "loss": 0.6043, + "step": 10598 + }, + { + "epoch": 2.9360110803324098, + "grad_norm": 0.7422539591789246, + "learning_rate": 2.602335445774447e-06, + "loss": 0.6308, + "step": 10599 + }, + { + "epoch": 2.936288088642659, + "grad_norm": 0.7235873937606812, + "learning_rate": 2.6019714651539647e-06, + "loss": 0.6344, + "step": 10600 + }, + { + "epoch": 2.9365650969529087, + "grad_norm": 0.7317372560501099, + "learning_rate": 2.6016074823683666e-06, + "loss": 0.5976, + "step": 10601 + }, + { + "epoch": 2.9368421052631577, + "grad_norm": 0.7706679105758667, + "learning_rate": 2.6012434974253832e-06, + "loss": 0.6308, + "step": 10602 + }, + { + "epoch": 2.937119113573407, + "grad_norm": 0.7357689738273621, + "learning_rate": 2.6008795103327416e-06, + "loss": 0.5425, + "step": 10603 + }, + { + "epoch": 2.9373961218836566, + "grad_norm": 0.770684540271759, + "learning_rate": 2.600515521098171e-06, + "loss": 0.6076, + "step": 10604 + }, + { + "epoch": 2.9376731301939056, + "grad_norm": 0.7773721814155579, + "learning_rate": 2.6001515297293982e-06, + "loss": 0.5507, + "step": 10605 + }, + { + "epoch": 2.937950138504155, + "grad_norm": 0.7833162546157837, + "learning_rate": 2.5997875362341535e-06, + "loss": 0.5845, + "step": 10606 + }, + { + "epoch": 2.9382271468144046, + "grad_norm": 0.7654193639755249, + "learning_rate": 2.599423540620164e-06, + "loss": 0.5849, + "step": 10607 + }, + { + "epoch": 2.9385041551246536, + "grad_norm": 0.771862268447876, + "learning_rate": 2.5990595428951586e-06, + "loss": 0.6145, + "step": 10608 + }, + { + "epoch": 2.938781163434903, + "grad_norm": 0.7339853644371033, + "learning_rate": 2.598695543066867e-06, + "loss": 0.5774, + "step": 10609 + }, + { + "epoch": 2.9390581717451525, + "grad_norm": 0.7562407851219177, + "learning_rate": 2.5983315411430166e-06, + "loss": 0.5958, + "step": 10610 + }, + { + "epoch": 2.9393351800554015, + "grad_norm": 0.7739925384521484, + "learning_rate": 2.5979675371313357e-06, + "loss": 0.5826, + "step": 10611 + }, + { + "epoch": 2.939612188365651, + "grad_norm": 0.7429248690605164, + "learning_rate": 2.597603531039555e-06, + "loss": 0.5429, + "step": 10612 + }, + { + "epoch": 2.9398891966759004, + "grad_norm": 0.7736132144927979, + "learning_rate": 2.5972395228754017e-06, + "loss": 0.5891, + "step": 10613 + }, + { + "epoch": 2.9401662049861494, + "grad_norm": 0.7905640602111816, + "learning_rate": 2.5968755126466044e-06, + "loss": 0.5947, + "step": 10614 + }, + { + "epoch": 2.940443213296399, + "grad_norm": 0.8147552609443665, + "learning_rate": 2.596511500360894e-06, + "loss": 0.6118, + "step": 10615 + }, + { + "epoch": 2.9407202216066484, + "grad_norm": 0.7183418869972229, + "learning_rate": 2.596147486025996e-06, + "loss": 0.5513, + "step": 10616 + }, + { + "epoch": 2.9409972299168974, + "grad_norm": 0.7094548344612122, + "learning_rate": 2.5957834696496424e-06, + "loss": 0.5686, + "step": 10617 + }, + { + "epoch": 2.941274238227147, + "grad_norm": 0.7191303968429565, + "learning_rate": 2.5954194512395614e-06, + "loss": 0.5691, + "step": 10618 + }, + { + "epoch": 2.9415512465373963, + "grad_norm": 0.680117666721344, + "learning_rate": 2.5950554308034813e-06, + "loss": 0.5616, + "step": 10619 + }, + { + "epoch": 2.9418282548476453, + "grad_norm": 0.779163122177124, + "learning_rate": 2.5946914083491314e-06, + "loss": 0.6328, + "step": 10620 + }, + { + "epoch": 2.942105263157895, + "grad_norm": 0.726834237575531, + "learning_rate": 2.594327383884242e-06, + "loss": 0.5863, + "step": 10621 + }, + { + "epoch": 2.9423822714681442, + "grad_norm": 0.7219435572624207, + "learning_rate": 2.5939633574165408e-06, + "loss": 0.6116, + "step": 10622 + }, + { + "epoch": 2.9426592797783933, + "grad_norm": 0.7471601366996765, + "learning_rate": 2.593599328953758e-06, + "loss": 0.597, + "step": 10623 + }, + { + "epoch": 2.9429362880886427, + "grad_norm": 0.7578615546226501, + "learning_rate": 2.593235298503621e-06, + "loss": 0.6315, + "step": 10624 + }, + { + "epoch": 2.943213296398892, + "grad_norm": 0.716760516166687, + "learning_rate": 2.592871266073863e-06, + "loss": 0.5851, + "step": 10625 + }, + { + "epoch": 2.943490304709141, + "grad_norm": 0.7250704765319824, + "learning_rate": 2.592507231672209e-06, + "loss": 0.5971, + "step": 10626 + }, + { + "epoch": 2.9437673130193907, + "grad_norm": 0.7800359725952148, + "learning_rate": 2.5921431953063903e-06, + "loss": 0.5734, + "step": 10627 + }, + { + "epoch": 2.94404432132964, + "grad_norm": 0.7490966320037842, + "learning_rate": 2.591779156984137e-06, + "loss": 0.5976, + "step": 10628 + }, + { + "epoch": 2.944321329639889, + "grad_norm": 0.7280365824699402, + "learning_rate": 2.591415116713177e-06, + "loss": 0.6236, + "step": 10629 + }, + { + "epoch": 2.9445983379501386, + "grad_norm": 0.7410351037979126, + "learning_rate": 2.5910510745012413e-06, + "loss": 0.5541, + "step": 10630 + }, + { + "epoch": 2.944875346260388, + "grad_norm": 0.7178124785423279, + "learning_rate": 2.590687030356058e-06, + "loss": 0.6049, + "step": 10631 + }, + { + "epoch": 2.945152354570637, + "grad_norm": 0.7093154191970825, + "learning_rate": 2.590322984285359e-06, + "loss": 0.5973, + "step": 10632 + }, + { + "epoch": 2.9454293628808865, + "grad_norm": 0.7441872358322144, + "learning_rate": 2.589958936296871e-06, + "loss": 0.5981, + "step": 10633 + }, + { + "epoch": 2.945706371191136, + "grad_norm": 0.7678994536399841, + "learning_rate": 2.5895948863983257e-06, + "loss": 0.6114, + "step": 10634 + }, + { + "epoch": 2.945983379501385, + "grad_norm": 0.7277671694755554, + "learning_rate": 2.5892308345974517e-06, + "loss": 0.649, + "step": 10635 + }, + { + "epoch": 2.9462603878116345, + "grad_norm": 0.7727528810501099, + "learning_rate": 2.5888667809019792e-06, + "loss": 0.5758, + "step": 10636 + }, + { + "epoch": 2.946537396121884, + "grad_norm": 0.7813266515731812, + "learning_rate": 2.5885027253196384e-06, + "loss": 0.6389, + "step": 10637 + }, + { + "epoch": 2.946814404432133, + "grad_norm": 0.7353212237358093, + "learning_rate": 2.5881386678581587e-06, + "loss": 0.6088, + "step": 10638 + }, + { + "epoch": 2.9470914127423824, + "grad_norm": 0.7160367369651794, + "learning_rate": 2.587774608525269e-06, + "loss": 0.5732, + "step": 10639 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.7289338111877441, + "learning_rate": 2.5874105473287015e-06, + "loss": 0.6094, + "step": 10640 + }, + { + "epoch": 2.947645429362881, + "grad_norm": 0.7848336696624756, + "learning_rate": 2.5870464842761844e-06, + "loss": 0.5968, + "step": 10641 + }, + { + "epoch": 2.9479224376731303, + "grad_norm": 0.7372105717658997, + "learning_rate": 2.5866824193754482e-06, + "loss": 0.5872, + "step": 10642 + }, + { + "epoch": 2.9481994459833794, + "grad_norm": 0.7537047863006592, + "learning_rate": 2.586318352634222e-06, + "loss": 0.592, + "step": 10643 + }, + { + "epoch": 2.948476454293629, + "grad_norm": 0.7874873280525208, + "learning_rate": 2.585954284060238e-06, + "loss": 0.5799, + "step": 10644 + }, + { + "epoch": 2.9487534626038783, + "grad_norm": 0.6731414198875427, + "learning_rate": 2.5855902136612247e-06, + "loss": 0.5007, + "step": 10645 + }, + { + "epoch": 2.9490304709141273, + "grad_norm": 0.747714638710022, + "learning_rate": 2.585226141444912e-06, + "loss": 0.5941, + "step": 10646 + }, + { + "epoch": 2.9493074792243767, + "grad_norm": 0.7554249167442322, + "learning_rate": 2.584862067419032e-06, + "loss": 0.5363, + "step": 10647 + }, + { + "epoch": 2.9495844875346258, + "grad_norm": 0.7585732340812683, + "learning_rate": 2.584497991591313e-06, + "loss": 0.6131, + "step": 10648 + }, + { + "epoch": 2.9498614958448752, + "grad_norm": 0.7329699397087097, + "learning_rate": 2.5841339139694856e-06, + "loss": 0.5882, + "step": 10649 + }, + { + "epoch": 2.9501385041551247, + "grad_norm": 0.7495070695877075, + "learning_rate": 2.58376983456128e-06, + "loss": 0.582, + "step": 10650 + }, + { + "epoch": 2.9504155124653737, + "grad_norm": 0.7455748319625854, + "learning_rate": 2.5834057533744272e-06, + "loss": 0.578, + "step": 10651 + }, + { + "epoch": 2.950692520775623, + "grad_norm": 0.7285454869270325, + "learning_rate": 2.583041670416657e-06, + "loss": 0.625, + "step": 10652 + }, + { + "epoch": 2.9509695290858726, + "grad_norm": 0.7877806425094604, + "learning_rate": 2.5826775856957005e-06, + "loss": 0.6188, + "step": 10653 + }, + { + "epoch": 2.9512465373961216, + "grad_norm": 0.7521032094955444, + "learning_rate": 2.5823134992192877e-06, + "loss": 0.619, + "step": 10654 + }, + { + "epoch": 2.951523545706371, + "grad_norm": 0.7143591046333313, + "learning_rate": 2.5819494109951486e-06, + "loss": 0.5876, + "step": 10655 + }, + { + "epoch": 2.9518005540166206, + "grad_norm": 0.7743513584136963, + "learning_rate": 2.5815853210310154e-06, + "loss": 0.6256, + "step": 10656 + }, + { + "epoch": 2.9520775623268696, + "grad_norm": 0.7508295178413391, + "learning_rate": 2.5812212293346163e-06, + "loss": 0.5781, + "step": 10657 + }, + { + "epoch": 2.952354570637119, + "grad_norm": 0.7436931133270264, + "learning_rate": 2.5808571359136836e-06, + "loss": 0.6053, + "step": 10658 + }, + { + "epoch": 2.9526315789473685, + "grad_norm": 0.7597281336784363, + "learning_rate": 2.5804930407759477e-06, + "loss": 0.586, + "step": 10659 + }, + { + "epoch": 2.9529085872576175, + "grad_norm": 0.7129910588264465, + "learning_rate": 2.580128943929139e-06, + "loss": 0.5702, + "step": 10660 + }, + { + "epoch": 2.953185595567867, + "grad_norm": 0.7608397006988525, + "learning_rate": 2.579764845380987e-06, + "loss": 0.6034, + "step": 10661 + }, + { + "epoch": 2.9534626038781164, + "grad_norm": 0.7523941993713379, + "learning_rate": 2.5794007451392246e-06, + "loss": 0.5824, + "step": 10662 + }, + { + "epoch": 2.9537396121883654, + "grad_norm": 0.7492373585700989, + "learning_rate": 2.5790366432115815e-06, + "loss": 0.6168, + "step": 10663 + }, + { + "epoch": 2.954016620498615, + "grad_norm": 0.7219147682189941, + "learning_rate": 2.5786725396057886e-06, + "loss": 0.571, + "step": 10664 + }, + { + "epoch": 2.9542936288088644, + "grad_norm": 0.7466295957565308, + "learning_rate": 2.578308434329576e-06, + "loss": 0.5993, + "step": 10665 + }, + { + "epoch": 2.9545706371191134, + "grad_norm": 0.7330300807952881, + "learning_rate": 2.5779443273906773e-06, + "loss": 0.5982, + "step": 10666 + }, + { + "epoch": 2.954847645429363, + "grad_norm": 0.7423272132873535, + "learning_rate": 2.5775802187968197e-06, + "loss": 0.6273, + "step": 10667 + }, + { + "epoch": 2.9551246537396123, + "grad_norm": 0.7298309206962585, + "learning_rate": 2.5772161085557367e-06, + "loss": 0.5768, + "step": 10668 + }, + { + "epoch": 2.9554016620498613, + "grad_norm": 0.7723406553268433, + "learning_rate": 2.5768519966751587e-06, + "loss": 0.605, + "step": 10669 + }, + { + "epoch": 2.955678670360111, + "grad_norm": 0.7510731220245361, + "learning_rate": 2.5764878831628154e-06, + "loss": 0.596, + "step": 10670 + }, + { + "epoch": 2.9559556786703602, + "grad_norm": 0.7381873726844788, + "learning_rate": 2.57612376802644e-06, + "loss": 0.6044, + "step": 10671 + }, + { + "epoch": 2.9562326869806093, + "grad_norm": 0.7327526807785034, + "learning_rate": 2.5757596512737626e-06, + "loss": 0.6019, + "step": 10672 + }, + { + "epoch": 2.9565096952908587, + "grad_norm": 0.7478799223899841, + "learning_rate": 2.5753955329125148e-06, + "loss": 0.6089, + "step": 10673 + }, + { + "epoch": 2.956786703601108, + "grad_norm": 0.7106507420539856, + "learning_rate": 2.5750314129504255e-06, + "loss": 0.5661, + "step": 10674 + }, + { + "epoch": 2.957063711911357, + "grad_norm": 0.8187487721443176, + "learning_rate": 2.574667291395229e-06, + "loss": 0.6081, + "step": 10675 + }, + { + "epoch": 2.9573407202216067, + "grad_norm": 0.7382457852363586, + "learning_rate": 2.5743031682546555e-06, + "loss": 0.5384, + "step": 10676 + }, + { + "epoch": 2.957617728531856, + "grad_norm": 0.7540406584739685, + "learning_rate": 2.573939043536436e-06, + "loss": 0.6568, + "step": 10677 + }, + { + "epoch": 2.957894736842105, + "grad_norm": 0.7558385729789734, + "learning_rate": 2.5735749172483003e-06, + "loss": 0.5759, + "step": 10678 + }, + { + "epoch": 2.9581717451523546, + "grad_norm": 0.7608779072761536, + "learning_rate": 2.5732107893979834e-06, + "loss": 0.5739, + "step": 10679 + }, + { + "epoch": 2.958448753462604, + "grad_norm": 0.7513423562049866, + "learning_rate": 2.5728466599932127e-06, + "loss": 0.5456, + "step": 10680 + }, + { + "epoch": 2.958725761772853, + "grad_norm": 0.7584231495857239, + "learning_rate": 2.5724825290417223e-06, + "loss": 0.586, + "step": 10681 + }, + { + "epoch": 2.9590027700831025, + "grad_norm": 0.7205784916877747, + "learning_rate": 2.5721183965512424e-06, + "loss": 0.5438, + "step": 10682 + }, + { + "epoch": 2.959279778393352, + "grad_norm": 0.7798346281051636, + "learning_rate": 2.5717542625295044e-06, + "loss": 0.5974, + "step": 10683 + }, + { + "epoch": 2.959556786703601, + "grad_norm": 0.7180570960044861, + "learning_rate": 2.5713901269842405e-06, + "loss": 0.5726, + "step": 10684 + }, + { + "epoch": 2.9598337950138505, + "grad_norm": 0.7144060134887695, + "learning_rate": 2.5710259899231827e-06, + "loss": 0.5293, + "step": 10685 + }, + { + "epoch": 2.9601108033241, + "grad_norm": 0.7333686947822571, + "learning_rate": 2.570661851354061e-06, + "loss": 0.5776, + "step": 10686 + }, + { + "epoch": 2.960387811634349, + "grad_norm": 0.7347108721733093, + "learning_rate": 2.5702977112846073e-06, + "loss": 0.5931, + "step": 10687 + }, + { + "epoch": 2.9606648199445984, + "grad_norm": 0.7476058602333069, + "learning_rate": 2.5699335697225545e-06, + "loss": 0.6298, + "step": 10688 + }, + { + "epoch": 2.960941828254848, + "grad_norm": 0.760506808757782, + "learning_rate": 2.5695694266756326e-06, + "loss": 0.6123, + "step": 10689 + }, + { + "epoch": 2.961218836565097, + "grad_norm": 0.7429872155189514, + "learning_rate": 2.569205282151575e-06, + "loss": 0.5671, + "step": 10690 + }, + { + "epoch": 2.9614958448753463, + "grad_norm": 0.7676592469215393, + "learning_rate": 2.5688411361581123e-06, + "loss": 0.5654, + "step": 10691 + }, + { + "epoch": 2.961772853185596, + "grad_norm": 0.7633059024810791, + "learning_rate": 2.5684769887029764e-06, + "loss": 0.5604, + "step": 10692 + }, + { + "epoch": 2.962049861495845, + "grad_norm": 0.719962477684021, + "learning_rate": 2.5681128397938987e-06, + "loss": 0.6238, + "step": 10693 + }, + { + "epoch": 2.9623268698060943, + "grad_norm": 0.7325805425643921, + "learning_rate": 2.5677486894386118e-06, + "loss": 0.5728, + "step": 10694 + }, + { + "epoch": 2.9626038781163437, + "grad_norm": 0.7231216430664062, + "learning_rate": 2.5673845376448473e-06, + "loss": 0.5906, + "step": 10695 + }, + { + "epoch": 2.9628808864265928, + "grad_norm": 0.7547472715377808, + "learning_rate": 2.567020384420337e-06, + "loss": 0.6111, + "step": 10696 + }, + { + "epoch": 2.963157894736842, + "grad_norm": 0.7770267128944397, + "learning_rate": 2.566656229772812e-06, + "loss": 0.5794, + "step": 10697 + }, + { + "epoch": 2.9634349030470917, + "grad_norm": 0.7602913975715637, + "learning_rate": 2.566292073710006e-06, + "loss": 0.6388, + "step": 10698 + }, + { + "epoch": 2.9637119113573407, + "grad_norm": 0.752708375453949, + "learning_rate": 2.56592791623965e-06, + "loss": 0.5852, + "step": 10699 + }, + { + "epoch": 2.96398891966759, + "grad_norm": 0.7117756009101868, + "learning_rate": 2.565563757369475e-06, + "loss": 0.5784, + "step": 10700 + }, + { + "epoch": 2.9642659279778396, + "grad_norm": 0.7588861584663391, + "learning_rate": 2.565199597107216e-06, + "loss": 0.5937, + "step": 10701 + }, + { + "epoch": 2.9645429362880886, + "grad_norm": 0.7347100377082825, + "learning_rate": 2.5648354354606007e-06, + "loss": 0.6163, + "step": 10702 + }, + { + "epoch": 2.964819944598338, + "grad_norm": 0.751549482345581, + "learning_rate": 2.5644712724373648e-06, + "loss": 0.6389, + "step": 10703 + }, + { + "epoch": 2.965096952908587, + "grad_norm": 0.7782813906669617, + "learning_rate": 2.564107108045239e-06, + "loss": 0.6091, + "step": 10704 + }, + { + "epoch": 2.9653739612188366, + "grad_norm": 0.6835278868675232, + "learning_rate": 2.5637429422919556e-06, + "loss": 0.5734, + "step": 10705 + }, + { + "epoch": 2.965650969529086, + "grad_norm": 0.7344762086868286, + "learning_rate": 2.563378775185246e-06, + "loss": 0.5731, + "step": 10706 + }, + { + "epoch": 2.965927977839335, + "grad_norm": 0.7300355434417725, + "learning_rate": 2.563014606732844e-06, + "loss": 0.6025, + "step": 10707 + }, + { + "epoch": 2.9662049861495845, + "grad_norm": 0.7507689595222473, + "learning_rate": 2.562650436942481e-06, + "loss": 0.5756, + "step": 10708 + }, + { + "epoch": 2.966481994459834, + "grad_norm": 0.7238181233406067, + "learning_rate": 2.5622862658218893e-06, + "loss": 0.5627, + "step": 10709 + }, + { + "epoch": 2.966759002770083, + "grad_norm": 0.7482140064239502, + "learning_rate": 2.5619220933788007e-06, + "loss": 0.5917, + "step": 10710 + }, + { + "epoch": 2.9670360110803324, + "grad_norm": 0.7118654847145081, + "learning_rate": 2.5615579196209477e-06, + "loss": 0.595, + "step": 10711 + }, + { + "epoch": 2.9673130193905815, + "grad_norm": 0.7792865037918091, + "learning_rate": 2.5611937445560635e-06, + "loss": 0.5944, + "step": 10712 + }, + { + "epoch": 2.967590027700831, + "grad_norm": 0.7508601546287537, + "learning_rate": 2.5608295681918794e-06, + "loss": 0.6245, + "step": 10713 + }, + { + "epoch": 2.9678670360110804, + "grad_norm": 0.7758817672729492, + "learning_rate": 2.5604653905361286e-06, + "loss": 0.6158, + "step": 10714 + }, + { + "epoch": 2.9681440443213294, + "grad_norm": 0.7338311076164246, + "learning_rate": 2.5601012115965425e-06, + "loss": 0.557, + "step": 10715 + }, + { + "epoch": 2.968421052631579, + "grad_norm": 0.7640522122383118, + "learning_rate": 2.5597370313808552e-06, + "loss": 0.6413, + "step": 10716 + }, + { + "epoch": 2.9686980609418283, + "grad_norm": 0.7294602394104004, + "learning_rate": 2.5593728498967975e-06, + "loss": 0.5962, + "step": 10717 + }, + { + "epoch": 2.9689750692520773, + "grad_norm": 0.744451642036438, + "learning_rate": 2.5590086671521023e-06, + "loss": 0.5961, + "step": 10718 + }, + { + "epoch": 2.969252077562327, + "grad_norm": 0.7545126676559448, + "learning_rate": 2.5586444831545027e-06, + "loss": 0.5904, + "step": 10719 + }, + { + "epoch": 2.9695290858725762, + "grad_norm": 0.7299168705940247, + "learning_rate": 2.5582802979117316e-06, + "loss": 0.5959, + "step": 10720 + }, + { + "epoch": 2.9698060941828253, + "grad_norm": 0.7411100268363953, + "learning_rate": 2.55791611143152e-06, + "loss": 0.6012, + "step": 10721 + }, + { + "epoch": 2.9700831024930747, + "grad_norm": 0.7515658736228943, + "learning_rate": 2.557551923721602e-06, + "loss": 0.6036, + "step": 10722 + }, + { + "epoch": 2.970360110803324, + "grad_norm": 0.7271835207939148, + "learning_rate": 2.5571877347897096e-06, + "loss": 0.55, + "step": 10723 + }, + { + "epoch": 2.970637119113573, + "grad_norm": 0.7838777303695679, + "learning_rate": 2.5568235446435757e-06, + "loss": 0.5862, + "step": 10724 + }, + { + "epoch": 2.9709141274238227, + "grad_norm": 0.7445371150970459, + "learning_rate": 2.556459353290932e-06, + "loss": 0.6078, + "step": 10725 + }, + { + "epoch": 2.971191135734072, + "grad_norm": 0.7580215930938721, + "learning_rate": 2.556095160739513e-06, + "loss": 0.6034, + "step": 10726 + }, + { + "epoch": 2.971468144044321, + "grad_norm": 0.7328695058822632, + "learning_rate": 2.55573096699705e-06, + "loss": 0.5894, + "step": 10727 + }, + { + "epoch": 2.9717451523545706, + "grad_norm": 0.7478681802749634, + "learning_rate": 2.555366772071276e-06, + "loss": 0.5915, + "step": 10728 + }, + { + "epoch": 2.97202216066482, + "grad_norm": 0.7482941150665283, + "learning_rate": 2.555002575969924e-06, + "loss": 0.6263, + "step": 10729 + }, + { + "epoch": 2.972299168975069, + "grad_norm": 0.7541418075561523, + "learning_rate": 2.5546383787007276e-06, + "loss": 0.6189, + "step": 10730 + }, + { + "epoch": 2.9725761772853185, + "grad_norm": 0.7668107748031616, + "learning_rate": 2.5542741802714188e-06, + "loss": 0.6036, + "step": 10731 + }, + { + "epoch": 2.972853185595568, + "grad_norm": 0.770712673664093, + "learning_rate": 2.55390998068973e-06, + "loss": 0.5466, + "step": 10732 + }, + { + "epoch": 2.973130193905817, + "grad_norm": 0.801234245300293, + "learning_rate": 2.5535457799633957e-06, + "loss": 0.5953, + "step": 10733 + }, + { + "epoch": 2.9734072022160665, + "grad_norm": 0.708030104637146, + "learning_rate": 2.5531815781001462e-06, + "loss": 0.567, + "step": 10734 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.7175045013427734, + "learning_rate": 2.552817375107717e-06, + "loss": 0.5635, + "step": 10735 + }, + { + "epoch": 2.973961218836565, + "grad_norm": 0.7897862195968628, + "learning_rate": 2.55245317099384e-06, + "loss": 0.619, + "step": 10736 + }, + { + "epoch": 2.9742382271468144, + "grad_norm": 0.8118829727172852, + "learning_rate": 2.5520889657662482e-06, + "loss": 0.5977, + "step": 10737 + }, + { + "epoch": 2.974515235457064, + "grad_norm": 0.7560293078422546, + "learning_rate": 2.5517247594326744e-06, + "loss": 0.6059, + "step": 10738 + }, + { + "epoch": 2.974792243767313, + "grad_norm": 0.7857494354248047, + "learning_rate": 2.5513605520008516e-06, + "loss": 0.6039, + "step": 10739 + }, + { + "epoch": 2.9750692520775623, + "grad_norm": 0.7377294301986694, + "learning_rate": 2.550996343478514e-06, + "loss": 0.5649, + "step": 10740 + }, + { + "epoch": 2.975346260387812, + "grad_norm": 0.8070437908172607, + "learning_rate": 2.5506321338733924e-06, + "loss": 0.5845, + "step": 10741 + }, + { + "epoch": 2.975623268698061, + "grad_norm": 0.757997989654541, + "learning_rate": 2.5502679231932232e-06, + "loss": 0.5986, + "step": 10742 + }, + { + "epoch": 2.9759002770083103, + "grad_norm": 0.7478947639465332, + "learning_rate": 2.5499037114457358e-06, + "loss": 0.6166, + "step": 10743 + }, + { + "epoch": 2.9761772853185597, + "grad_norm": 0.7756154537200928, + "learning_rate": 2.549539498638666e-06, + "loss": 0.5843, + "step": 10744 + }, + { + "epoch": 2.9764542936288088, + "grad_norm": 0.8048810362815857, + "learning_rate": 2.549175284779746e-06, + "loss": 0.6279, + "step": 10745 + }, + { + "epoch": 2.976731301939058, + "grad_norm": 0.7790731191635132, + "learning_rate": 2.54881106987671e-06, + "loss": 0.5502, + "step": 10746 + }, + { + "epoch": 2.9770083102493077, + "grad_norm": 0.7683466076850891, + "learning_rate": 2.548446853937289e-06, + "loss": 0.5858, + "step": 10747 + }, + { + "epoch": 2.9772853185595567, + "grad_norm": 0.7681669592857361, + "learning_rate": 2.5480826369692178e-06, + "loss": 0.5647, + "step": 10748 + }, + { + "epoch": 2.977562326869806, + "grad_norm": 0.7260222434997559, + "learning_rate": 2.54771841898023e-06, + "loss": 0.6206, + "step": 10749 + }, + { + "epoch": 2.9778393351800556, + "grad_norm": 0.7287525534629822, + "learning_rate": 2.5473541999780583e-06, + "loss": 0.5813, + "step": 10750 + }, + { + "epoch": 2.9781163434903046, + "grad_norm": 0.7364113926887512, + "learning_rate": 2.5469899799704356e-06, + "loss": 0.5709, + "step": 10751 + }, + { + "epoch": 2.978393351800554, + "grad_norm": 0.7547378540039062, + "learning_rate": 2.546625758965096e-06, + "loss": 0.6324, + "step": 10752 + }, + { + "epoch": 2.9786703601108035, + "grad_norm": 0.7311744689941406, + "learning_rate": 2.5462615369697726e-06, + "loss": 0.5771, + "step": 10753 + }, + { + "epoch": 2.9789473684210526, + "grad_norm": 0.753720760345459, + "learning_rate": 2.5458973139921974e-06, + "loss": 0.5625, + "step": 10754 + }, + { + "epoch": 2.979224376731302, + "grad_norm": 0.7415842413902283, + "learning_rate": 2.5455330900401066e-06, + "loss": 0.5854, + "step": 10755 + }, + { + "epoch": 2.9795013850415515, + "grad_norm": 0.7530409693717957, + "learning_rate": 2.545168865121231e-06, + "loss": 0.5897, + "step": 10756 + }, + { + "epoch": 2.9797783933518005, + "grad_norm": 0.7591843008995056, + "learning_rate": 2.5448046392433057e-06, + "loss": 0.5495, + "step": 10757 + }, + { + "epoch": 2.98005540166205, + "grad_norm": 0.7462211847305298, + "learning_rate": 2.544440412414063e-06, + "loss": 0.5712, + "step": 10758 + }, + { + "epoch": 2.9803324099722994, + "grad_norm": 0.716187596321106, + "learning_rate": 2.544076184641237e-06, + "loss": 0.593, + "step": 10759 + }, + { + "epoch": 2.9806094182825484, + "grad_norm": 0.7935730814933777, + "learning_rate": 2.5437119559325606e-06, + "loss": 0.602, + "step": 10760 + }, + { + "epoch": 2.980886426592798, + "grad_norm": 0.7291024923324585, + "learning_rate": 2.5433477262957686e-06, + "loss": 0.5809, + "step": 10761 + }, + { + "epoch": 2.9811634349030474, + "grad_norm": 0.7451568841934204, + "learning_rate": 2.5429834957385937e-06, + "loss": 0.5973, + "step": 10762 + }, + { + "epoch": 2.9814404432132964, + "grad_norm": 0.8070505261421204, + "learning_rate": 2.5426192642687687e-06, + "loss": 0.6404, + "step": 10763 + }, + { + "epoch": 2.981717451523546, + "grad_norm": 0.7522998452186584, + "learning_rate": 2.5422550318940283e-06, + "loss": 0.58, + "step": 10764 + }, + { + "epoch": 2.981994459833795, + "grad_norm": 0.7420567274093628, + "learning_rate": 2.541890798622106e-06, + "loss": 0.5826, + "step": 10765 + }, + { + "epoch": 2.9822714681440443, + "grad_norm": 0.7697232961654663, + "learning_rate": 2.5415265644607338e-06, + "loss": 0.6362, + "step": 10766 + }, + { + "epoch": 2.9825484764542938, + "grad_norm": 0.7693185210227966, + "learning_rate": 2.5411623294176473e-06, + "loss": 0.6172, + "step": 10767 + }, + { + "epoch": 2.982825484764543, + "grad_norm": 0.6835063695907593, + "learning_rate": 2.54079809350058e-06, + "loss": 0.552, + "step": 10768 + }, + { + "epoch": 2.9831024930747922, + "grad_norm": 0.77583247423172, + "learning_rate": 2.5404338567172635e-06, + "loss": 0.6076, + "step": 10769 + }, + { + "epoch": 2.9833795013850417, + "grad_norm": 0.7879008054733276, + "learning_rate": 2.5400696190754347e-06, + "loss": 0.5839, + "step": 10770 + }, + { + "epoch": 2.9836565096952907, + "grad_norm": 0.7527600526809692, + "learning_rate": 2.5397053805828248e-06, + "loss": 0.5529, + "step": 10771 + }, + { + "epoch": 2.98393351800554, + "grad_norm": 0.7148284912109375, + "learning_rate": 2.5393411412471685e-06, + "loss": 0.6178, + "step": 10772 + }, + { + "epoch": 2.984210526315789, + "grad_norm": 0.8008612394332886, + "learning_rate": 2.538976901076199e-06, + "loss": 0.564, + "step": 10773 + }, + { + "epoch": 2.9844875346260387, + "grad_norm": 0.7103780508041382, + "learning_rate": 2.538612660077651e-06, + "loss": 0.5598, + "step": 10774 + }, + { + "epoch": 2.984764542936288, + "grad_norm": 0.7940399050712585, + "learning_rate": 2.5382484182592567e-06, + "loss": 0.6056, + "step": 10775 + }, + { + "epoch": 2.985041551246537, + "grad_norm": 0.7633231282234192, + "learning_rate": 2.537884175628751e-06, + "loss": 0.5574, + "step": 10776 + }, + { + "epoch": 2.9853185595567866, + "grad_norm": 0.7409139275550842, + "learning_rate": 2.537519932193868e-06, + "loss": 0.5227, + "step": 10777 + }, + { + "epoch": 2.985595567867036, + "grad_norm": 0.7334606051445007, + "learning_rate": 2.537155687962341e-06, + "loss": 0.6251, + "step": 10778 + }, + { + "epoch": 2.985872576177285, + "grad_norm": 0.7315794825553894, + "learning_rate": 2.536791442941903e-06, + "loss": 0.5971, + "step": 10779 + }, + { + "epoch": 2.9861495844875345, + "grad_norm": 0.735944390296936, + "learning_rate": 2.5364271971402893e-06, + "loss": 0.6195, + "step": 10780 + }, + { + "epoch": 2.986426592797784, + "grad_norm": 0.7769847512245178, + "learning_rate": 2.5360629505652334e-06, + "loss": 0.6028, + "step": 10781 + }, + { + "epoch": 2.986703601108033, + "grad_norm": 0.7395898103713989, + "learning_rate": 2.5356987032244686e-06, + "loss": 0.5689, + "step": 10782 + }, + { + "epoch": 2.9869806094182825, + "grad_norm": 0.7860537767410278, + "learning_rate": 2.535334455125729e-06, + "loss": 0.6261, + "step": 10783 + }, + { + "epoch": 2.987257617728532, + "grad_norm": 0.7746903300285339, + "learning_rate": 2.534970206276749e-06, + "loss": 0.5677, + "step": 10784 + }, + { + "epoch": 2.987534626038781, + "grad_norm": 0.7295857071876526, + "learning_rate": 2.5346059566852626e-06, + "loss": 0.5774, + "step": 10785 + }, + { + "epoch": 2.9878116343490304, + "grad_norm": 0.7481796741485596, + "learning_rate": 2.5342417063590023e-06, + "loss": 0.5934, + "step": 10786 + }, + { + "epoch": 2.98808864265928, + "grad_norm": 0.7683750987052917, + "learning_rate": 2.5338774553057044e-06, + "loss": 0.6032, + "step": 10787 + }, + { + "epoch": 2.988365650969529, + "grad_norm": 0.7441554665565491, + "learning_rate": 2.5335132035331006e-06, + "loss": 0.6215, + "step": 10788 + }, + { + "epoch": 2.9886426592797783, + "grad_norm": 0.729632556438446, + "learning_rate": 2.5331489510489265e-06, + "loss": 0.5929, + "step": 10789 + }, + { + "epoch": 2.988919667590028, + "grad_norm": 0.7185140252113342, + "learning_rate": 2.5327846978609154e-06, + "loss": 0.5782, + "step": 10790 + }, + { + "epoch": 2.989196675900277, + "grad_norm": 0.7277169227600098, + "learning_rate": 2.532420443976802e-06, + "loss": 0.5951, + "step": 10791 + }, + { + "epoch": 2.9894736842105263, + "grad_norm": 0.7362697720527649, + "learning_rate": 2.532056189404318e-06, + "loss": 0.5914, + "step": 10792 + }, + { + "epoch": 2.9897506925207757, + "grad_norm": 0.7093611359596252, + "learning_rate": 2.5316919341512013e-06, + "loss": 0.5739, + "step": 10793 + }, + { + "epoch": 2.9900277008310248, + "grad_norm": 0.7320445775985718, + "learning_rate": 2.531327678225183e-06, + "loss": 0.5517, + "step": 10794 + }, + { + "epoch": 2.990304709141274, + "grad_norm": 0.7347859740257263, + "learning_rate": 2.530963421633998e-06, + "loss": 0.564, + "step": 10795 + }, + { + "epoch": 2.9905817174515237, + "grad_norm": 0.7222632765769958, + "learning_rate": 2.5305991643853804e-06, + "loss": 0.599, + "step": 10796 + }, + { + "epoch": 2.9908587257617727, + "grad_norm": 0.7426832318305969, + "learning_rate": 2.5302349064870644e-06, + "loss": 0.5932, + "step": 10797 + }, + { + "epoch": 2.991135734072022, + "grad_norm": 0.7762872576713562, + "learning_rate": 2.5298706479467843e-06, + "loss": 0.5427, + "step": 10798 + }, + { + "epoch": 2.9914127423822716, + "grad_norm": 0.7226398587226868, + "learning_rate": 2.5295063887722742e-06, + "loss": 0.5753, + "step": 10799 + }, + { + "epoch": 2.9916897506925206, + "grad_norm": 0.7660290002822876, + "learning_rate": 2.529142128971268e-06, + "loss": 0.5759, + "step": 10800 + }, + { + "epoch": 2.99196675900277, + "grad_norm": 0.7273073792457581, + "learning_rate": 2.5287778685514998e-06, + "loss": 0.5764, + "step": 10801 + }, + { + "epoch": 2.9922437673130196, + "grad_norm": 0.7794416546821594, + "learning_rate": 2.5284136075207043e-06, + "loss": 0.6293, + "step": 10802 + }, + { + "epoch": 2.9925207756232686, + "grad_norm": 0.7536942362785339, + "learning_rate": 2.528049345886615e-06, + "loss": 0.5782, + "step": 10803 + }, + { + "epoch": 2.992797783933518, + "grad_norm": 0.7645159363746643, + "learning_rate": 2.5276850836569665e-06, + "loss": 0.6137, + "step": 10804 + }, + { + "epoch": 2.9930747922437675, + "grad_norm": 0.7383338212966919, + "learning_rate": 2.5273208208394927e-06, + "loss": 0.6231, + "step": 10805 + }, + { + "epoch": 2.9933518005540165, + "grad_norm": 0.7565742135047913, + "learning_rate": 2.5269565574419297e-06, + "loss": 0.5862, + "step": 10806 + }, + { + "epoch": 2.993628808864266, + "grad_norm": 0.772453248500824, + "learning_rate": 2.5265922934720082e-06, + "loss": 0.5827, + "step": 10807 + }, + { + "epoch": 2.9939058171745154, + "grad_norm": 0.751247227191925, + "learning_rate": 2.5262280289374652e-06, + "loss": 0.5273, + "step": 10808 + }, + { + "epoch": 2.9941828254847644, + "grad_norm": 0.7555593848228455, + "learning_rate": 2.525863763846034e-06, + "loss": 0.5786, + "step": 10809 + }, + { + "epoch": 2.994459833795014, + "grad_norm": 0.753018856048584, + "learning_rate": 2.5254994982054494e-06, + "loss": 0.5976, + "step": 10810 + }, + { + "epoch": 2.9947368421052634, + "grad_norm": 0.7415595650672913, + "learning_rate": 2.525135232023445e-06, + "loss": 0.5904, + "step": 10811 + }, + { + "epoch": 2.9950138504155124, + "grad_norm": 0.7019715905189514, + "learning_rate": 2.524770965307756e-06, + "loss": 0.5546, + "step": 10812 + }, + { + "epoch": 2.995290858725762, + "grad_norm": 0.7442302703857422, + "learning_rate": 2.5244066980661154e-06, + "loss": 0.5846, + "step": 10813 + }, + { + "epoch": 2.9955678670360113, + "grad_norm": 0.7354565262794495, + "learning_rate": 2.524042430306258e-06, + "loss": 0.6103, + "step": 10814 + }, + { + "epoch": 2.9958448753462603, + "grad_norm": 0.755585253238678, + "learning_rate": 2.5236781620359196e-06, + "loss": 0.6171, + "step": 10815 + }, + { + "epoch": 2.9961218836565098, + "grad_norm": 0.7500039935112, + "learning_rate": 2.5233138932628326e-06, + "loss": 0.5986, + "step": 10816 + }, + { + "epoch": 2.9963988919667592, + "grad_norm": 0.7086367011070251, + "learning_rate": 2.522949623994732e-06, + "loss": 0.6021, + "step": 10817 + }, + { + "epoch": 2.9966759002770083, + "grad_norm": 0.7707514762878418, + "learning_rate": 2.522585354239353e-06, + "loss": 0.6043, + "step": 10818 + }, + { + "epoch": 2.9969529085872577, + "grad_norm": 0.7526091933250427, + "learning_rate": 2.5222210840044287e-06, + "loss": 0.5745, + "step": 10819 + }, + { + "epoch": 2.997229916897507, + "grad_norm": 0.7560510635375977, + "learning_rate": 2.521856813297694e-06, + "loss": 0.6333, + "step": 10820 + }, + { + "epoch": 2.997506925207756, + "grad_norm": 0.7667151093482971, + "learning_rate": 2.521492542126883e-06, + "loss": 0.6159, + "step": 10821 + }, + { + "epoch": 2.9977839335180057, + "grad_norm": 0.7421969771385193, + "learning_rate": 2.5211282704997314e-06, + "loss": 0.6622, + "step": 10822 + }, + { + "epoch": 2.998060941828255, + "grad_norm": 0.7187398672103882, + "learning_rate": 2.5207639984239723e-06, + "loss": 0.5686, + "step": 10823 + }, + { + "epoch": 2.998337950138504, + "grad_norm": 0.7653238773345947, + "learning_rate": 2.5203997259073398e-06, + "loss": 0.5997, + "step": 10824 + }, + { + "epoch": 2.9986149584487536, + "grad_norm": 0.6978490352630615, + "learning_rate": 2.5200354529575693e-06, + "loss": 0.6116, + "step": 10825 + }, + { + "epoch": 2.998891966759003, + "grad_norm": 0.749488115310669, + "learning_rate": 2.5196711795823957e-06, + "loss": 0.5505, + "step": 10826 + }, + { + "epoch": 2.999168975069252, + "grad_norm": 0.7787477374076843, + "learning_rate": 2.519306905789552e-06, + "loss": 0.6134, + "step": 10827 + }, + { + "epoch": 2.9994459833795015, + "grad_norm": 0.7261177897453308, + "learning_rate": 2.518942631586774e-06, + "loss": 0.5708, + "step": 10828 + }, + { + "epoch": 2.9997229916897505, + "grad_norm": 0.7337374091148376, + "learning_rate": 2.5185783569817944e-06, + "loss": 0.5427, + "step": 10829 + }, + { + "epoch": 3.0, + "grad_norm": 0.7540554404258728, + "learning_rate": 2.5182140819823503e-06, + "loss": 0.5834, + "step": 10830 + } + ], + "logging_steps": 1, + "max_steps": 21660, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 3610, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.426074873341149e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}