diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,43987 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 6279, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004778687055731438, + "grad_norm": 6.597299531195726, + "learning_rate": 0.0, + "loss": 0.9182, + "step": 1 + }, + { + "epoch": 0.0009557374111462876, + "grad_norm": 6.798268123961526, + "learning_rate": 6.369426751592358e-08, + "loss": 0.9348, + "step": 2 + }, + { + "epoch": 0.0014336061167194312, + "grad_norm": 6.547375642053667, + "learning_rate": 1.2738853503184715e-07, + "loss": 0.9002, + "step": 3 + }, + { + "epoch": 0.0019114748222925752, + "grad_norm": 6.714041140675801, + "learning_rate": 1.9108280254777072e-07, + "loss": 0.9221, + "step": 4 + }, + { + "epoch": 0.002389343527865719, + "grad_norm": 6.482638024201559, + "learning_rate": 2.547770700636943e-07, + "loss": 0.8989, + "step": 5 + }, + { + "epoch": 0.0028672122334388625, + "grad_norm": 6.628677060072881, + "learning_rate": 3.1847133757961787e-07, + "loss": 0.9246, + "step": 6 + }, + { + "epoch": 0.0033450809390120064, + "grad_norm": 6.54934804614698, + "learning_rate": 3.8216560509554143e-07, + "loss": 0.9118, + "step": 7 + }, + { + "epoch": 0.0038229496445851504, + "grad_norm": 6.822862171641388, + "learning_rate": 4.45859872611465e-07, + "loss": 0.9453, + "step": 8 + }, + { + "epoch": 0.004300818350158294, + "grad_norm": 6.436429750787215, + "learning_rate": 5.095541401273886e-07, + "loss": 0.8919, + "step": 9 + }, + { + "epoch": 0.004778687055731438, + "grad_norm": 6.20546872502828, + "learning_rate": 5.732484076433121e-07, + "loss": 0.9081, + "step": 10 + }, + { + "epoch": 0.0052565557613045814, + "grad_norm": 6.1990164932210305, + "learning_rate": 6.369426751592357e-07, + "loss": 0.9152, + "step": 11 + }, + { + "epoch": 0.005734424466877725, + "grad_norm": 6.213474019663624, + "learning_rate": 7.006369426751592e-07, + "loss": 0.9127, + "step": 12 + }, + { + "epoch": 0.006212293172450869, + "grad_norm": 4.774163980815126, + "learning_rate": 7.643312101910829e-07, + "loss": 0.8724, + "step": 13 + }, + { + "epoch": 0.006690161878024013, + "grad_norm": 4.635856494922676, + "learning_rate": 8.280254777070064e-07, + "loss": 0.8794, + "step": 14 + }, + { + "epoch": 0.007168030583597156, + "grad_norm": 4.4643552624491925, + "learning_rate": 8.9171974522293e-07, + "loss": 0.8574, + "step": 15 + }, + { + "epoch": 0.007645899289170301, + "grad_norm": 4.746112409024538, + "learning_rate": 9.554140127388537e-07, + "loss": 0.9037, + "step": 16 + }, + { + "epoch": 0.008123767994743444, + "grad_norm": 3.6663226655371237, + "learning_rate": 1.0191082802547772e-06, + "loss": 0.8572, + "step": 17 + }, + { + "epoch": 0.008601636700316589, + "grad_norm": 3.161655587285928, + "learning_rate": 1.0828025477707007e-06, + "loss": 0.8504, + "step": 18 + }, + { + "epoch": 0.009079505405889731, + "grad_norm": 3.023372031668388, + "learning_rate": 1.1464968152866242e-06, + "loss": 0.8404, + "step": 19 + }, + { + "epoch": 0.009557374111462876, + "grad_norm": 2.8123940051382466, + "learning_rate": 1.210191082802548e-06, + "loss": 0.8233, + "step": 20 + }, + { + "epoch": 0.01003524281703602, + "grad_norm": 2.3319495260663037, + "learning_rate": 1.2738853503184715e-06, + "loss": 0.8007, + "step": 21 + }, + { + "epoch": 0.010513111522609163, + "grad_norm": 2.1046299310129926, + "learning_rate": 1.337579617834395e-06, + "loss": 0.7893, + "step": 22 + }, + { + "epoch": 0.010990980228182307, + "grad_norm": 1.970451770951646, + "learning_rate": 1.4012738853503185e-06, + "loss": 0.8091, + "step": 23 + }, + { + "epoch": 0.01146884893375545, + "grad_norm": 2.2852013132038262, + "learning_rate": 1.4649681528662422e-06, + "loss": 0.761, + "step": 24 + }, + { + "epoch": 0.011946717639328594, + "grad_norm": 2.3125203117243527, + "learning_rate": 1.5286624203821657e-06, + "loss": 0.7576, + "step": 25 + }, + { + "epoch": 0.012424586344901739, + "grad_norm": 2.485516736089154, + "learning_rate": 1.5923566878980892e-06, + "loss": 0.7298, + "step": 26 + }, + { + "epoch": 0.012902455050474881, + "grad_norm": 2.328181944771006, + "learning_rate": 1.6560509554140127e-06, + "loss": 0.7241, + "step": 27 + }, + { + "epoch": 0.013380323756048026, + "grad_norm": 2.1332096308422592, + "learning_rate": 1.7197452229299363e-06, + "loss": 0.7561, + "step": 28 + }, + { + "epoch": 0.01385819246162117, + "grad_norm": 1.9124393579077286, + "learning_rate": 1.78343949044586e-06, + "loss": 0.7421, + "step": 29 + }, + { + "epoch": 0.014336061167194313, + "grad_norm": 1.868955513961223, + "learning_rate": 1.8471337579617835e-06, + "loss": 0.6917, + "step": 30 + }, + { + "epoch": 0.014813929872767457, + "grad_norm": 1.5698888685595818, + "learning_rate": 1.9108280254777074e-06, + "loss": 0.7297, + "step": 31 + }, + { + "epoch": 0.015291798578340602, + "grad_norm": 1.2265156599136022, + "learning_rate": 1.974522292993631e-06, + "loss": 0.698, + "step": 32 + }, + { + "epoch": 0.015769667283913744, + "grad_norm": 1.3091022856057957, + "learning_rate": 2.0382165605095544e-06, + "loss": 0.6542, + "step": 33 + }, + { + "epoch": 0.01624753598948689, + "grad_norm": 1.45790978867409, + "learning_rate": 2.101910828025478e-06, + "loss": 0.6759, + "step": 34 + }, + { + "epoch": 0.016725404695060033, + "grad_norm": 1.482461582005205, + "learning_rate": 2.1656050955414015e-06, + "loss": 0.6654, + "step": 35 + }, + { + "epoch": 0.017203273400633178, + "grad_norm": 1.3245355619344652, + "learning_rate": 2.229299363057325e-06, + "loss": 0.6541, + "step": 36 + }, + { + "epoch": 0.01768114210620632, + "grad_norm": 1.1472782080031219, + "learning_rate": 2.2929936305732485e-06, + "loss": 0.6515, + "step": 37 + }, + { + "epoch": 0.018159010811779463, + "grad_norm": 1.0289175415890455, + "learning_rate": 2.356687898089172e-06, + "loss": 0.6299, + "step": 38 + }, + { + "epoch": 0.018636879517352607, + "grad_norm": 0.9574361352811592, + "learning_rate": 2.420382165605096e-06, + "loss": 0.6258, + "step": 39 + }, + { + "epoch": 0.01911474822292575, + "grad_norm": 0.9277275080285016, + "learning_rate": 2.4840764331210194e-06, + "loss": 0.6337, + "step": 40 + }, + { + "epoch": 0.019592616928498896, + "grad_norm": 0.8945726217645557, + "learning_rate": 2.547770700636943e-06, + "loss": 0.6198, + "step": 41 + }, + { + "epoch": 0.02007048563407204, + "grad_norm": 0.8774284132253958, + "learning_rate": 2.6114649681528665e-06, + "loss": 0.6102, + "step": 42 + }, + { + "epoch": 0.02054835433964518, + "grad_norm": 0.9050997026733555, + "learning_rate": 2.67515923566879e-06, + "loss": 0.6221, + "step": 43 + }, + { + "epoch": 0.021026223045218326, + "grad_norm": 0.8632323727229961, + "learning_rate": 2.7388535031847135e-06, + "loss": 0.622, + "step": 44 + }, + { + "epoch": 0.02150409175079147, + "grad_norm": 0.813517779384421, + "learning_rate": 2.802547770700637e-06, + "loss": 0.6089, + "step": 45 + }, + { + "epoch": 0.021981960456364615, + "grad_norm": 0.8186427508339532, + "learning_rate": 2.8662420382165605e-06, + "loss": 0.6016, + "step": 46 + }, + { + "epoch": 0.02245982916193776, + "grad_norm": 0.8174387059211783, + "learning_rate": 2.9299363057324844e-06, + "loss": 0.5931, + "step": 47 + }, + { + "epoch": 0.0229376978675109, + "grad_norm": 1.305617651951218, + "learning_rate": 2.993630573248408e-06, + "loss": 0.5692, + "step": 48 + }, + { + "epoch": 0.023415566573084044, + "grad_norm": 0.8380237368547312, + "learning_rate": 3.0573248407643314e-06, + "loss": 0.5887, + "step": 49 + }, + { + "epoch": 0.02389343527865719, + "grad_norm": 0.8200238530137682, + "learning_rate": 3.121019108280255e-06, + "loss": 0.5929, + "step": 50 + }, + { + "epoch": 0.024371303984230333, + "grad_norm": 0.8403778738693355, + "learning_rate": 3.1847133757961785e-06, + "loss": 0.6014, + "step": 51 + }, + { + "epoch": 0.024849172689803477, + "grad_norm": 0.9908747877734703, + "learning_rate": 3.248407643312102e-06, + "loss": 0.5735, + "step": 52 + }, + { + "epoch": 0.025327041395376622, + "grad_norm": 0.8075484398827105, + "learning_rate": 3.3121019108280255e-06, + "loss": 0.5593, + "step": 53 + }, + { + "epoch": 0.025804910100949763, + "grad_norm": 0.7626574710843891, + "learning_rate": 3.375796178343949e-06, + "loss": 0.5806, + "step": 54 + }, + { + "epoch": 0.026282778806522907, + "grad_norm": 0.836758410598703, + "learning_rate": 3.4394904458598725e-06, + "loss": 0.5768, + "step": 55 + }, + { + "epoch": 0.02676064751209605, + "grad_norm": 0.7496247956447741, + "learning_rate": 3.5031847133757964e-06, + "loss": 0.5815, + "step": 56 + }, + { + "epoch": 0.027238516217669196, + "grad_norm": 0.7950751813645308, + "learning_rate": 3.56687898089172e-06, + "loss": 0.5893, + "step": 57 + }, + { + "epoch": 0.02771638492324234, + "grad_norm": 0.7934523889506495, + "learning_rate": 3.6305732484076435e-06, + "loss": 0.5675, + "step": 58 + }, + { + "epoch": 0.02819425362881548, + "grad_norm": 0.7469734117900833, + "learning_rate": 3.694267515923567e-06, + "loss": 0.5679, + "step": 59 + }, + { + "epoch": 0.028672122334388626, + "grad_norm": 0.7390161251771473, + "learning_rate": 3.757961783439491e-06, + "loss": 0.5464, + "step": 60 + }, + { + "epoch": 0.02914999103996177, + "grad_norm": 0.756427918659596, + "learning_rate": 3.821656050955415e-06, + "loss": 0.5422, + "step": 61 + }, + { + "epoch": 0.029627859745534914, + "grad_norm": 0.7389039375217277, + "learning_rate": 3.885350318471338e-06, + "loss": 0.5551, + "step": 62 + }, + { + "epoch": 0.03010572845110806, + "grad_norm": 0.7367851829841032, + "learning_rate": 3.949044585987262e-06, + "loss": 0.5449, + "step": 63 + }, + { + "epoch": 0.030583597156681203, + "grad_norm": 0.7123685102640923, + "learning_rate": 4.012738853503185e-06, + "loss": 0.5458, + "step": 64 + }, + { + "epoch": 0.031061465862254344, + "grad_norm": 0.7357352408857197, + "learning_rate": 4.076433121019109e-06, + "loss": 0.5341, + "step": 65 + }, + { + "epoch": 0.03153933456782749, + "grad_norm": 0.7214843181688534, + "learning_rate": 4.140127388535032e-06, + "loss": 0.5459, + "step": 66 + }, + { + "epoch": 0.032017203273400636, + "grad_norm": 0.7467241250442314, + "learning_rate": 4.203821656050956e-06, + "loss": 0.5396, + "step": 67 + }, + { + "epoch": 0.03249507197897378, + "grad_norm": 0.7727746910699702, + "learning_rate": 4.26751592356688e-06, + "loss": 0.5472, + "step": 68 + }, + { + "epoch": 0.03297294068454692, + "grad_norm": 0.7251863973183906, + "learning_rate": 4.331210191082803e-06, + "loss": 0.5597, + "step": 69 + }, + { + "epoch": 0.033450809390120066, + "grad_norm": 0.7272882874909277, + "learning_rate": 4.394904458598727e-06, + "loss": 0.5556, + "step": 70 + }, + { + "epoch": 0.03392867809569321, + "grad_norm": 0.8391711398530735, + "learning_rate": 4.45859872611465e-06, + "loss": 0.5496, + "step": 71 + }, + { + "epoch": 0.034406546801266355, + "grad_norm": 0.9360566101078907, + "learning_rate": 4.522292993630574e-06, + "loss": 0.5432, + "step": 72 + }, + { + "epoch": 0.034884415506839496, + "grad_norm": 0.7797140366402956, + "learning_rate": 4.585987261146497e-06, + "loss": 0.5395, + "step": 73 + }, + { + "epoch": 0.03536228421241264, + "grad_norm": 0.707051554509993, + "learning_rate": 4.649681528662421e-06, + "loss": 0.5311, + "step": 74 + }, + { + "epoch": 0.035840152917985785, + "grad_norm": 0.7834053957637365, + "learning_rate": 4.713375796178344e-06, + "loss": 0.5487, + "step": 75 + }, + { + "epoch": 0.036318021623558926, + "grad_norm": 0.7935586931488725, + "learning_rate": 4.777070063694268e-06, + "loss": 0.5327, + "step": 76 + }, + { + "epoch": 0.036795890329132074, + "grad_norm": 0.770263498050658, + "learning_rate": 4.840764331210192e-06, + "loss": 0.5146, + "step": 77 + }, + { + "epoch": 0.037273759034705214, + "grad_norm": 0.9115020793198642, + "learning_rate": 4.904458598726115e-06, + "loss": 0.5438, + "step": 78 + }, + { + "epoch": 0.037751627740278355, + "grad_norm": 0.8472805895573531, + "learning_rate": 4.968152866242039e-06, + "loss": 0.549, + "step": 79 + }, + { + "epoch": 0.0382294964458515, + "grad_norm": 0.7542037051233978, + "learning_rate": 5.031847133757962e-06, + "loss": 0.5351, + "step": 80 + }, + { + "epoch": 0.038707365151424644, + "grad_norm": 0.747781469023545, + "learning_rate": 5.095541401273886e-06, + "loss": 0.5084, + "step": 81 + }, + { + "epoch": 0.03918523385699779, + "grad_norm": 0.7476744681713057, + "learning_rate": 5.159235668789809e-06, + "loss": 0.5245, + "step": 82 + }, + { + "epoch": 0.03966310256257093, + "grad_norm": 0.7768862389201611, + "learning_rate": 5.222929936305733e-06, + "loss": 0.5271, + "step": 83 + }, + { + "epoch": 0.04014097126814408, + "grad_norm": 0.7111007495704235, + "learning_rate": 5.286624203821657e-06, + "loss": 0.5266, + "step": 84 + }, + { + "epoch": 0.04061883997371722, + "grad_norm": 0.732731140034443, + "learning_rate": 5.35031847133758e-06, + "loss": 0.5263, + "step": 85 + }, + { + "epoch": 0.04109670867929036, + "grad_norm": 0.8652091302250842, + "learning_rate": 5.414012738853504e-06, + "loss": 0.5273, + "step": 86 + }, + { + "epoch": 0.04157457738486351, + "grad_norm": 0.737707359165254, + "learning_rate": 5.477707006369427e-06, + "loss": 0.5061, + "step": 87 + }, + { + "epoch": 0.04205244609043665, + "grad_norm": 0.8065815895775212, + "learning_rate": 5.541401273885351e-06, + "loss": 0.5098, + "step": 88 + }, + { + "epoch": 0.0425303147960098, + "grad_norm": 0.7266312168372544, + "learning_rate": 5.605095541401274e-06, + "loss": 0.4999, + "step": 89 + }, + { + "epoch": 0.04300818350158294, + "grad_norm": 0.7293962461070471, + "learning_rate": 5.668789808917198e-06, + "loss": 0.5213, + "step": 90 + }, + { + "epoch": 0.04348605220715608, + "grad_norm": 0.7774805752991096, + "learning_rate": 5.732484076433121e-06, + "loss": 0.4995, + "step": 91 + }, + { + "epoch": 0.04396392091272923, + "grad_norm": 0.7530825142450961, + "learning_rate": 5.796178343949045e-06, + "loss": 0.5365, + "step": 92 + }, + { + "epoch": 0.04444178961830237, + "grad_norm": 0.6739843506427442, + "learning_rate": 5.859872611464969e-06, + "loss": 0.5146, + "step": 93 + }, + { + "epoch": 0.04491965832387552, + "grad_norm": 0.7872225186802001, + "learning_rate": 5.923566878980892e-06, + "loss": 0.5171, + "step": 94 + }, + { + "epoch": 0.04539752702944866, + "grad_norm": 0.7593936458726057, + "learning_rate": 5.987261146496816e-06, + "loss": 0.5075, + "step": 95 + }, + { + "epoch": 0.0458753957350218, + "grad_norm": 0.7402231844428684, + "learning_rate": 6.050955414012739e-06, + "loss": 0.5219, + "step": 96 + }, + { + "epoch": 0.04635326444059495, + "grad_norm": 0.8456741711717691, + "learning_rate": 6.114649681528663e-06, + "loss": 0.5179, + "step": 97 + }, + { + "epoch": 0.04683113314616809, + "grad_norm": 0.7479830972685164, + "learning_rate": 6.178343949044586e-06, + "loss": 0.516, + "step": 98 + }, + { + "epoch": 0.047309001851741236, + "grad_norm": 0.7472523172623668, + "learning_rate": 6.24203821656051e-06, + "loss": 0.498, + "step": 99 + }, + { + "epoch": 0.04778687055731438, + "grad_norm": 0.8971442569968484, + "learning_rate": 6.305732484076433e-06, + "loss": 0.51, + "step": 100 + }, + { + "epoch": 0.04826473926288752, + "grad_norm": 0.7564195148457464, + "learning_rate": 6.369426751592357e-06, + "loss": 0.5198, + "step": 101 + }, + { + "epoch": 0.048742607968460666, + "grad_norm": 1.0620219552470769, + "learning_rate": 6.433121019108281e-06, + "loss": 0.5, + "step": 102 + }, + { + "epoch": 0.04922047667403381, + "grad_norm": 0.7898571941764354, + "learning_rate": 6.496815286624204e-06, + "loss": 0.5154, + "step": 103 + }, + { + "epoch": 0.049698345379606955, + "grad_norm": 0.7334402709211295, + "learning_rate": 6.560509554140128e-06, + "loss": 0.499, + "step": 104 + }, + { + "epoch": 0.050176214085180096, + "grad_norm": 0.7398643150657089, + "learning_rate": 6.624203821656051e-06, + "loss": 0.506, + "step": 105 + }, + { + "epoch": 0.050654082790753244, + "grad_norm": 0.7563474458560936, + "learning_rate": 6.687898089171975e-06, + "loss": 0.5153, + "step": 106 + }, + { + "epoch": 0.051131951496326385, + "grad_norm": 0.7767148986192806, + "learning_rate": 6.751592356687898e-06, + "loss": 0.4994, + "step": 107 + }, + { + "epoch": 0.051609820201899526, + "grad_norm": 0.7746503252727361, + "learning_rate": 6.815286624203822e-06, + "loss": 0.4994, + "step": 108 + }, + { + "epoch": 0.05208768890747267, + "grad_norm": 0.8270879600000823, + "learning_rate": 6.878980891719745e-06, + "loss": 0.4912, + "step": 109 + }, + { + "epoch": 0.052565557613045814, + "grad_norm": 0.7271341530708423, + "learning_rate": 6.942675159235669e-06, + "loss": 0.5038, + "step": 110 + }, + { + "epoch": 0.05304342631861896, + "grad_norm": 0.7909333113850586, + "learning_rate": 7.006369426751593e-06, + "loss": 0.4965, + "step": 111 + }, + { + "epoch": 0.0535212950241921, + "grad_norm": 1.1920708554598451, + "learning_rate": 7.070063694267516e-06, + "loss": 0.4835, + "step": 112 + }, + { + "epoch": 0.053999163729765244, + "grad_norm": 1.1437207443713215, + "learning_rate": 7.13375796178344e-06, + "loss": 0.4871, + "step": 113 + }, + { + "epoch": 0.05447703243533839, + "grad_norm": 0.7995131778249069, + "learning_rate": 7.197452229299363e-06, + "loss": 0.5125, + "step": 114 + }, + { + "epoch": 0.05495490114091153, + "grad_norm": 0.7658107271492185, + "learning_rate": 7.261146496815287e-06, + "loss": 0.5043, + "step": 115 + }, + { + "epoch": 0.05543276984648468, + "grad_norm": 0.922334411856167, + "learning_rate": 7.32484076433121e-06, + "loss": 0.4989, + "step": 116 + }, + { + "epoch": 0.05591063855205782, + "grad_norm": 0.7836377405415712, + "learning_rate": 7.388535031847134e-06, + "loss": 0.4899, + "step": 117 + }, + { + "epoch": 0.05638850725763096, + "grad_norm": 0.8764338235424465, + "learning_rate": 7.452229299363057e-06, + "loss": 0.5001, + "step": 118 + }, + { + "epoch": 0.05686637596320411, + "grad_norm": 0.8139065283378535, + "learning_rate": 7.515923566878982e-06, + "loss": 0.5079, + "step": 119 + }, + { + "epoch": 0.05734424466877725, + "grad_norm": 0.8340053526116425, + "learning_rate": 7.579617834394906e-06, + "loss": 0.5087, + "step": 120 + }, + { + "epoch": 0.0578221133743504, + "grad_norm": 0.9688304321284679, + "learning_rate": 7.64331210191083e-06, + "loss": 0.4945, + "step": 121 + }, + { + "epoch": 0.05829998207992354, + "grad_norm": 0.8501127062676569, + "learning_rate": 7.707006369426753e-06, + "loss": 0.4736, + "step": 122 + }, + { + "epoch": 0.05877785078549669, + "grad_norm": 0.8146384000193851, + "learning_rate": 7.770700636942676e-06, + "loss": 0.4917, + "step": 123 + }, + { + "epoch": 0.05925571949106983, + "grad_norm": 0.8905411656723172, + "learning_rate": 7.8343949044586e-06, + "loss": 0.4931, + "step": 124 + }, + { + "epoch": 0.05973358819664297, + "grad_norm": 0.8610319305755783, + "learning_rate": 7.898089171974524e-06, + "loss": 0.5019, + "step": 125 + }, + { + "epoch": 0.06021145690221612, + "grad_norm": 0.8674477387963704, + "learning_rate": 7.961783439490447e-06, + "loss": 0.4896, + "step": 126 + }, + { + "epoch": 0.06068932560778926, + "grad_norm": 0.8577218357744752, + "learning_rate": 8.02547770700637e-06, + "loss": 0.4811, + "step": 127 + }, + { + "epoch": 0.06116719431336241, + "grad_norm": 0.8448292497264104, + "learning_rate": 8.089171974522295e-06, + "loss": 0.504, + "step": 128 + }, + { + "epoch": 0.06164506301893555, + "grad_norm": 0.937833097972482, + "learning_rate": 8.152866242038218e-06, + "loss": 0.4962, + "step": 129 + }, + { + "epoch": 0.06212293172450869, + "grad_norm": 0.7840157899279157, + "learning_rate": 8.21656050955414e-06, + "loss": 0.4985, + "step": 130 + }, + { + "epoch": 0.06260080043008183, + "grad_norm": 0.8889842008188884, + "learning_rate": 8.280254777070064e-06, + "loss": 0.4917, + "step": 131 + }, + { + "epoch": 0.06307866913565498, + "grad_norm": 0.8118606083274249, + "learning_rate": 8.343949044585989e-06, + "loss": 0.5121, + "step": 132 + }, + { + "epoch": 0.06355653784122813, + "grad_norm": 0.8160111895836885, + "learning_rate": 8.407643312101912e-06, + "loss": 0.4975, + "step": 133 + }, + { + "epoch": 0.06403440654680127, + "grad_norm": 0.8257529923932002, + "learning_rate": 8.471337579617835e-06, + "loss": 0.4905, + "step": 134 + }, + { + "epoch": 0.06451227525237441, + "grad_norm": 0.9316496549034691, + "learning_rate": 8.53503184713376e-06, + "loss": 0.4922, + "step": 135 + }, + { + "epoch": 0.06499014395794755, + "grad_norm": 3.6885784250742777, + "learning_rate": 8.598726114649683e-06, + "loss": 0.5088, + "step": 136 + }, + { + "epoch": 0.0654680126635207, + "grad_norm": 1.0431096223978598, + "learning_rate": 8.662420382165606e-06, + "loss": 0.4851, + "step": 137 + }, + { + "epoch": 0.06594588136909384, + "grad_norm": 0.9301103008954675, + "learning_rate": 8.726114649681529e-06, + "loss": 0.5035, + "step": 138 + }, + { + "epoch": 0.06642375007466698, + "grad_norm": 0.961097839927529, + "learning_rate": 8.789808917197454e-06, + "loss": 0.5035, + "step": 139 + }, + { + "epoch": 0.06690161878024013, + "grad_norm": 0.8559608590796796, + "learning_rate": 8.853503184713377e-06, + "loss": 0.4786, + "step": 140 + }, + { + "epoch": 0.06737948748581327, + "grad_norm": 0.8702110473814613, + "learning_rate": 8.9171974522293e-06, + "loss": 0.4769, + "step": 141 + }, + { + "epoch": 0.06785735619138641, + "grad_norm": 0.89503054543084, + "learning_rate": 8.980891719745225e-06, + "loss": 0.4936, + "step": 142 + }, + { + "epoch": 0.06833522489695956, + "grad_norm": 0.9405145862374199, + "learning_rate": 9.044585987261148e-06, + "loss": 0.4994, + "step": 143 + }, + { + "epoch": 0.06881309360253271, + "grad_norm": 0.8359505048050953, + "learning_rate": 9.10828025477707e-06, + "loss": 0.4888, + "step": 144 + }, + { + "epoch": 0.06929096230810584, + "grad_norm": 0.8335744765307723, + "learning_rate": 9.171974522292994e-06, + "loss": 0.486, + "step": 145 + }, + { + "epoch": 0.06976883101367899, + "grad_norm": 0.9191064552692454, + "learning_rate": 9.235668789808919e-06, + "loss": 0.4936, + "step": 146 + }, + { + "epoch": 0.07024669971925214, + "grad_norm": 0.8613976280696265, + "learning_rate": 9.299363057324842e-06, + "loss": 0.4892, + "step": 147 + }, + { + "epoch": 0.07072456842482527, + "grad_norm": 0.9085851257000545, + "learning_rate": 9.363057324840765e-06, + "loss": 0.4959, + "step": 148 + }, + { + "epoch": 0.07120243713039842, + "grad_norm": 0.8498504776265091, + "learning_rate": 9.426751592356688e-06, + "loss": 0.4799, + "step": 149 + }, + { + "epoch": 0.07168030583597157, + "grad_norm": 0.981381083456667, + "learning_rate": 9.490445859872613e-06, + "loss": 0.4759, + "step": 150 + }, + { + "epoch": 0.07215817454154472, + "grad_norm": 0.956769830775876, + "learning_rate": 9.554140127388536e-06, + "loss": 0.4898, + "step": 151 + }, + { + "epoch": 0.07263604324711785, + "grad_norm": 1.0038255875882836, + "learning_rate": 9.617834394904459e-06, + "loss": 0.4775, + "step": 152 + }, + { + "epoch": 0.073113911952691, + "grad_norm": 0.8713370885352425, + "learning_rate": 9.681528662420384e-06, + "loss": 0.4969, + "step": 153 + }, + { + "epoch": 0.07359178065826415, + "grad_norm": 0.8141213363595519, + "learning_rate": 9.745222929936307e-06, + "loss": 0.4705, + "step": 154 + }, + { + "epoch": 0.07406964936383728, + "grad_norm": 3.751367127935264, + "learning_rate": 9.80891719745223e-06, + "loss": 0.4864, + "step": 155 + }, + { + "epoch": 0.07454751806941043, + "grad_norm": 1.1148603281454004, + "learning_rate": 9.872611464968153e-06, + "loss": 0.4822, + "step": 156 + }, + { + "epoch": 0.07502538677498358, + "grad_norm": 0.8287232413246658, + "learning_rate": 9.936305732484078e-06, + "loss": 0.4642, + "step": 157 + }, + { + "epoch": 0.07550325548055671, + "grad_norm": 0.8915041269681552, + "learning_rate": 1e-05, + "loss": 0.4768, + "step": 158 + }, + { + "epoch": 0.07598112418612986, + "grad_norm": 0.873722784678586, + "learning_rate": 1.0063694267515924e-05, + "loss": 0.4808, + "step": 159 + }, + { + "epoch": 0.076458992891703, + "grad_norm": 0.8676339202348703, + "learning_rate": 1.0127388535031849e-05, + "loss": 0.4916, + "step": 160 + }, + { + "epoch": 0.07693686159727615, + "grad_norm": 0.9305589175210003, + "learning_rate": 1.0191082802547772e-05, + "loss": 0.4747, + "step": 161 + }, + { + "epoch": 0.07741473030284929, + "grad_norm": 0.8519987127043529, + "learning_rate": 1.0254777070063695e-05, + "loss": 0.4756, + "step": 162 + }, + { + "epoch": 0.07789259900842244, + "grad_norm": 0.8823350335352326, + "learning_rate": 1.0318471337579618e-05, + "loss": 0.4662, + "step": 163 + }, + { + "epoch": 0.07837046771399558, + "grad_norm": 0.7500687237850792, + "learning_rate": 1.0382165605095543e-05, + "loss": 0.485, + "step": 164 + }, + { + "epoch": 0.07884833641956872, + "grad_norm": 0.8517746565202567, + "learning_rate": 1.0445859872611466e-05, + "loss": 0.4863, + "step": 165 + }, + { + "epoch": 0.07932620512514187, + "grad_norm": 1.2439834788261996, + "learning_rate": 1.0509554140127389e-05, + "loss": 0.4818, + "step": 166 + }, + { + "epoch": 0.07980407383071501, + "grad_norm": 0.9673894115074996, + "learning_rate": 1.0573248407643314e-05, + "loss": 0.4916, + "step": 167 + }, + { + "epoch": 0.08028194253628816, + "grad_norm": 0.8778733640888571, + "learning_rate": 1.0636942675159237e-05, + "loss": 0.4786, + "step": 168 + }, + { + "epoch": 0.0807598112418613, + "grad_norm": 0.9193416256035223, + "learning_rate": 1.070063694267516e-05, + "loss": 0.4806, + "step": 169 + }, + { + "epoch": 0.08123767994743444, + "grad_norm": 1.0391078840034496, + "learning_rate": 1.0764331210191083e-05, + "loss": 0.4601, + "step": 170 + }, + { + "epoch": 0.08171554865300759, + "grad_norm": 0.8696302636427024, + "learning_rate": 1.0828025477707008e-05, + "loss": 0.4766, + "step": 171 + }, + { + "epoch": 0.08219341735858073, + "grad_norm": 0.8538992156431188, + "learning_rate": 1.089171974522293e-05, + "loss": 0.4929, + "step": 172 + }, + { + "epoch": 0.08267128606415387, + "grad_norm": 0.8784469546200593, + "learning_rate": 1.0955414012738854e-05, + "loss": 0.4827, + "step": 173 + }, + { + "epoch": 0.08314915476972702, + "grad_norm": 0.7996342900390987, + "learning_rate": 1.1019108280254777e-05, + "loss": 0.4768, + "step": 174 + }, + { + "epoch": 0.08362702347530016, + "grad_norm": 0.9283558874133578, + "learning_rate": 1.1082802547770702e-05, + "loss": 0.4654, + "step": 175 + }, + { + "epoch": 0.0841048921808733, + "grad_norm": 1.1381555316283323, + "learning_rate": 1.1146496815286625e-05, + "loss": 0.4741, + "step": 176 + }, + { + "epoch": 0.08458276088644645, + "grad_norm": 0.9965684448163404, + "learning_rate": 1.1210191082802548e-05, + "loss": 0.4836, + "step": 177 + }, + { + "epoch": 0.0850606295920196, + "grad_norm": 0.8998507742131849, + "learning_rate": 1.1273885350318473e-05, + "loss": 0.4996, + "step": 178 + }, + { + "epoch": 0.08553849829759273, + "grad_norm": 1.0467069720974493, + "learning_rate": 1.1337579617834396e-05, + "loss": 0.4824, + "step": 179 + }, + { + "epoch": 0.08601636700316588, + "grad_norm": 0.865608733409604, + "learning_rate": 1.1401273885350319e-05, + "loss": 0.4701, + "step": 180 + }, + { + "epoch": 0.08649423570873903, + "grad_norm": 0.8930941148308225, + "learning_rate": 1.1464968152866242e-05, + "loss": 0.4887, + "step": 181 + }, + { + "epoch": 0.08697210441431216, + "grad_norm": 0.9632697993858005, + "learning_rate": 1.1528662420382167e-05, + "loss": 0.4592, + "step": 182 + }, + { + "epoch": 0.08744997311988531, + "grad_norm": 0.8648798405759216, + "learning_rate": 1.159235668789809e-05, + "loss": 0.461, + "step": 183 + }, + { + "epoch": 0.08792784182545846, + "grad_norm": 1.1724901516943187, + "learning_rate": 1.1656050955414013e-05, + "loss": 0.4701, + "step": 184 + }, + { + "epoch": 0.08840571053103159, + "grad_norm": 1.428382716298115, + "learning_rate": 1.1719745222929938e-05, + "loss": 0.4774, + "step": 185 + }, + { + "epoch": 0.08888357923660474, + "grad_norm": 0.9014718070520222, + "learning_rate": 1.178343949044586e-05, + "loss": 0.4825, + "step": 186 + }, + { + "epoch": 0.08936144794217789, + "grad_norm": 0.9006881000139725, + "learning_rate": 1.1847133757961784e-05, + "loss": 0.4729, + "step": 187 + }, + { + "epoch": 0.08983931664775104, + "grad_norm": 0.8927655043358175, + "learning_rate": 1.1910828025477707e-05, + "loss": 0.4725, + "step": 188 + }, + { + "epoch": 0.09031718535332417, + "grad_norm": 0.8765220142542697, + "learning_rate": 1.1974522292993632e-05, + "loss": 0.4739, + "step": 189 + }, + { + "epoch": 0.09079505405889732, + "grad_norm": 0.8335515837070985, + "learning_rate": 1.2038216560509555e-05, + "loss": 0.4721, + "step": 190 + }, + { + "epoch": 0.09127292276447047, + "grad_norm": 0.8307106452230681, + "learning_rate": 1.2101910828025478e-05, + "loss": 0.4846, + "step": 191 + }, + { + "epoch": 0.0917507914700436, + "grad_norm": 0.8681247868696225, + "learning_rate": 1.2165605095541401e-05, + "loss": 0.4728, + "step": 192 + }, + { + "epoch": 0.09222866017561675, + "grad_norm": 0.8869954220873089, + "learning_rate": 1.2229299363057326e-05, + "loss": 0.4721, + "step": 193 + }, + { + "epoch": 0.0927065288811899, + "grad_norm": 0.8726364899218871, + "learning_rate": 1.2292993630573249e-05, + "loss": 0.4676, + "step": 194 + }, + { + "epoch": 0.09318439758676304, + "grad_norm": 0.9315747844747434, + "learning_rate": 1.2356687898089172e-05, + "loss": 0.4662, + "step": 195 + }, + { + "epoch": 0.09366226629233618, + "grad_norm": 0.8894305822680878, + "learning_rate": 1.2420382165605097e-05, + "loss": 0.4593, + "step": 196 + }, + { + "epoch": 0.09414013499790932, + "grad_norm": 0.8882716488428526, + "learning_rate": 1.248407643312102e-05, + "loss": 0.4805, + "step": 197 + }, + { + "epoch": 0.09461800370348247, + "grad_norm": 0.9004982374479936, + "learning_rate": 1.2547770700636943e-05, + "loss": 0.481, + "step": 198 + }, + { + "epoch": 0.0950958724090556, + "grad_norm": 0.8961471264918875, + "learning_rate": 1.2611464968152866e-05, + "loss": 0.4805, + "step": 199 + }, + { + "epoch": 0.09557374111462875, + "grad_norm": 0.883909235141162, + "learning_rate": 1.267515923566879e-05, + "loss": 0.4617, + "step": 200 + }, + { + "epoch": 0.0960516098202019, + "grad_norm": 0.8490947466546838, + "learning_rate": 1.2738853503184714e-05, + "loss": 0.4643, + "step": 201 + }, + { + "epoch": 0.09652947852577504, + "grad_norm": 1.0723263514380847, + "learning_rate": 1.2802547770700637e-05, + "loss": 0.4792, + "step": 202 + }, + { + "epoch": 0.09700734723134818, + "grad_norm": 0.8029199282504613, + "learning_rate": 1.2866242038216562e-05, + "loss": 0.4698, + "step": 203 + }, + { + "epoch": 0.09748521593692133, + "grad_norm": 0.8834995304331348, + "learning_rate": 1.2929936305732485e-05, + "loss": 0.4699, + "step": 204 + }, + { + "epoch": 0.09796308464249448, + "grad_norm": 0.8189162195122099, + "learning_rate": 1.2993630573248408e-05, + "loss": 0.4398, + "step": 205 + }, + { + "epoch": 0.09844095334806761, + "grad_norm": 0.9229854446430572, + "learning_rate": 1.3057324840764331e-05, + "loss": 0.454, + "step": 206 + }, + { + "epoch": 0.09891882205364076, + "grad_norm": 0.8530294598476205, + "learning_rate": 1.3121019108280256e-05, + "loss": 0.4644, + "step": 207 + }, + { + "epoch": 0.09939669075921391, + "grad_norm": 1.0909086978949185, + "learning_rate": 1.3184713375796179e-05, + "loss": 0.4736, + "step": 208 + }, + { + "epoch": 0.09987455946478704, + "grad_norm": 0.9272994665983898, + "learning_rate": 1.3248407643312102e-05, + "loss": 0.4515, + "step": 209 + }, + { + "epoch": 0.10035242817036019, + "grad_norm": 1.1625605063974396, + "learning_rate": 1.3312101910828025e-05, + "loss": 0.4477, + "step": 210 + }, + { + "epoch": 0.10083029687593334, + "grad_norm": 0.9352271125871605, + "learning_rate": 1.337579617834395e-05, + "loss": 0.4878, + "step": 211 + }, + { + "epoch": 0.10130816558150649, + "grad_norm": 1.2595391048195885, + "learning_rate": 1.3439490445859873e-05, + "loss": 0.4694, + "step": 212 + }, + { + "epoch": 0.10178603428707962, + "grad_norm": 0.8996354926203881, + "learning_rate": 1.3503184713375796e-05, + "loss": 0.4564, + "step": 213 + }, + { + "epoch": 0.10226390299265277, + "grad_norm": 0.9358758715512773, + "learning_rate": 1.356687898089172e-05, + "loss": 0.4866, + "step": 214 + }, + { + "epoch": 0.10274177169822592, + "grad_norm": 0.8988782843133788, + "learning_rate": 1.3630573248407644e-05, + "loss": 0.4657, + "step": 215 + }, + { + "epoch": 0.10321964040379905, + "grad_norm": 0.9416537008791075, + "learning_rate": 1.3694267515923567e-05, + "loss": 0.4563, + "step": 216 + }, + { + "epoch": 0.1036975091093722, + "grad_norm": 0.8472207027263581, + "learning_rate": 1.375796178343949e-05, + "loss": 0.4606, + "step": 217 + }, + { + "epoch": 0.10417537781494535, + "grad_norm": 0.8585181561986107, + "learning_rate": 1.3821656050955415e-05, + "loss": 0.4825, + "step": 218 + }, + { + "epoch": 0.10465324652051848, + "grad_norm": 1.015742170592993, + "learning_rate": 1.3885350318471338e-05, + "loss": 0.4576, + "step": 219 + }, + { + "epoch": 0.10513111522609163, + "grad_norm": 0.8766329989197548, + "learning_rate": 1.3949044585987261e-05, + "loss": 0.4648, + "step": 220 + }, + { + "epoch": 0.10560898393166478, + "grad_norm": 0.8917767684612441, + "learning_rate": 1.4012738853503186e-05, + "loss": 0.4713, + "step": 221 + }, + { + "epoch": 0.10608685263723792, + "grad_norm": 0.8758511266694687, + "learning_rate": 1.4076433121019109e-05, + "loss": 0.4741, + "step": 222 + }, + { + "epoch": 0.10656472134281106, + "grad_norm": 0.8916775950330903, + "learning_rate": 1.4140127388535032e-05, + "loss": 0.4544, + "step": 223 + }, + { + "epoch": 0.1070425900483842, + "grad_norm": 1.4793552859878714, + "learning_rate": 1.4203821656050955e-05, + "loss": 0.4497, + "step": 224 + }, + { + "epoch": 0.10752045875395735, + "grad_norm": 1.1872856996535994, + "learning_rate": 1.426751592356688e-05, + "loss": 0.4749, + "step": 225 + }, + { + "epoch": 0.10799832745953049, + "grad_norm": 0.8700025220590151, + "learning_rate": 1.4331210191082803e-05, + "loss": 0.4751, + "step": 226 + }, + { + "epoch": 0.10847619616510364, + "grad_norm": 0.9905036123666954, + "learning_rate": 1.4394904458598726e-05, + "loss": 0.4734, + "step": 227 + }, + { + "epoch": 0.10895406487067678, + "grad_norm": 0.9544401508765005, + "learning_rate": 1.4458598726114649e-05, + "loss": 0.4662, + "step": 228 + }, + { + "epoch": 0.10943193357624993, + "grad_norm": 0.8769851114803372, + "learning_rate": 1.4522292993630574e-05, + "loss": 0.4669, + "step": 229 + }, + { + "epoch": 0.10990980228182307, + "grad_norm": 0.8733018885948226, + "learning_rate": 1.4585987261146497e-05, + "loss": 0.4696, + "step": 230 + }, + { + "epoch": 0.11038767098739621, + "grad_norm": 0.9441368673354331, + "learning_rate": 1.464968152866242e-05, + "loss": 0.4546, + "step": 231 + }, + { + "epoch": 0.11086553969296936, + "grad_norm": 0.789316515274095, + "learning_rate": 1.4713375796178345e-05, + "loss": 0.4927, + "step": 232 + }, + { + "epoch": 0.1113434083985425, + "grad_norm": 0.8994006543929829, + "learning_rate": 1.4777070063694268e-05, + "loss": 0.4768, + "step": 233 + }, + { + "epoch": 0.11182127710411564, + "grad_norm": 0.8060765297549664, + "learning_rate": 1.4840764331210191e-05, + "loss": 0.4663, + "step": 234 + }, + { + "epoch": 0.11229914580968879, + "grad_norm": 0.8826274187881293, + "learning_rate": 1.4904458598726114e-05, + "loss": 0.4635, + "step": 235 + }, + { + "epoch": 0.11277701451526193, + "grad_norm": 0.9233679523033971, + "learning_rate": 1.4968152866242039e-05, + "loss": 0.469, + "step": 236 + }, + { + "epoch": 0.11325488322083507, + "grad_norm": 0.9109628801781714, + "learning_rate": 1.5031847133757964e-05, + "loss": 0.455, + "step": 237 + }, + { + "epoch": 0.11373275192640822, + "grad_norm": 0.8186652665707146, + "learning_rate": 1.5095541401273888e-05, + "loss": 0.4581, + "step": 238 + }, + { + "epoch": 0.11421062063198137, + "grad_norm": 0.9309662613156999, + "learning_rate": 1.5159235668789811e-05, + "loss": 0.4412, + "step": 239 + }, + { + "epoch": 0.1146884893375545, + "grad_norm": 3.931134407255136, + "learning_rate": 1.5222929936305735e-05, + "loss": 0.4509, + "step": 240 + }, + { + "epoch": 0.11516635804312765, + "grad_norm": 1.1827238868781336, + "learning_rate": 1.528662420382166e-05, + "loss": 0.4574, + "step": 241 + }, + { + "epoch": 0.1156442267487008, + "grad_norm": 0.8016143281537751, + "learning_rate": 1.5350318471337582e-05, + "loss": 0.4493, + "step": 242 + }, + { + "epoch": 0.11612209545427393, + "grad_norm": 0.968479721862784, + "learning_rate": 1.5414012738853506e-05, + "loss": 0.4698, + "step": 243 + }, + { + "epoch": 0.11659996415984708, + "grad_norm": 2.2168571117681246, + "learning_rate": 1.547770700636943e-05, + "loss": 0.4564, + "step": 244 + }, + { + "epoch": 0.11707783286542023, + "grad_norm": 0.9881765074402735, + "learning_rate": 1.5541401273885352e-05, + "loss": 0.4639, + "step": 245 + }, + { + "epoch": 0.11755570157099338, + "grad_norm": 0.941404090505303, + "learning_rate": 1.5605095541401275e-05, + "loss": 0.4631, + "step": 246 + }, + { + "epoch": 0.11803357027656651, + "grad_norm": 0.9698016606767922, + "learning_rate": 1.56687898089172e-05, + "loss": 0.444, + "step": 247 + }, + { + "epoch": 0.11851143898213966, + "grad_norm": 0.915195330989377, + "learning_rate": 1.5732484076433124e-05, + "loss": 0.4581, + "step": 248 + }, + { + "epoch": 0.1189893076877128, + "grad_norm": 0.8642148011456234, + "learning_rate": 1.5796178343949047e-05, + "loss": 0.4506, + "step": 249 + }, + { + "epoch": 0.11946717639328594, + "grad_norm": 0.935450575737815, + "learning_rate": 1.585987261146497e-05, + "loss": 0.461, + "step": 250 + }, + { + "epoch": 0.11994504509885909, + "grad_norm": 0.892113620702839, + "learning_rate": 1.5923566878980894e-05, + "loss": 0.4761, + "step": 251 + }, + { + "epoch": 0.12042291380443224, + "grad_norm": 0.9066524856260676, + "learning_rate": 1.5987261146496817e-05, + "loss": 0.4625, + "step": 252 + }, + { + "epoch": 0.12090078251000537, + "grad_norm": 1.1306712148067306, + "learning_rate": 1.605095541401274e-05, + "loss": 0.4496, + "step": 253 + }, + { + "epoch": 0.12137865121557852, + "grad_norm": 1.0415733798385562, + "learning_rate": 1.6114649681528666e-05, + "loss": 0.448, + "step": 254 + }, + { + "epoch": 0.12185651992115167, + "grad_norm": 0.8142111756751625, + "learning_rate": 1.617834394904459e-05, + "loss": 0.4602, + "step": 255 + }, + { + "epoch": 0.12233438862672481, + "grad_norm": 0.9011444127512608, + "learning_rate": 1.6242038216560512e-05, + "loss": 0.4743, + "step": 256 + }, + { + "epoch": 0.12281225733229795, + "grad_norm": 0.8423920386335364, + "learning_rate": 1.6305732484076436e-05, + "loss": 0.4584, + "step": 257 + }, + { + "epoch": 0.1232901260378711, + "grad_norm": 0.8878027469639385, + "learning_rate": 1.636942675159236e-05, + "loss": 0.4586, + "step": 258 + }, + { + "epoch": 0.12376799474344424, + "grad_norm": 0.9181872747662326, + "learning_rate": 1.643312101910828e-05, + "loss": 0.4559, + "step": 259 + }, + { + "epoch": 0.12424586344901738, + "grad_norm": 0.8298428319861421, + "learning_rate": 1.6496815286624205e-05, + "loss": 0.4719, + "step": 260 + }, + { + "epoch": 0.12472373215459052, + "grad_norm": 0.8689700122227426, + "learning_rate": 1.6560509554140128e-05, + "loss": 0.4837, + "step": 261 + }, + { + "epoch": 0.12520160086016366, + "grad_norm": 0.8387758000094565, + "learning_rate": 1.6624203821656054e-05, + "loss": 0.4727, + "step": 262 + }, + { + "epoch": 0.1256794695657368, + "grad_norm": 0.7571078182093177, + "learning_rate": 1.6687898089171977e-05, + "loss": 0.47, + "step": 263 + }, + { + "epoch": 0.12615733827130995, + "grad_norm": 0.9152714319886641, + "learning_rate": 1.67515923566879e-05, + "loss": 0.4498, + "step": 264 + }, + { + "epoch": 0.1266352069768831, + "grad_norm": 0.7676184859264714, + "learning_rate": 1.6815286624203824e-05, + "loss": 0.452, + "step": 265 + }, + { + "epoch": 0.12711307568245625, + "grad_norm": 0.893887271149616, + "learning_rate": 1.6878980891719747e-05, + "loss": 0.4571, + "step": 266 + }, + { + "epoch": 0.1275909443880294, + "grad_norm": 0.7664207431197599, + "learning_rate": 1.694267515923567e-05, + "loss": 0.4605, + "step": 267 + }, + { + "epoch": 0.12806881309360255, + "grad_norm": 0.8462357846724339, + "learning_rate": 1.7006369426751593e-05, + "loss": 0.4627, + "step": 268 + }, + { + "epoch": 0.12854668179917567, + "grad_norm": 0.7917183376178123, + "learning_rate": 1.707006369426752e-05, + "loss": 0.4587, + "step": 269 + }, + { + "epoch": 0.12902455050474881, + "grad_norm": 0.8751109922497367, + "learning_rate": 1.7133757961783442e-05, + "loss": 0.4593, + "step": 270 + }, + { + "epoch": 0.12950241921032196, + "grad_norm": 0.8869857964850081, + "learning_rate": 1.7197452229299365e-05, + "loss": 0.4455, + "step": 271 + }, + { + "epoch": 0.1299802879158951, + "grad_norm": 0.8652417097770274, + "learning_rate": 1.726114649681529e-05, + "loss": 0.4532, + "step": 272 + }, + { + "epoch": 0.13045815662146826, + "grad_norm": 0.8319161291537053, + "learning_rate": 1.732484076433121e-05, + "loss": 0.4453, + "step": 273 + }, + { + "epoch": 0.1309360253270414, + "grad_norm": 0.816357574302165, + "learning_rate": 1.7388535031847135e-05, + "loss": 0.446, + "step": 274 + }, + { + "epoch": 0.13141389403261453, + "grad_norm": 0.8885905160698827, + "learning_rate": 1.7452229299363058e-05, + "loss": 0.4695, + "step": 275 + }, + { + "epoch": 0.13189176273818767, + "grad_norm": 1.0861739881800032, + "learning_rate": 1.7515923566878984e-05, + "loss": 0.4567, + "step": 276 + }, + { + "epoch": 0.13236963144376082, + "grad_norm": 0.7809548635639034, + "learning_rate": 1.7579617834394907e-05, + "loss": 0.459, + "step": 277 + }, + { + "epoch": 0.13284750014933397, + "grad_norm": 0.7635231358009456, + "learning_rate": 1.764331210191083e-05, + "loss": 0.4589, + "step": 278 + }, + { + "epoch": 0.13332536885490712, + "grad_norm": 0.7560665612792802, + "learning_rate": 1.7707006369426754e-05, + "loss": 0.4528, + "step": 279 + }, + { + "epoch": 0.13380323756048026, + "grad_norm": 0.8086543758881466, + "learning_rate": 1.7770700636942677e-05, + "loss": 0.472, + "step": 280 + }, + { + "epoch": 0.1342811062660534, + "grad_norm": 0.7550004521071806, + "learning_rate": 1.78343949044586e-05, + "loss": 0.4551, + "step": 281 + }, + { + "epoch": 0.13475897497162653, + "grad_norm": 0.7788235365516403, + "learning_rate": 1.7898089171974523e-05, + "loss": 0.4759, + "step": 282 + }, + { + "epoch": 0.13523684367719968, + "grad_norm": 0.7296889721103695, + "learning_rate": 1.796178343949045e-05, + "loss": 0.4504, + "step": 283 + }, + { + "epoch": 0.13571471238277283, + "grad_norm": 0.8658688096621077, + "learning_rate": 1.8025477707006372e-05, + "loss": 0.4645, + "step": 284 + }, + { + "epoch": 0.13619258108834598, + "grad_norm": 0.7398631780173166, + "learning_rate": 1.8089171974522295e-05, + "loss": 0.4476, + "step": 285 + }, + { + "epoch": 0.13667044979391912, + "grad_norm": 0.8865227853224859, + "learning_rate": 1.815286624203822e-05, + "loss": 0.465, + "step": 286 + }, + { + "epoch": 0.13714831849949227, + "grad_norm": 0.8311750620796008, + "learning_rate": 1.821656050955414e-05, + "loss": 0.4652, + "step": 287 + }, + { + "epoch": 0.13762618720506542, + "grad_norm": 0.8878939015819043, + "learning_rate": 1.8280254777070065e-05, + "loss": 0.4714, + "step": 288 + }, + { + "epoch": 0.13810405591063854, + "grad_norm": 0.8154300177125134, + "learning_rate": 1.8343949044585988e-05, + "loss": 0.4458, + "step": 289 + }, + { + "epoch": 0.1385819246162117, + "grad_norm": 0.8769838209673224, + "learning_rate": 1.8407643312101914e-05, + "loss": 0.4425, + "step": 290 + }, + { + "epoch": 0.13905979332178484, + "grad_norm": 0.8218246714550715, + "learning_rate": 1.8471337579617837e-05, + "loss": 0.4555, + "step": 291 + }, + { + "epoch": 0.13953766202735798, + "grad_norm": 0.9038739220509564, + "learning_rate": 1.853503184713376e-05, + "loss": 0.4542, + "step": 292 + }, + { + "epoch": 0.14001553073293113, + "grad_norm": 0.9290101054130957, + "learning_rate": 1.8598726114649684e-05, + "loss": 0.4573, + "step": 293 + }, + { + "epoch": 0.14049339943850428, + "grad_norm": 0.8231788424091275, + "learning_rate": 1.8662420382165607e-05, + "loss": 0.454, + "step": 294 + }, + { + "epoch": 0.14097126814407743, + "grad_norm": 0.8068528909020721, + "learning_rate": 1.872611464968153e-05, + "loss": 0.4703, + "step": 295 + }, + { + "epoch": 0.14144913684965055, + "grad_norm": 0.8841324441539453, + "learning_rate": 1.8789808917197453e-05, + "loss": 0.4576, + "step": 296 + }, + { + "epoch": 0.1419270055552237, + "grad_norm": 0.8103277577643994, + "learning_rate": 1.8853503184713376e-05, + "loss": 0.4624, + "step": 297 + }, + { + "epoch": 0.14240487426079684, + "grad_norm": 0.7407111914182409, + "learning_rate": 1.8917197452229302e-05, + "loss": 0.4539, + "step": 298 + }, + { + "epoch": 0.14288274296637, + "grad_norm": 0.8494222480318973, + "learning_rate": 1.8980891719745225e-05, + "loss": 0.4633, + "step": 299 + }, + { + "epoch": 0.14336061167194314, + "grad_norm": 0.760718892931761, + "learning_rate": 1.904458598726115e-05, + "loss": 0.4491, + "step": 300 + }, + { + "epoch": 0.1438384803775163, + "grad_norm": 0.883575953116712, + "learning_rate": 1.910828025477707e-05, + "loss": 0.4579, + "step": 301 + }, + { + "epoch": 0.14431634908308943, + "grad_norm": 0.8706151177176045, + "learning_rate": 1.9171974522292995e-05, + "loss": 0.4747, + "step": 302 + }, + { + "epoch": 0.14479421778866255, + "grad_norm": 0.8387944329782351, + "learning_rate": 1.9235668789808918e-05, + "loss": 0.4484, + "step": 303 + }, + { + "epoch": 0.1452720864942357, + "grad_norm": 1.1446786646678901, + "learning_rate": 1.929936305732484e-05, + "loss": 0.4694, + "step": 304 + }, + { + "epoch": 0.14574995519980885, + "grad_norm": 0.895347496956798, + "learning_rate": 1.9363057324840767e-05, + "loss": 0.4488, + "step": 305 + }, + { + "epoch": 0.146227823905382, + "grad_norm": 1.5869264897706326, + "learning_rate": 1.942675159235669e-05, + "loss": 0.4478, + "step": 306 + }, + { + "epoch": 0.14670569261095515, + "grad_norm": 0.9876414045670644, + "learning_rate": 1.9490445859872614e-05, + "loss": 0.4593, + "step": 307 + }, + { + "epoch": 0.1471835613165283, + "grad_norm": 0.8428176446808334, + "learning_rate": 1.9554140127388537e-05, + "loss": 0.4603, + "step": 308 + }, + { + "epoch": 0.14766143002210141, + "grad_norm": 0.9091407347525033, + "learning_rate": 1.961783439490446e-05, + "loss": 0.4635, + "step": 309 + }, + { + "epoch": 0.14813929872767456, + "grad_norm": 0.8541069012050329, + "learning_rate": 1.9681528662420383e-05, + "loss": 0.4556, + "step": 310 + }, + { + "epoch": 0.1486171674332477, + "grad_norm": 0.8866008034684022, + "learning_rate": 1.9745222929936306e-05, + "loss": 0.4595, + "step": 311 + }, + { + "epoch": 0.14909503613882086, + "grad_norm": 1.0041418570948004, + "learning_rate": 1.9808917197452232e-05, + "loss": 0.4575, + "step": 312 + }, + { + "epoch": 0.149572904844394, + "grad_norm": 0.8608989524709532, + "learning_rate": 1.9872611464968155e-05, + "loss": 0.4592, + "step": 313 + }, + { + "epoch": 0.15005077354996715, + "grad_norm": 0.9193782324674579, + "learning_rate": 1.993630573248408e-05, + "loss": 0.4702, + "step": 314 + }, + { + "epoch": 0.1505286422555403, + "grad_norm": 0.8953098442701494, + "learning_rate": 2e-05, + "loss": 0.4372, + "step": 315 + }, + { + "epoch": 0.15100651096111342, + "grad_norm": 0.9315345476094137, + "learning_rate": 1.99999986130882e-05, + "loss": 0.4561, + "step": 316 + }, + { + "epoch": 0.15148437966668657, + "grad_norm": 0.8193614925815302, + "learning_rate": 1.999999445235318e-05, + "loss": 0.4419, + "step": 317 + }, + { + "epoch": 0.15196224837225972, + "grad_norm": 0.8580436146437501, + "learning_rate": 1.9999987517796095e-05, + "loss": 0.4517, + "step": 318 + }, + { + "epoch": 0.15244011707783287, + "grad_norm": 0.823969742063536, + "learning_rate": 1.9999977809418872e-05, + "loss": 0.465, + "step": 319 + }, + { + "epoch": 0.152917985783406, + "grad_norm": 0.878097134652308, + "learning_rate": 1.99999653272242e-05, + "loss": 0.4515, + "step": 320 + }, + { + "epoch": 0.15339585448897916, + "grad_norm": 0.8475324958361699, + "learning_rate": 1.999995007121554e-05, + "loss": 0.4491, + "step": 321 + }, + { + "epoch": 0.1538737231945523, + "grad_norm": 0.8481861989745667, + "learning_rate": 1.9999932041397128e-05, + "loss": 0.4623, + "step": 322 + }, + { + "epoch": 0.15435159190012543, + "grad_norm": 0.7962447154311731, + "learning_rate": 1.9999911237773964e-05, + "loss": 0.4525, + "step": 323 + }, + { + "epoch": 0.15482946060569858, + "grad_norm": 0.7407282750796474, + "learning_rate": 1.999988766035182e-05, + "loss": 0.4465, + "step": 324 + }, + { + "epoch": 0.15530732931127172, + "grad_norm": 0.8298386043292453, + "learning_rate": 1.9999861309137232e-05, + "loss": 0.4542, + "step": 325 + }, + { + "epoch": 0.15578519801684487, + "grad_norm": 0.7485540031873075, + "learning_rate": 1.999983218413751e-05, + "loss": 0.4413, + "step": 326 + }, + { + "epoch": 0.15626306672241802, + "grad_norm": 1.5089153257896015, + "learning_rate": 1.9999800285360736e-05, + "loss": 0.4631, + "step": 327 + }, + { + "epoch": 0.15674093542799117, + "grad_norm": 0.8158254057514103, + "learning_rate": 1.999976561281576e-05, + "loss": 0.4548, + "step": 328 + }, + { + "epoch": 0.15721880413356432, + "grad_norm": 0.8034746528062102, + "learning_rate": 1.9999728166512187e-05, + "loss": 0.4399, + "step": 329 + }, + { + "epoch": 0.15769667283913744, + "grad_norm": 0.7519006657356639, + "learning_rate": 1.999968794646042e-05, + "loss": 0.4503, + "step": 330 + }, + { + "epoch": 0.15817454154471058, + "grad_norm": 0.8336919761973133, + "learning_rate": 1.9999644952671604e-05, + "loss": 0.4574, + "step": 331 + }, + { + "epoch": 0.15865241025028373, + "grad_norm": 0.7895923256720561, + "learning_rate": 1.9999599185157673e-05, + "loss": 0.445, + "step": 332 + }, + { + "epoch": 0.15913027895585688, + "grad_norm": 0.8990492990399171, + "learning_rate": 1.9999550643931313e-05, + "loss": 0.4542, + "step": 333 + }, + { + "epoch": 0.15960814766143003, + "grad_norm": 0.7540142922122045, + "learning_rate": 1.9999499329005995e-05, + "loss": 0.4701, + "step": 334 + }, + { + "epoch": 0.16008601636700318, + "grad_norm": 0.8744480175459001, + "learning_rate": 1.9999445240395953e-05, + "loss": 0.4587, + "step": 335 + }, + { + "epoch": 0.16056388507257632, + "grad_norm": 0.7760610520617973, + "learning_rate": 1.9999388378116186e-05, + "loss": 0.4431, + "step": 336 + }, + { + "epoch": 0.16104175377814944, + "grad_norm": 0.9367580126034849, + "learning_rate": 1.9999328742182472e-05, + "loss": 0.4471, + "step": 337 + }, + { + "epoch": 0.1615196224837226, + "grad_norm": 0.8695840825168545, + "learning_rate": 1.999926633261135e-05, + "loss": 0.4369, + "step": 338 + }, + { + "epoch": 0.16199749118929574, + "grad_norm": 0.8085802780374626, + "learning_rate": 1.9999201149420128e-05, + "loss": 0.4412, + "step": 339 + }, + { + "epoch": 0.1624753598948689, + "grad_norm": 0.9521212177197098, + "learning_rate": 1.9999133192626893e-05, + "loss": 0.434, + "step": 340 + }, + { + "epoch": 0.16295322860044203, + "grad_norm": 0.821086659719905, + "learning_rate": 1.999906246225049e-05, + "loss": 0.4601, + "step": 341 + }, + { + "epoch": 0.16343109730601518, + "grad_norm": 0.8520130760092065, + "learning_rate": 1.9998988958310542e-05, + "loss": 0.4604, + "step": 342 + }, + { + "epoch": 0.1639089660115883, + "grad_norm": 0.90728532792054, + "learning_rate": 1.9998912680827436e-05, + "loss": 0.44, + "step": 343 + }, + { + "epoch": 0.16438683471716145, + "grad_norm": 0.8144812444652371, + "learning_rate": 1.9998833629822328e-05, + "loss": 0.4398, + "step": 344 + }, + { + "epoch": 0.1648647034227346, + "grad_norm": 0.9394455637761423, + "learning_rate": 1.9998751805317152e-05, + "loss": 0.4472, + "step": 345 + }, + { + "epoch": 0.16534257212830775, + "grad_norm": 0.7942127753027973, + "learning_rate": 1.9998667207334596e-05, + "loss": 0.4518, + "step": 346 + }, + { + "epoch": 0.1658204408338809, + "grad_norm": 0.8729986142728988, + "learning_rate": 1.9998579835898135e-05, + "loss": 0.4671, + "step": 347 + }, + { + "epoch": 0.16629830953945404, + "grad_norm": 0.8038525219516176, + "learning_rate": 1.9998489691031994e-05, + "loss": 0.4475, + "step": 348 + }, + { + "epoch": 0.1667761782450272, + "grad_norm": 0.7793008423025405, + "learning_rate": 1.9998396772761186e-05, + "loss": 0.4397, + "step": 349 + }, + { + "epoch": 0.1672540469506003, + "grad_norm": 0.8347706044801682, + "learning_rate": 1.999830108111148e-05, + "loss": 0.4471, + "step": 350 + }, + { + "epoch": 0.16773191565617346, + "grad_norm": 0.775692619220453, + "learning_rate": 1.999820261610942e-05, + "loss": 0.4539, + "step": 351 + }, + { + "epoch": 0.1682097843617466, + "grad_norm": 0.823420887104832, + "learning_rate": 1.9998101377782322e-05, + "loss": 0.4544, + "step": 352 + }, + { + "epoch": 0.16868765306731975, + "grad_norm": 0.81678869462245, + "learning_rate": 1.9997997366158264e-05, + "loss": 0.4365, + "step": 353 + }, + { + "epoch": 0.1691655217728929, + "grad_norm": 0.7743470553025751, + "learning_rate": 1.99978905812661e-05, + "loss": 0.4554, + "step": 354 + }, + { + "epoch": 0.16964339047846605, + "grad_norm": 0.7641953458649339, + "learning_rate": 1.999778102313545e-05, + "loss": 0.4578, + "step": 355 + }, + { + "epoch": 0.1701212591840392, + "grad_norm": 1.0296472768958496, + "learning_rate": 1.99976686917967e-05, + "loss": 0.4389, + "step": 356 + }, + { + "epoch": 0.17059912788961232, + "grad_norm": 0.7672656783937326, + "learning_rate": 1.9997553587281012e-05, + "loss": 0.4253, + "step": 357 + }, + { + "epoch": 0.17107699659518547, + "grad_norm": 0.7273787482171606, + "learning_rate": 1.999743570962031e-05, + "loss": 0.4458, + "step": 358 + }, + { + "epoch": 0.1715548653007586, + "grad_norm": 0.8352123781017939, + "learning_rate": 1.9997315058847296e-05, + "loss": 0.4449, + "step": 359 + }, + { + "epoch": 0.17203273400633176, + "grad_norm": 0.8661207476943021, + "learning_rate": 1.999719163499543e-05, + "loss": 0.4416, + "step": 360 + }, + { + "epoch": 0.1725106027119049, + "grad_norm": 0.814549911316121, + "learning_rate": 1.999706543809896e-05, + "loss": 0.4512, + "step": 361 + }, + { + "epoch": 0.17298847141747806, + "grad_norm": 0.783161823300539, + "learning_rate": 1.9996936468192874e-05, + "loss": 0.4426, + "step": 362 + }, + { + "epoch": 0.1734663401230512, + "grad_norm": 0.8535445094255457, + "learning_rate": 1.9996804725312963e-05, + "loss": 0.4492, + "step": 363 + }, + { + "epoch": 0.17394420882862432, + "grad_norm": 0.8668749659680126, + "learning_rate": 1.9996670209495757e-05, + "loss": 0.4574, + "step": 364 + }, + { + "epoch": 0.17442207753419747, + "grad_norm": 0.9384280918032981, + "learning_rate": 1.999653292077857e-05, + "loss": 0.4308, + "step": 365 + }, + { + "epoch": 0.17489994623977062, + "grad_norm": 0.8148125307985487, + "learning_rate": 1.999639285919949e-05, + "loss": 0.4501, + "step": 366 + }, + { + "epoch": 0.17537781494534377, + "grad_norm": 0.8327358415612538, + "learning_rate": 1.9996250024797364e-05, + "loss": 0.4318, + "step": 367 + }, + { + "epoch": 0.17585568365091692, + "grad_norm": 0.8013196691254347, + "learning_rate": 1.9996104417611815e-05, + "loss": 0.4501, + "step": 368 + }, + { + "epoch": 0.17633355235649006, + "grad_norm": 0.7247950871190025, + "learning_rate": 1.9995956037683225e-05, + "loss": 0.4371, + "step": 369 + }, + { + "epoch": 0.17681142106206318, + "grad_norm": 0.8177374937697399, + "learning_rate": 1.999580488505276e-05, + "loss": 0.4561, + "step": 370 + }, + { + "epoch": 0.17728928976763633, + "grad_norm": 0.7929393091666411, + "learning_rate": 1.999565095976234e-05, + "loss": 0.454, + "step": 371 + }, + { + "epoch": 0.17776715847320948, + "grad_norm": 0.7843978974842676, + "learning_rate": 1.999549426185466e-05, + "loss": 0.4514, + "step": 372 + }, + { + "epoch": 0.17824502717878263, + "grad_norm": 0.7290779188482008, + "learning_rate": 1.9995334791373194e-05, + "loss": 0.456, + "step": 373 + }, + { + "epoch": 0.17872289588435578, + "grad_norm": 0.7580582343922024, + "learning_rate": 1.9995172548362172e-05, + "loss": 0.4519, + "step": 374 + }, + { + "epoch": 0.17920076458992892, + "grad_norm": 0.8860454919596451, + "learning_rate": 1.9995007532866594e-05, + "loss": 0.4356, + "step": 375 + }, + { + "epoch": 0.17967863329550207, + "grad_norm": 0.9228385090291951, + "learning_rate": 1.9994839744932237e-05, + "loss": 0.4411, + "step": 376 + }, + { + "epoch": 0.1801565020010752, + "grad_norm": 0.7361256362322965, + "learning_rate": 1.9994669184605642e-05, + "loss": 0.4644, + "step": 377 + }, + { + "epoch": 0.18063437070664834, + "grad_norm": 0.7285467924733723, + "learning_rate": 1.9994495851934116e-05, + "loss": 0.4504, + "step": 378 + }, + { + "epoch": 0.1811122394122215, + "grad_norm": 0.6852384080647346, + "learning_rate": 1.9994319746965743e-05, + "loss": 0.4367, + "step": 379 + }, + { + "epoch": 0.18159010811779464, + "grad_norm": 0.7016915322810489, + "learning_rate": 1.9994140869749366e-05, + "loss": 0.4423, + "step": 380 + }, + { + "epoch": 0.18206797682336778, + "grad_norm": 0.7419915375299211, + "learning_rate": 1.9993959220334608e-05, + "loss": 0.4319, + "step": 381 + }, + { + "epoch": 0.18254584552894093, + "grad_norm": 0.691366860088611, + "learning_rate": 1.999377479877185e-05, + "loss": 0.4396, + "step": 382 + }, + { + "epoch": 0.18302371423451408, + "grad_norm": 0.8101619011356389, + "learning_rate": 1.9993587605112252e-05, + "loss": 0.437, + "step": 383 + }, + { + "epoch": 0.1835015829400872, + "grad_norm": 0.7680909891808453, + "learning_rate": 1.9993397639407736e-05, + "loss": 0.4415, + "step": 384 + }, + { + "epoch": 0.18397945164566035, + "grad_norm": 0.7391098105965442, + "learning_rate": 1.9993204901710995e-05, + "loss": 0.4519, + "step": 385 + }, + { + "epoch": 0.1844573203512335, + "grad_norm": 0.7022079848845771, + "learning_rate": 1.999300939207549e-05, + "loss": 0.4378, + "step": 386 + }, + { + "epoch": 0.18493518905680664, + "grad_norm": 0.8413020737595335, + "learning_rate": 1.999281111055545e-05, + "loss": 0.4259, + "step": 387 + }, + { + "epoch": 0.1854130577623798, + "grad_norm": 0.675966589527395, + "learning_rate": 1.9992610057205888e-05, + "loss": 0.4401, + "step": 388 + }, + { + "epoch": 0.18589092646795294, + "grad_norm": 0.7936827487639535, + "learning_rate": 1.9992406232082557e-05, + "loss": 0.4364, + "step": 389 + }, + { + "epoch": 0.1863687951735261, + "grad_norm": 0.6543232226417673, + "learning_rate": 1.9992199635241997e-05, + "loss": 0.4505, + "step": 390 + }, + { + "epoch": 0.1868466638790992, + "grad_norm": 0.8239352462676031, + "learning_rate": 1.9991990266741524e-05, + "loss": 0.4405, + "step": 391 + }, + { + "epoch": 0.18732453258467235, + "grad_norm": 0.7573116961718065, + "learning_rate": 1.9991778126639202e-05, + "loss": 0.4219, + "step": 392 + }, + { + "epoch": 0.1878024012902455, + "grad_norm": 0.7835516359179223, + "learning_rate": 1.9991563214993885e-05, + "loss": 0.4374, + "step": 393 + }, + { + "epoch": 0.18828026999581865, + "grad_norm": 0.7604499069077229, + "learning_rate": 1.9991345531865173e-05, + "loss": 0.4589, + "step": 394 + }, + { + "epoch": 0.1887581387013918, + "grad_norm": 0.76870199204744, + "learning_rate": 1.999112507731346e-05, + "loss": 0.4508, + "step": 395 + }, + { + "epoch": 0.18923600740696495, + "grad_norm": 0.6786887636756039, + "learning_rate": 1.999090185139989e-05, + "loss": 0.4342, + "step": 396 + }, + { + "epoch": 0.1897138761125381, + "grad_norm": 0.6999332563120491, + "learning_rate": 1.9990675854186384e-05, + "loss": 0.4407, + "step": 397 + }, + { + "epoch": 0.1901917448181112, + "grad_norm": 0.8334575931899786, + "learning_rate": 1.9990447085735624e-05, + "loss": 0.4429, + "step": 398 + }, + { + "epoch": 0.19066961352368436, + "grad_norm": 0.7685745229409217, + "learning_rate": 1.9990215546111074e-05, + "loss": 0.4398, + "step": 399 + }, + { + "epoch": 0.1911474822292575, + "grad_norm": 0.7005717268603726, + "learning_rate": 1.9989981235376956e-05, + "loss": 0.4486, + "step": 400 + }, + { + "epoch": 0.19162535093483066, + "grad_norm": 0.6603349007597211, + "learning_rate": 1.9989744153598264e-05, + "loss": 0.4581, + "step": 401 + }, + { + "epoch": 0.1921032196404038, + "grad_norm": 0.7880156496415623, + "learning_rate": 1.998950430084076e-05, + "loss": 0.4381, + "step": 402 + }, + { + "epoch": 0.19258108834597695, + "grad_norm": 0.7736124963496666, + "learning_rate": 1.998926167717097e-05, + "loss": 0.4202, + "step": 403 + }, + { + "epoch": 0.19305895705155007, + "grad_norm": 0.7008884273159984, + "learning_rate": 1.9989016282656207e-05, + "loss": 0.4539, + "step": 404 + }, + { + "epoch": 0.19353682575712322, + "grad_norm": 0.7575001466030076, + "learning_rate": 1.9988768117364526e-05, + "loss": 0.4367, + "step": 405 + }, + { + "epoch": 0.19401469446269637, + "grad_norm": 0.7446911712881855, + "learning_rate": 1.9988517181364767e-05, + "loss": 0.4587, + "step": 406 + }, + { + "epoch": 0.19449256316826952, + "grad_norm": 0.7076619424650085, + "learning_rate": 1.9988263474726536e-05, + "loss": 0.4322, + "step": 407 + }, + { + "epoch": 0.19497043187384266, + "grad_norm": 0.7386357724353199, + "learning_rate": 1.9988006997520208e-05, + "loss": 0.446, + "step": 408 + }, + { + "epoch": 0.1954483005794158, + "grad_norm": 0.7180682823947508, + "learning_rate": 1.9987747749816923e-05, + "loss": 0.4367, + "step": 409 + }, + { + "epoch": 0.19592616928498896, + "grad_norm": 0.8350524472024324, + "learning_rate": 1.9987485731688595e-05, + "loss": 0.4691, + "step": 410 + }, + { + "epoch": 0.19640403799056208, + "grad_norm": 0.7262068323613972, + "learning_rate": 1.9987220943207903e-05, + "loss": 0.4412, + "step": 411 + }, + { + "epoch": 0.19688190669613523, + "grad_norm": 0.7033003276566966, + "learning_rate": 1.998695338444829e-05, + "loss": 0.4516, + "step": 412 + }, + { + "epoch": 0.19735977540170838, + "grad_norm": 0.740747812864168, + "learning_rate": 1.9986683055483975e-05, + "loss": 0.4444, + "step": 413 + }, + { + "epoch": 0.19783764410728152, + "grad_norm": 0.8106322648081262, + "learning_rate": 1.9986409956389946e-05, + "loss": 0.421, + "step": 414 + }, + { + "epoch": 0.19831551281285467, + "grad_norm": 1.343586386680477, + "learning_rate": 1.998613408724195e-05, + "loss": 0.4443, + "step": 415 + }, + { + "epoch": 0.19879338151842782, + "grad_norm": 0.8243575543930032, + "learning_rate": 1.9985855448116507e-05, + "loss": 0.4256, + "step": 416 + }, + { + "epoch": 0.19927125022400097, + "grad_norm": 0.7616433208806905, + "learning_rate": 1.9985574039090912e-05, + "loss": 0.4333, + "step": 417 + }, + { + "epoch": 0.1997491189295741, + "grad_norm": 0.8197988566665843, + "learning_rate": 1.9985289860243222e-05, + "loss": 0.4292, + "step": 418 + }, + { + "epoch": 0.20022698763514724, + "grad_norm": 0.6780749889189807, + "learning_rate": 1.9985002911652262e-05, + "loss": 0.4454, + "step": 419 + }, + { + "epoch": 0.20070485634072038, + "grad_norm": 0.8323686570996995, + "learning_rate": 1.998471319339763e-05, + "loss": 0.4357, + "step": 420 + }, + { + "epoch": 0.20118272504629353, + "grad_norm": 0.7320142206171304, + "learning_rate": 1.998442070555968e-05, + "loss": 0.4145, + "step": 421 + }, + { + "epoch": 0.20166059375186668, + "grad_norm": 0.782381732360279, + "learning_rate": 1.9984125448219555e-05, + "loss": 0.4424, + "step": 422 + }, + { + "epoch": 0.20213846245743983, + "grad_norm": 0.748058238639162, + "learning_rate": 1.998382742145914e-05, + "loss": 0.4426, + "step": 423 + }, + { + "epoch": 0.20261633116301297, + "grad_norm": 0.7799942137521412, + "learning_rate": 1.9983526625361115e-05, + "loss": 0.4427, + "step": 424 + }, + { + "epoch": 0.2030941998685861, + "grad_norm": 0.703037954101251, + "learning_rate": 1.9983223060008908e-05, + "loss": 0.4298, + "step": 425 + }, + { + "epoch": 0.20357206857415924, + "grad_norm": 0.7344901897075886, + "learning_rate": 1.998291672548673e-05, + "loss": 0.4369, + "step": 426 + }, + { + "epoch": 0.2040499372797324, + "grad_norm": 0.743895441020455, + "learning_rate": 1.9982607621879545e-05, + "loss": 0.4235, + "step": 427 + }, + { + "epoch": 0.20452780598530554, + "grad_norm": 0.6646898981475607, + "learning_rate": 1.9982295749273093e-05, + "loss": 0.4378, + "step": 428 + }, + { + "epoch": 0.2050056746908787, + "grad_norm": 0.7093024591727115, + "learning_rate": 1.998198110775389e-05, + "loss": 0.428, + "step": 429 + }, + { + "epoch": 0.20548354339645183, + "grad_norm": 0.7212954419357257, + "learning_rate": 1.9981663697409203e-05, + "loss": 0.4356, + "step": 430 + }, + { + "epoch": 0.20596141210202498, + "grad_norm": 0.723396603925273, + "learning_rate": 1.998134351832708e-05, + "loss": 0.4309, + "step": 431 + }, + { + "epoch": 0.2064392808075981, + "grad_norm": 0.7369743637876824, + "learning_rate": 1.9981020570596334e-05, + "loss": 0.446, + "step": 432 + }, + { + "epoch": 0.20691714951317125, + "grad_norm": 0.7097781588794618, + "learning_rate": 1.9980694854306545e-05, + "loss": 0.4302, + "step": 433 + }, + { + "epoch": 0.2073950182187444, + "grad_norm": 0.6844965007718748, + "learning_rate": 1.998036636954806e-05, + "loss": 0.4293, + "step": 434 + }, + { + "epoch": 0.20787288692431755, + "grad_norm": 0.7056310983568724, + "learning_rate": 1.998003511641199e-05, + "loss": 0.4541, + "step": 435 + }, + { + "epoch": 0.2083507556298907, + "grad_norm": 0.6540869976167216, + "learning_rate": 1.9979701094990226e-05, + "loss": 0.4187, + "step": 436 + }, + { + "epoch": 0.20882862433546384, + "grad_norm": 0.7375068029416245, + "learning_rate": 1.997936430537542e-05, + "loss": 0.4377, + "step": 437 + }, + { + "epoch": 0.20930649304103696, + "grad_norm": 0.6814790625160986, + "learning_rate": 1.9979024747660985e-05, + "loss": 0.4445, + "step": 438 + }, + { + "epoch": 0.2097843617466101, + "grad_norm": 0.7103604757794988, + "learning_rate": 1.9978682421941114e-05, + "loss": 0.4313, + "step": 439 + }, + { + "epoch": 0.21026223045218326, + "grad_norm": 0.7165788798454129, + "learning_rate": 1.997833732831076e-05, + "loss": 0.4419, + "step": 440 + }, + { + "epoch": 0.2107400991577564, + "grad_norm": 0.7673501597057077, + "learning_rate": 1.9977989466865645e-05, + "loss": 0.435, + "step": 441 + }, + { + "epoch": 0.21121796786332955, + "grad_norm": 0.6931708197147421, + "learning_rate": 1.9977638837702263e-05, + "loss": 0.4317, + "step": 442 + }, + { + "epoch": 0.2116958365689027, + "grad_norm": 0.6687440502755292, + "learning_rate": 1.997728544091787e-05, + "loss": 0.4575, + "step": 443 + }, + { + "epoch": 0.21217370527447585, + "grad_norm": 0.7234453514272886, + "learning_rate": 1.997692927661049e-05, + "loss": 0.4186, + "step": 444 + }, + { + "epoch": 0.21265157398004897, + "grad_norm": 0.8989542148130306, + "learning_rate": 1.9976570344878916e-05, + "loss": 0.4277, + "step": 445 + }, + { + "epoch": 0.21312944268562212, + "grad_norm": 0.6798823076977999, + "learning_rate": 1.9976208645822716e-05, + "loss": 0.4203, + "step": 446 + }, + { + "epoch": 0.21360731139119526, + "grad_norm": 0.7297558856068362, + "learning_rate": 1.9975844179542213e-05, + "loss": 0.4361, + "step": 447 + }, + { + "epoch": 0.2140851800967684, + "grad_norm": 0.7211293256955031, + "learning_rate": 1.9975476946138506e-05, + "loss": 0.4296, + "step": 448 + }, + { + "epoch": 0.21456304880234156, + "grad_norm": 0.7882118710161115, + "learning_rate": 1.997510694571346e-05, + "loss": 0.4462, + "step": 449 + }, + { + "epoch": 0.2150409175079147, + "grad_norm": 0.7179853617013857, + "learning_rate": 1.9974734178369702e-05, + "loss": 0.4296, + "step": 450 + }, + { + "epoch": 0.21551878621348786, + "grad_norm": 0.7095782872691374, + "learning_rate": 1.9974358644210635e-05, + "loss": 0.4411, + "step": 451 + }, + { + "epoch": 0.21599665491906098, + "grad_norm": 0.6772035963394542, + "learning_rate": 1.997398034334043e-05, + "loss": 0.4317, + "step": 452 + }, + { + "epoch": 0.21647452362463412, + "grad_norm": 0.6743627379910052, + "learning_rate": 1.997359927586401e-05, + "loss": 0.4401, + "step": 453 + }, + { + "epoch": 0.21695239233020727, + "grad_norm": 0.7837750007770807, + "learning_rate": 1.9973215441887085e-05, + "loss": 0.4448, + "step": 454 + }, + { + "epoch": 0.21743026103578042, + "grad_norm": 0.7047243533444423, + "learning_rate": 1.997282884151612e-05, + "loss": 0.4473, + "step": 455 + }, + { + "epoch": 0.21790812974135357, + "grad_norm": 1.4379264367718632, + "learning_rate": 1.9972439474858348e-05, + "loss": 0.454, + "step": 456 + }, + { + "epoch": 0.21838599844692672, + "grad_norm": 0.8191228771916826, + "learning_rate": 1.9972047342021784e-05, + "loss": 0.4322, + "step": 457 + }, + { + "epoch": 0.21886386715249986, + "grad_norm": 0.7133604179165115, + "learning_rate": 1.9971652443115186e-05, + "loss": 0.4153, + "step": 458 + }, + { + "epoch": 0.21934173585807298, + "grad_norm": 0.6701729667469203, + "learning_rate": 1.99712547782481e-05, + "loss": 0.4286, + "step": 459 + }, + { + "epoch": 0.21981960456364613, + "grad_norm": 0.7794564597547953, + "learning_rate": 1.9970854347530828e-05, + "loss": 0.4385, + "step": 460 + }, + { + "epoch": 0.22029747326921928, + "grad_norm": 0.8024059316138707, + "learning_rate": 1.9970451151074442e-05, + "loss": 0.4374, + "step": 461 + }, + { + "epoch": 0.22077534197479243, + "grad_norm": 0.6631388176963441, + "learning_rate": 1.997004518899078e-05, + "loss": 0.4368, + "step": 462 + }, + { + "epoch": 0.22125321068036558, + "grad_norm": 0.7042713011158211, + "learning_rate": 1.9969636461392454e-05, + "loss": 0.4398, + "step": 463 + }, + { + "epoch": 0.22173107938593872, + "grad_norm": 0.7010671119076932, + "learning_rate": 1.9969224968392837e-05, + "loss": 0.4541, + "step": 464 + }, + { + "epoch": 0.22220894809151187, + "grad_norm": 0.7354792559659475, + "learning_rate": 1.9968810710106065e-05, + "loss": 0.4397, + "step": 465 + }, + { + "epoch": 0.222686816797085, + "grad_norm": 0.7095812173991619, + "learning_rate": 1.9968393686647046e-05, + "loss": 0.4487, + "step": 466 + }, + { + "epoch": 0.22316468550265814, + "grad_norm": 0.6677575026918666, + "learning_rate": 1.9967973898131462e-05, + "loss": 0.4468, + "step": 467 + }, + { + "epoch": 0.2236425542082313, + "grad_norm": 0.7939463667123086, + "learning_rate": 1.9967551344675752e-05, + "loss": 0.4281, + "step": 468 + }, + { + "epoch": 0.22412042291380443, + "grad_norm": 0.7618631702940217, + "learning_rate": 1.996712602639712e-05, + "loss": 0.4283, + "step": 469 + }, + { + "epoch": 0.22459829161937758, + "grad_norm": 0.7333732414541805, + "learning_rate": 1.9966697943413548e-05, + "loss": 0.4305, + "step": 470 + }, + { + "epoch": 0.22507616032495073, + "grad_norm": 0.7779594408406364, + "learning_rate": 1.9966267095843776e-05, + "loss": 0.4052, + "step": 471 + }, + { + "epoch": 0.22555402903052385, + "grad_norm": 0.6757227788709603, + "learning_rate": 1.996583348380731e-05, + "loss": 0.4196, + "step": 472 + }, + { + "epoch": 0.226031897736097, + "grad_norm": 0.7580164038914531, + "learning_rate": 1.9965397107424434e-05, + "loss": 0.404, + "step": 473 + }, + { + "epoch": 0.22650976644167015, + "grad_norm": 0.7558024725157709, + "learning_rate": 1.9964957966816184e-05, + "loss": 0.4426, + "step": 474 + }, + { + "epoch": 0.2269876351472433, + "grad_norm": 0.780681584126318, + "learning_rate": 1.9964516062104377e-05, + "loss": 0.4283, + "step": 475 + }, + { + "epoch": 0.22746550385281644, + "grad_norm": 0.9899203517841794, + "learning_rate": 1.996407139341158e-05, + "loss": 0.4241, + "step": 476 + }, + { + "epoch": 0.2279433725583896, + "grad_norm": 0.6521695714753446, + "learning_rate": 1.9963623960861144e-05, + "loss": 0.4198, + "step": 477 + }, + { + "epoch": 0.22842124126396274, + "grad_norm": 0.725943514393016, + "learning_rate": 1.9963173764577178e-05, + "loss": 0.4447, + "step": 478 + }, + { + "epoch": 0.22889910996953586, + "grad_norm": 0.6805293528269828, + "learning_rate": 1.9962720804684555e-05, + "loss": 0.4276, + "step": 479 + }, + { + "epoch": 0.229376978675109, + "grad_norm": 0.7471831254807846, + "learning_rate": 1.996226508130892e-05, + "loss": 0.4209, + "step": 480 + }, + { + "epoch": 0.22985484738068215, + "grad_norm": 0.6443809058619084, + "learning_rate": 1.9961806594576684e-05, + "loss": 0.4351, + "step": 481 + }, + { + "epoch": 0.2303327160862553, + "grad_norm": 1.3415000224918492, + "learning_rate": 1.996134534461502e-05, + "loss": 0.459, + "step": 482 + }, + { + "epoch": 0.23081058479182845, + "grad_norm": 0.7134420590449075, + "learning_rate": 1.996088133155188e-05, + "loss": 0.4387, + "step": 483 + }, + { + "epoch": 0.2312884534974016, + "grad_norm": 0.7813274776781214, + "learning_rate": 1.9960414555515958e-05, + "loss": 0.4126, + "step": 484 + }, + { + "epoch": 0.23176632220297474, + "grad_norm": 0.6898643605397596, + "learning_rate": 1.995994501663674e-05, + "loss": 0.4246, + "step": 485 + }, + { + "epoch": 0.23224419090854787, + "grad_norm": 0.9121587741249505, + "learning_rate": 1.9959472715044463e-05, + "loss": 0.4244, + "step": 486 + }, + { + "epoch": 0.232722059614121, + "grad_norm": 0.7147251675369962, + "learning_rate": 1.9958997650870137e-05, + "loss": 0.4224, + "step": 487 + }, + { + "epoch": 0.23319992831969416, + "grad_norm": 0.7899586618920265, + "learning_rate": 1.9958519824245536e-05, + "loss": 0.4112, + "step": 488 + }, + { + "epoch": 0.2336777970252673, + "grad_norm": 0.7904708831006497, + "learning_rate": 1.99580392353032e-05, + "loss": 0.4389, + "step": 489 + }, + { + "epoch": 0.23415566573084046, + "grad_norm": 0.6741342579240754, + "learning_rate": 1.995755588417644e-05, + "loss": 0.4243, + "step": 490 + }, + { + "epoch": 0.2346335344364136, + "grad_norm": 0.738629652421803, + "learning_rate": 1.9957069770999324e-05, + "loss": 0.4437, + "step": 491 + }, + { + "epoch": 0.23511140314198675, + "grad_norm": 0.6924217432219697, + "learning_rate": 1.9956580895906694e-05, + "loss": 0.43, + "step": 492 + }, + { + "epoch": 0.23558927184755987, + "grad_norm": 0.7369180221087179, + "learning_rate": 1.9956089259034154e-05, + "loss": 0.4339, + "step": 493 + }, + { + "epoch": 0.23606714055313302, + "grad_norm": 0.805559465873891, + "learning_rate": 1.995559486051808e-05, + "loss": 0.4361, + "step": 494 + }, + { + "epoch": 0.23654500925870617, + "grad_norm": 0.7823248018398075, + "learning_rate": 1.99550977004956e-05, + "loss": 0.4359, + "step": 495 + }, + { + "epoch": 0.23702287796427932, + "grad_norm": 0.7554148112351713, + "learning_rate": 1.9954597779104624e-05, + "loss": 0.4242, + "step": 496 + }, + { + "epoch": 0.23750074666985246, + "grad_norm": 0.7027609576658066, + "learning_rate": 1.995409509648382e-05, + "loss": 0.4309, + "step": 497 + }, + { + "epoch": 0.2379786153754256, + "grad_norm": 0.705129151371866, + "learning_rate": 1.9953589652772627e-05, + "loss": 0.4464, + "step": 498 + }, + { + "epoch": 0.23845648408099873, + "grad_norm": 0.7214815482253628, + "learning_rate": 1.995308144811124e-05, + "loss": 0.4427, + "step": 499 + }, + { + "epoch": 0.23893435278657188, + "grad_norm": 0.7183066127583475, + "learning_rate": 1.9952570482640628e-05, + "loss": 0.4301, + "step": 500 + }, + { + "epoch": 0.23941222149214503, + "grad_norm": 0.797831280147801, + "learning_rate": 1.9952056756502525e-05, + "loss": 0.4367, + "step": 501 + }, + { + "epoch": 0.23989009019771818, + "grad_norm": 0.7379315395749698, + "learning_rate": 1.9951540269839428e-05, + "loss": 0.4399, + "step": 502 + }, + { + "epoch": 0.24036795890329132, + "grad_norm": 0.7945392228374598, + "learning_rate": 1.9951021022794602e-05, + "loss": 0.4234, + "step": 503 + }, + { + "epoch": 0.24084582760886447, + "grad_norm": 0.7264140524348444, + "learning_rate": 1.995049901551208e-05, + "loss": 0.4264, + "step": 504 + }, + { + "epoch": 0.24132369631443762, + "grad_norm": 0.668632865034176, + "learning_rate": 1.9949974248136655e-05, + "loss": 0.4288, + "step": 505 + }, + { + "epoch": 0.24180156502001074, + "grad_norm": 0.8309640004970509, + "learning_rate": 1.9949446720813886e-05, + "loss": 0.4438, + "step": 506 + }, + { + "epoch": 0.2422794337255839, + "grad_norm": 0.6904435675318091, + "learning_rate": 1.9948916433690103e-05, + "loss": 0.4439, + "step": 507 + }, + { + "epoch": 0.24275730243115703, + "grad_norm": 0.7354434100825048, + "learning_rate": 1.99483833869124e-05, + "loss": 0.4283, + "step": 508 + }, + { + "epoch": 0.24323517113673018, + "grad_norm": 0.6765398629661438, + "learning_rate": 1.9947847580628625e-05, + "loss": 0.4387, + "step": 509 + }, + { + "epoch": 0.24371303984230333, + "grad_norm": 0.7239562081327908, + "learning_rate": 1.9947309014987414e-05, + "loss": 0.4375, + "step": 510 + }, + { + "epoch": 0.24419090854787648, + "grad_norm": 0.6978300565774074, + "learning_rate": 1.9946767690138146e-05, + "loss": 0.4356, + "step": 511 + }, + { + "epoch": 0.24466877725344963, + "grad_norm": 0.7117480219425797, + "learning_rate": 1.994622360623098e-05, + "loss": 0.4403, + "step": 512 + }, + { + "epoch": 0.24514664595902275, + "grad_norm": 0.7477545746890273, + "learning_rate": 1.994567676341683e-05, + "loss": 0.4245, + "step": 513 + }, + { + "epoch": 0.2456245146645959, + "grad_norm": 0.7374013791361971, + "learning_rate": 1.9945127161847393e-05, + "loss": 0.4223, + "step": 514 + }, + { + "epoch": 0.24610238337016904, + "grad_norm": 0.6642173278949521, + "learning_rate": 1.9944574801675106e-05, + "loss": 0.4341, + "step": 515 + }, + { + "epoch": 0.2465802520757422, + "grad_norm": 0.7259774358658097, + "learning_rate": 1.994401968305319e-05, + "loss": 0.4252, + "step": 516 + }, + { + "epoch": 0.24705812078131534, + "grad_norm": 0.7180794651515113, + "learning_rate": 1.994346180613562e-05, + "loss": 0.4348, + "step": 517 + }, + { + "epoch": 0.24753598948688849, + "grad_norm": 0.6955289749178415, + "learning_rate": 1.9942901171077146e-05, + "loss": 0.4375, + "step": 518 + }, + { + "epoch": 0.24801385819246163, + "grad_norm": 0.7245432909819279, + "learning_rate": 1.994233777803328e-05, + "loss": 0.4399, + "step": 519 + }, + { + "epoch": 0.24849172689803475, + "grad_norm": 0.6067174282133342, + "learning_rate": 1.9941771627160287e-05, + "loss": 0.4217, + "step": 520 + }, + { + "epoch": 0.2489695956036079, + "grad_norm": 0.703988857071276, + "learning_rate": 1.994120271861522e-05, + "loss": 0.4412, + "step": 521 + }, + { + "epoch": 0.24944746430918105, + "grad_norm": 0.8991621677494505, + "learning_rate": 1.9940631052555882e-05, + "loss": 0.3995, + "step": 522 + }, + { + "epoch": 0.2499253330147542, + "grad_norm": 0.7976604424556334, + "learning_rate": 1.9940056629140835e-05, + "loss": 0.4105, + "step": 523 + }, + { + "epoch": 0.2504032017203273, + "grad_norm": 0.6019680965372577, + "learning_rate": 1.9939479448529418e-05, + "loss": 0.4192, + "step": 524 + }, + { + "epoch": 0.25088107042590047, + "grad_norm": 0.704506878051367, + "learning_rate": 1.9938899510881732e-05, + "loss": 0.4196, + "step": 525 + }, + { + "epoch": 0.2513589391314736, + "grad_norm": 0.6554375963262496, + "learning_rate": 1.9938316816358644e-05, + "loss": 0.4343, + "step": 526 + }, + { + "epoch": 0.25183680783704676, + "grad_norm": 0.992457986922531, + "learning_rate": 1.9937731365121777e-05, + "loss": 0.4328, + "step": 527 + }, + { + "epoch": 0.2523146765426199, + "grad_norm": 0.6658083513417471, + "learning_rate": 1.9937143157333528e-05, + "loss": 0.4298, + "step": 528 + }, + { + "epoch": 0.25279254524819306, + "grad_norm": 0.7222172239047646, + "learning_rate": 1.9936552193157055e-05, + "loss": 0.428, + "step": 529 + }, + { + "epoch": 0.2532704139537662, + "grad_norm": 0.6390063477285883, + "learning_rate": 1.9935958472756283e-05, + "loss": 0.4348, + "step": 530 + }, + { + "epoch": 0.25374828265933935, + "grad_norm": 0.749202476640269, + "learning_rate": 1.9935361996295896e-05, + "loss": 0.4424, + "step": 531 + }, + { + "epoch": 0.2542261513649125, + "grad_norm": 0.7192147324773318, + "learning_rate": 1.993476276394135e-05, + "loss": 0.447, + "step": 532 + }, + { + "epoch": 0.25470402007048565, + "grad_norm": 0.7498159675525952, + "learning_rate": 1.9934160775858856e-05, + "loss": 0.416, + "step": 533 + }, + { + "epoch": 0.2551818887760588, + "grad_norm": 0.7187313720282974, + "learning_rate": 1.9933556032215402e-05, + "loss": 0.4033, + "step": 534 + }, + { + "epoch": 0.25565975748163194, + "grad_norm": 0.7075911635545219, + "learning_rate": 1.993294853317873e-05, + "loss": 0.4233, + "step": 535 + }, + { + "epoch": 0.2561376261872051, + "grad_norm": 0.7116488897227271, + "learning_rate": 1.9932338278917348e-05, + "loss": 0.4469, + "step": 536 + }, + { + "epoch": 0.2566154948927782, + "grad_norm": 0.6870926308030396, + "learning_rate": 1.993172526960053e-05, + "loss": 0.4429, + "step": 537 + }, + { + "epoch": 0.25709336359835133, + "grad_norm": 0.7109693256013045, + "learning_rate": 1.9931109505398316e-05, + "loss": 0.4263, + "step": 538 + }, + { + "epoch": 0.2575712323039245, + "grad_norm": 0.6545214091784688, + "learning_rate": 1.9930490986481507e-05, + "loss": 0.4346, + "step": 539 + }, + { + "epoch": 0.25804910100949763, + "grad_norm": 0.8547757226945168, + "learning_rate": 1.9929869713021668e-05, + "loss": 0.4203, + "step": 540 + }, + { + "epoch": 0.2585269697150708, + "grad_norm": 0.6813604542557925, + "learning_rate": 1.9929245685191133e-05, + "loss": 0.4267, + "step": 541 + }, + { + "epoch": 0.2590048384206439, + "grad_norm": 0.6641936751299076, + "learning_rate": 1.9928618903162992e-05, + "loss": 0.4339, + "step": 542 + }, + { + "epoch": 0.25948270712621707, + "grad_norm": 0.9793051618462918, + "learning_rate": 1.9927989367111102e-05, + "loss": 0.4389, + "step": 543 + }, + { + "epoch": 0.2599605758317902, + "grad_norm": 0.5970607719650806, + "learning_rate": 1.9927357077210093e-05, + "loss": 0.4285, + "step": 544 + }, + { + "epoch": 0.26043844453736337, + "grad_norm": 0.6677945714629177, + "learning_rate": 1.9926722033635343e-05, + "loss": 0.4185, + "step": 545 + }, + { + "epoch": 0.2609163132429365, + "grad_norm": 0.7032868326166338, + "learning_rate": 1.992608423656301e-05, + "loss": 0.4248, + "step": 546 + }, + { + "epoch": 0.26139418194850966, + "grad_norm": 0.675972367560421, + "learning_rate": 1.9925443686169998e-05, + "loss": 0.4358, + "step": 547 + }, + { + "epoch": 0.2618720506540828, + "grad_norm": 0.6648018975438753, + "learning_rate": 1.992480038263399e-05, + "loss": 0.4243, + "step": 548 + }, + { + "epoch": 0.26234991935965596, + "grad_norm": 0.7300070842008606, + "learning_rate": 1.992415432613343e-05, + "loss": 0.426, + "step": 549 + }, + { + "epoch": 0.26282778806522905, + "grad_norm": 0.6966820393669727, + "learning_rate": 1.9923505516847514e-05, + "loss": 0.4462, + "step": 550 + }, + { + "epoch": 0.2633056567708022, + "grad_norm": 0.6799521833338832, + "learning_rate": 1.9922853954956217e-05, + "loss": 0.4165, + "step": 551 + }, + { + "epoch": 0.26378352547637535, + "grad_norm": 0.6842858880612487, + "learning_rate": 1.9922199640640272e-05, + "loss": 0.4215, + "step": 552 + }, + { + "epoch": 0.2642613941819485, + "grad_norm": 0.684498742912379, + "learning_rate": 1.9921542574081168e-05, + "loss": 0.4376, + "step": 553 + }, + { + "epoch": 0.26473926288752164, + "grad_norm": 0.7472555717042567, + "learning_rate": 1.992088275546117e-05, + "loss": 0.4281, + "step": 554 + }, + { + "epoch": 0.2652171315930948, + "grad_norm": 0.6526006080700162, + "learning_rate": 1.9920220184963296e-05, + "loss": 0.4203, + "step": 555 + }, + { + "epoch": 0.26569500029866794, + "grad_norm": 0.6864252975558336, + "learning_rate": 1.991955486277133e-05, + "loss": 0.4253, + "step": 556 + }, + { + "epoch": 0.2661728690042411, + "grad_norm": 0.7010911608893313, + "learning_rate": 1.9918886789069824e-05, + "loss": 0.4268, + "step": 557 + }, + { + "epoch": 0.26665073770981423, + "grad_norm": 0.6432530703558913, + "learning_rate": 1.991821596404409e-05, + "loss": 0.4095, + "step": 558 + }, + { + "epoch": 0.2671286064153874, + "grad_norm": 0.6714812788316195, + "learning_rate": 1.99175423878802e-05, + "loss": 0.439, + "step": 559 + }, + { + "epoch": 0.26760647512096053, + "grad_norm": 0.7313943296188351, + "learning_rate": 1.9916866060764994e-05, + "loss": 0.4207, + "step": 560 + }, + { + "epoch": 0.2680843438265337, + "grad_norm": 0.6753023749397835, + "learning_rate": 1.9916186982886074e-05, + "loss": 0.4232, + "step": 561 + }, + { + "epoch": 0.2685622125321068, + "grad_norm": 0.6812854395562149, + "learning_rate": 1.9915505154431806e-05, + "loss": 0.4312, + "step": 562 + }, + { + "epoch": 0.26904008123768, + "grad_norm": 0.7326468942961418, + "learning_rate": 1.9914820575591315e-05, + "loss": 0.4106, + "step": 563 + }, + { + "epoch": 0.26951794994325307, + "grad_norm": 0.710879982047124, + "learning_rate": 1.9914133246554486e-05, + "loss": 0.4312, + "step": 564 + }, + { + "epoch": 0.2699958186488262, + "grad_norm": 0.6098533019354313, + "learning_rate": 1.991344316751198e-05, + "loss": 0.4269, + "step": 565 + }, + { + "epoch": 0.27047368735439936, + "grad_norm": 0.6527319247618629, + "learning_rate": 1.9912750338655207e-05, + "loss": 0.4253, + "step": 566 + }, + { + "epoch": 0.2709515560599725, + "grad_norm": 0.6554170109236619, + "learning_rate": 1.9912054760176352e-05, + "loss": 0.4175, + "step": 567 + }, + { + "epoch": 0.27142942476554566, + "grad_norm": 0.6487793205519652, + "learning_rate": 1.991135643226835e-05, + "loss": 0.4306, + "step": 568 + }, + { + "epoch": 0.2719072934711188, + "grad_norm": 0.6830594174977985, + "learning_rate": 1.9910655355124905e-05, + "loss": 0.4062, + "step": 569 + }, + { + "epoch": 0.27238516217669195, + "grad_norm": 0.7070510514831456, + "learning_rate": 1.9909951528940485e-05, + "loss": 0.4244, + "step": 570 + }, + { + "epoch": 0.2728630308822651, + "grad_norm": 0.6919752747886392, + "learning_rate": 1.9909244953910324e-05, + "loss": 0.4374, + "step": 571 + }, + { + "epoch": 0.27334089958783825, + "grad_norm": 0.627083497227152, + "learning_rate": 1.990853563023041e-05, + "loss": 0.4293, + "step": 572 + }, + { + "epoch": 0.2738187682934114, + "grad_norm": 0.6811946382742817, + "learning_rate": 1.990782355809749e-05, + "loss": 0.4314, + "step": 573 + }, + { + "epoch": 0.27429663699898454, + "grad_norm": 0.8011136341064639, + "learning_rate": 1.9907108737709088e-05, + "loss": 0.4313, + "step": 574 + }, + { + "epoch": 0.2747745057045577, + "grad_norm": 0.7022098540651683, + "learning_rate": 1.990639116926348e-05, + "loss": 0.4102, + "step": 575 + }, + { + "epoch": 0.27525237441013084, + "grad_norm": 0.7139794448818395, + "learning_rate": 1.9905670852959707e-05, + "loss": 0.4131, + "step": 576 + }, + { + "epoch": 0.275730243115704, + "grad_norm": 0.6566170557292315, + "learning_rate": 1.9904947788997572e-05, + "loss": 0.4492, + "step": 577 + }, + { + "epoch": 0.2762081118212771, + "grad_norm": 0.6960200627247798, + "learning_rate": 1.9904221977577644e-05, + "loss": 0.4236, + "step": 578 + }, + { + "epoch": 0.27668598052685023, + "grad_norm": 0.7038180733792689, + "learning_rate": 1.9903493418901246e-05, + "loss": 0.4156, + "step": 579 + }, + { + "epoch": 0.2771638492324234, + "grad_norm": 0.6995395704949278, + "learning_rate": 1.9902762113170467e-05, + "loss": 0.427, + "step": 580 + }, + { + "epoch": 0.2776417179379965, + "grad_norm": 0.7141523375442532, + "learning_rate": 1.990202806058816e-05, + "loss": 0.4075, + "step": 581 + }, + { + "epoch": 0.27811958664356967, + "grad_norm": 0.6927565201255096, + "learning_rate": 1.990129126135794e-05, + "loss": 0.4238, + "step": 582 + }, + { + "epoch": 0.2785974553491428, + "grad_norm": 0.6417220798020897, + "learning_rate": 1.9900551715684175e-05, + "loss": 0.4144, + "step": 583 + }, + { + "epoch": 0.27907532405471597, + "grad_norm": 0.6392690865600442, + "learning_rate": 1.989980942377201e-05, + "loss": 0.415, + "step": 584 + }, + { + "epoch": 0.2795531927602891, + "grad_norm": 0.639620744803394, + "learning_rate": 1.989906438582734e-05, + "loss": 0.4251, + "step": 585 + }, + { + "epoch": 0.28003106146586226, + "grad_norm": 0.6707471902399823, + "learning_rate": 1.9898316602056825e-05, + "loss": 0.4225, + "step": 586 + }, + { + "epoch": 0.2805089301714354, + "grad_norm": 0.6292145878798392, + "learning_rate": 1.989756607266789e-05, + "loss": 0.4281, + "step": 587 + }, + { + "epoch": 0.28098679887700856, + "grad_norm": 0.6650049148813777, + "learning_rate": 1.9896812797868714e-05, + "loss": 0.4273, + "step": 588 + }, + { + "epoch": 0.2814646675825817, + "grad_norm": 0.6989760949516968, + "learning_rate": 1.9896056777868245e-05, + "loss": 0.4133, + "step": 589 + }, + { + "epoch": 0.28194253628815485, + "grad_norm": 0.6273420717769904, + "learning_rate": 1.9895298012876192e-05, + "loss": 0.4117, + "step": 590 + }, + { + "epoch": 0.28242040499372795, + "grad_norm": 0.6513446444776289, + "learning_rate": 1.9894536503103018e-05, + "loss": 0.4243, + "step": 591 + }, + { + "epoch": 0.2828982736993011, + "grad_norm": 0.7257067616669561, + "learning_rate": 1.9893772248759956e-05, + "loss": 0.4377, + "step": 592 + }, + { + "epoch": 0.28337614240487424, + "grad_norm": 0.6449292794210849, + "learning_rate": 1.9893005250058994e-05, + "loss": 0.4262, + "step": 593 + }, + { + "epoch": 0.2838540111104474, + "grad_norm": 0.7101669933858333, + "learning_rate": 1.9892235507212885e-05, + "loss": 0.4166, + "step": 594 + }, + { + "epoch": 0.28433187981602054, + "grad_norm": 0.9532443107189144, + "learning_rate": 1.9891463020435144e-05, + "loss": 0.412, + "step": 595 + }, + { + "epoch": 0.2848097485215937, + "grad_norm": 0.6763594227901946, + "learning_rate": 1.9890687789940044e-05, + "loss": 0.4305, + "step": 596 + }, + { + "epoch": 0.28528761722716683, + "grad_norm": 0.666645047214093, + "learning_rate": 1.9889909815942615e-05, + "loss": 0.4262, + "step": 597 + }, + { + "epoch": 0.28576548593274, + "grad_norm": 0.7187479088271995, + "learning_rate": 1.9889129098658662e-05, + "loss": 0.4127, + "step": 598 + }, + { + "epoch": 0.28624335463831313, + "grad_norm": 0.7191161821405772, + "learning_rate": 1.9888345638304737e-05, + "loss": 0.4345, + "step": 599 + }, + { + "epoch": 0.2867212233438863, + "grad_norm": 0.706994011714395, + "learning_rate": 1.9887559435098162e-05, + "loss": 0.41, + "step": 600 + }, + { + "epoch": 0.2871990920494594, + "grad_norm": 0.7497992817297052, + "learning_rate": 1.988677048925701e-05, + "loss": 0.4285, + "step": 601 + }, + { + "epoch": 0.2876769607550326, + "grad_norm": 0.652784327493937, + "learning_rate": 1.9885978801000124e-05, + "loss": 0.4256, + "step": 602 + }, + { + "epoch": 0.2881548294606057, + "grad_norm": 0.776621607360754, + "learning_rate": 1.988518437054711e-05, + "loss": 0.4322, + "step": 603 + }, + { + "epoch": 0.28863269816617887, + "grad_norm": 0.5839322380726024, + "learning_rate": 1.9884387198118316e-05, + "loss": 0.4167, + "step": 604 + }, + { + "epoch": 0.28911056687175196, + "grad_norm": 0.7907030900607616, + "learning_rate": 1.9883587283934875e-05, + "loss": 0.406, + "step": 605 + }, + { + "epoch": 0.2895884355773251, + "grad_norm": 1.0014227126496111, + "learning_rate": 1.988278462821866e-05, + "loss": 0.4151, + "step": 606 + }, + { + "epoch": 0.29006630428289826, + "grad_norm": 0.66994522088662, + "learning_rate": 1.9881979231192323e-05, + "loss": 0.4392, + "step": 607 + }, + { + "epoch": 0.2905441729884714, + "grad_norm": 0.5824214225034886, + "learning_rate": 1.9881171093079264e-05, + "loss": 0.4267, + "step": 608 + }, + { + "epoch": 0.29102204169404455, + "grad_norm": 0.6495953436821396, + "learning_rate": 1.988036021410364e-05, + "loss": 0.4334, + "step": 609 + }, + { + "epoch": 0.2914999103996177, + "grad_norm": 0.6878938109775274, + "learning_rate": 1.9879546594490383e-05, + "loss": 0.41, + "step": 610 + }, + { + "epoch": 0.29197777910519085, + "grad_norm": 0.7296069094771699, + "learning_rate": 1.987873023446517e-05, + "loss": 0.4134, + "step": 611 + }, + { + "epoch": 0.292455647810764, + "grad_norm": 0.7455591671402041, + "learning_rate": 1.987791113425445e-05, + "loss": 0.424, + "step": 612 + }, + { + "epoch": 0.29293351651633714, + "grad_norm": 0.6695483498248275, + "learning_rate": 1.9877089294085424e-05, + "loss": 0.425, + "step": 613 + }, + { + "epoch": 0.2934113852219103, + "grad_norm": 0.6796642781672113, + "learning_rate": 1.9876264714186054e-05, + "loss": 0.3983, + "step": 614 + }, + { + "epoch": 0.29388925392748344, + "grad_norm": 0.6095965308498806, + "learning_rate": 1.987543739478507e-05, + "loss": 0.4225, + "step": 615 + }, + { + "epoch": 0.2943671226330566, + "grad_norm": 0.6305677380212332, + "learning_rate": 1.987460733611195e-05, + "loss": 0.4356, + "step": 616 + }, + { + "epoch": 0.29484499133862974, + "grad_norm": 0.7050343310604908, + "learning_rate": 1.9873774538396945e-05, + "loss": 0.4283, + "step": 617 + }, + { + "epoch": 0.29532286004420283, + "grad_norm": 0.6530927062141915, + "learning_rate": 1.987293900187105e-05, + "loss": 0.4316, + "step": 618 + }, + { + "epoch": 0.295800728749776, + "grad_norm": 0.6928306836676819, + "learning_rate": 1.9872100726766028e-05, + "loss": 0.4066, + "step": 619 + }, + { + "epoch": 0.2962785974553491, + "grad_norm": 0.7451946223590281, + "learning_rate": 1.987125971331441e-05, + "loss": 0.4227, + "step": 620 + }, + { + "epoch": 0.29675646616092227, + "grad_norm": 0.7569155679553317, + "learning_rate": 1.9870415961749472e-05, + "loss": 0.4171, + "step": 621 + }, + { + "epoch": 0.2972343348664954, + "grad_norm": 0.7729456988114033, + "learning_rate": 1.986956947230526e-05, + "loss": 0.4374, + "step": 622 + }, + { + "epoch": 0.29771220357206857, + "grad_norm": 0.7222068350990688, + "learning_rate": 1.986872024521657e-05, + "loss": 0.4055, + "step": 623 + }, + { + "epoch": 0.2981900722776417, + "grad_norm": 0.777255093182333, + "learning_rate": 1.9867868280718966e-05, + "loss": 0.4184, + "step": 624 + }, + { + "epoch": 0.29866794098321486, + "grad_norm": 0.707058786281402, + "learning_rate": 1.9867013579048765e-05, + "loss": 0.4278, + "step": 625 + }, + { + "epoch": 0.299145809688788, + "grad_norm": 0.6978123671395932, + "learning_rate": 1.986615614044305e-05, + "loss": 0.3976, + "step": 626 + }, + { + "epoch": 0.29962367839436116, + "grad_norm": 0.6497844128498012, + "learning_rate": 1.9865295965139654e-05, + "loss": 0.4233, + "step": 627 + }, + { + "epoch": 0.3001015470999343, + "grad_norm": 0.7279868951567827, + "learning_rate": 1.9864433053377183e-05, + "loss": 0.421, + "step": 628 + }, + { + "epoch": 0.30057941580550745, + "grad_norm": 0.6228075334334027, + "learning_rate": 1.9863567405394987e-05, + "loss": 0.4182, + "step": 629 + }, + { + "epoch": 0.3010572845110806, + "grad_norm": 1.2081410007291293, + "learning_rate": 1.9862699021433186e-05, + "loss": 0.43, + "step": 630 + }, + { + "epoch": 0.30153515321665375, + "grad_norm": 0.6584806729516838, + "learning_rate": 1.9861827901732645e-05, + "loss": 0.4243, + "step": 631 + }, + { + "epoch": 0.30201302192222684, + "grad_norm": 0.7145104561793435, + "learning_rate": 1.986095404653501e-05, + "loss": 0.4149, + "step": 632 + }, + { + "epoch": 0.3024908906278, + "grad_norm": 0.6293456214888333, + "learning_rate": 1.986007745608266e-05, + "loss": 0.4059, + "step": 633 + }, + { + "epoch": 0.30296875933337314, + "grad_norm": 0.6105778855320209, + "learning_rate": 1.985919813061876e-05, + "loss": 0.4075, + "step": 634 + }, + { + "epoch": 0.3034466280389463, + "grad_norm": 0.5850039212779303, + "learning_rate": 1.9858316070387208e-05, + "loss": 0.4265, + "step": 635 + }, + { + "epoch": 0.30392449674451943, + "grad_norm": 0.6552242491777841, + "learning_rate": 1.985743127563268e-05, + "loss": 0.4211, + "step": 636 + }, + { + "epoch": 0.3044023654500926, + "grad_norm": 0.6302333591257354, + "learning_rate": 1.9856543746600593e-05, + "loss": 0.4223, + "step": 637 + }, + { + "epoch": 0.30488023415566573, + "grad_norm": 0.6537304777031439, + "learning_rate": 1.985565348353714e-05, + "loss": 0.4278, + "step": 638 + }, + { + "epoch": 0.3053581028612389, + "grad_norm": 0.6730389531773375, + "learning_rate": 1.9854760486689257e-05, + "loss": 0.4222, + "step": 639 + }, + { + "epoch": 0.305835971566812, + "grad_norm": 0.64240760182782, + "learning_rate": 1.9853864756304654e-05, + "loss": 0.4157, + "step": 640 + }, + { + "epoch": 0.3063138402723852, + "grad_norm": 0.6417456096649262, + "learning_rate": 1.9852966292631785e-05, + "loss": 0.3962, + "step": 641 + }, + { + "epoch": 0.3067917089779583, + "grad_norm": 0.6690827386602772, + "learning_rate": 1.985206509591987e-05, + "loss": 0.4155, + "step": 642 + }, + { + "epoch": 0.30726957768353147, + "grad_norm": 0.6670159655708577, + "learning_rate": 1.9851161166418888e-05, + "loss": 0.4209, + "step": 643 + }, + { + "epoch": 0.3077474463891046, + "grad_norm": 0.6837870559936386, + "learning_rate": 1.9850254504379568e-05, + "loss": 0.4239, + "step": 644 + }, + { + "epoch": 0.3082253150946777, + "grad_norm": 0.635771654798046, + "learning_rate": 1.9849345110053405e-05, + "loss": 0.4115, + "step": 645 + }, + { + "epoch": 0.30870318380025086, + "grad_norm": 0.6317438803320911, + "learning_rate": 1.9848432983692642e-05, + "loss": 0.4167, + "step": 646 + }, + { + "epoch": 0.309181052505824, + "grad_norm": 0.7355553259854325, + "learning_rate": 1.98475181255503e-05, + "loss": 0.4191, + "step": 647 + }, + { + "epoch": 0.30965892121139715, + "grad_norm": 0.6759418052818299, + "learning_rate": 1.9846600535880135e-05, + "loss": 0.4287, + "step": 648 + }, + { + "epoch": 0.3101367899169703, + "grad_norm": 0.8036676894573562, + "learning_rate": 1.9845680214936668e-05, + "loss": 0.4001, + "step": 649 + }, + { + "epoch": 0.31061465862254345, + "grad_norm": 0.6256985874959373, + "learning_rate": 1.984475716297519e-05, + "loss": 0.4126, + "step": 650 + }, + { + "epoch": 0.3110925273281166, + "grad_norm": 0.6830281771469799, + "learning_rate": 1.984383138025173e-05, + "loss": 0.4133, + "step": 651 + }, + { + "epoch": 0.31157039603368974, + "grad_norm": 0.7108602267936108, + "learning_rate": 1.984290286702309e-05, + "loss": 0.4056, + "step": 652 + }, + { + "epoch": 0.3120482647392629, + "grad_norm": 0.6539475916242298, + "learning_rate": 1.984197162354682e-05, + "loss": 0.4332, + "step": 653 + }, + { + "epoch": 0.31252613344483604, + "grad_norm": 0.7003890411182644, + "learning_rate": 1.984103765008123e-05, + "loss": 0.4152, + "step": 654 + }, + { + "epoch": 0.3130040021504092, + "grad_norm": 0.8734062183276717, + "learning_rate": 1.984010094688539e-05, + "loss": 0.4308, + "step": 655 + }, + { + "epoch": 0.31348187085598234, + "grad_norm": 0.6483836791788339, + "learning_rate": 1.9839161514219125e-05, + "loss": 0.4335, + "step": 656 + }, + { + "epoch": 0.3139597395615555, + "grad_norm": 0.7224253461656356, + "learning_rate": 1.983821935234301e-05, + "loss": 0.4079, + "step": 657 + }, + { + "epoch": 0.31443760826712863, + "grad_norm": 0.6023125747576976, + "learning_rate": 1.9837274461518393e-05, + "loss": 0.4208, + "step": 658 + }, + { + "epoch": 0.3149154769727017, + "grad_norm": 0.6840565183544368, + "learning_rate": 1.9836326842007368e-05, + "loss": 0.4179, + "step": 659 + }, + { + "epoch": 0.3153933456782749, + "grad_norm": 0.6070851699714096, + "learning_rate": 1.9835376494072788e-05, + "loss": 0.421, + "step": 660 + }, + { + "epoch": 0.315871214383848, + "grad_norm": 0.7883725816487038, + "learning_rate": 1.9834423417978258e-05, + "loss": 0.4161, + "step": 661 + }, + { + "epoch": 0.31634908308942117, + "grad_norm": 0.7058787543409192, + "learning_rate": 1.983346761398815e-05, + "loss": 0.4077, + "step": 662 + }, + { + "epoch": 0.3168269517949943, + "grad_norm": 0.6615940212203109, + "learning_rate": 1.983250908236759e-05, + "loss": 0.4149, + "step": 663 + }, + { + "epoch": 0.31730482050056746, + "grad_norm": 0.7006736509156779, + "learning_rate": 1.9831547823382446e-05, + "loss": 0.4088, + "step": 664 + }, + { + "epoch": 0.3177826892061406, + "grad_norm": 0.6646570566525254, + "learning_rate": 1.9830583837299363e-05, + "loss": 0.4131, + "step": 665 + }, + { + "epoch": 0.31826055791171376, + "grad_norm": 0.6530528056282681, + "learning_rate": 1.9829617124385732e-05, + "loss": 0.396, + "step": 666 + }, + { + "epoch": 0.3187384266172869, + "grad_norm": 0.9328303194521024, + "learning_rate": 1.9828647684909703e-05, + "loss": 0.3951, + "step": 667 + }, + { + "epoch": 0.31921629532286006, + "grad_norm": 0.6959860169532506, + "learning_rate": 1.9827675519140183e-05, + "loss": 0.427, + "step": 668 + }, + { + "epoch": 0.3196941640284332, + "grad_norm": 0.6996213590788233, + "learning_rate": 1.9826700627346825e-05, + "loss": 0.4142, + "step": 669 + }, + { + "epoch": 0.32017203273400635, + "grad_norm": 0.6591077715112674, + "learning_rate": 1.9825723009800058e-05, + "loss": 0.4163, + "step": 670 + }, + { + "epoch": 0.3206499014395795, + "grad_norm": 0.6696332483194858, + "learning_rate": 1.982474266677105e-05, + "loss": 0.407, + "step": 671 + }, + { + "epoch": 0.32112777014515265, + "grad_norm": 0.792144854122436, + "learning_rate": 1.9823759598531732e-05, + "loss": 0.4204, + "step": 672 + }, + { + "epoch": 0.32160563885072574, + "grad_norm": 0.7610226047172992, + "learning_rate": 1.9822773805354788e-05, + "loss": 0.412, + "step": 673 + }, + { + "epoch": 0.3220835075562989, + "grad_norm": 0.7518454562539055, + "learning_rate": 1.982178528751366e-05, + "loss": 0.4263, + "step": 674 + }, + { + "epoch": 0.32256137626187203, + "grad_norm": 0.6802817311894305, + "learning_rate": 1.9820794045282553e-05, + "loss": 0.4107, + "step": 675 + }, + { + "epoch": 0.3230392449674452, + "grad_norm": 0.7023162206342487, + "learning_rate": 1.9819800078936406e-05, + "loss": 0.4114, + "step": 676 + }, + { + "epoch": 0.32351711367301833, + "grad_norm": 0.6685874705533328, + "learning_rate": 1.981880338875094e-05, + "loss": 0.4185, + "step": 677 + }, + { + "epoch": 0.3239949823785915, + "grad_norm": 0.6244222422392075, + "learning_rate": 1.9817803975002614e-05, + "loss": 0.4241, + "step": 678 + }, + { + "epoch": 0.3244728510841646, + "grad_norm": 0.6522844807838616, + "learning_rate": 1.9816801837968647e-05, + "loss": 0.4089, + "step": 679 + }, + { + "epoch": 0.3249507197897378, + "grad_norm": 0.7102809988759002, + "learning_rate": 1.9815796977927015e-05, + "loss": 0.4094, + "step": 680 + }, + { + "epoch": 0.3254285884953109, + "grad_norm": 0.9736649325340913, + "learning_rate": 1.981478939515645e-05, + "loss": 0.3995, + "step": 681 + }, + { + "epoch": 0.32590645720088407, + "grad_norm": 0.6342355412436739, + "learning_rate": 1.981377908993644e-05, + "loss": 0.4086, + "step": 682 + }, + { + "epoch": 0.3263843259064572, + "grad_norm": 0.6557900154548946, + "learning_rate": 1.9812766062547218e-05, + "loss": 0.4167, + "step": 683 + }, + { + "epoch": 0.32686219461203037, + "grad_norm": 0.7089582460412668, + "learning_rate": 1.9811750313269785e-05, + "loss": 0.4184, + "step": 684 + }, + { + "epoch": 0.3273400633176035, + "grad_norm": 0.643622934429116, + "learning_rate": 1.9810731842385892e-05, + "loss": 0.4198, + "step": 685 + }, + { + "epoch": 0.3278179320231766, + "grad_norm": 0.6544401734455819, + "learning_rate": 1.9809710650178043e-05, + "loss": 0.4297, + "step": 686 + }, + { + "epoch": 0.32829580072874975, + "grad_norm": 0.6350037232509057, + "learning_rate": 1.9808686736929507e-05, + "loss": 0.4139, + "step": 687 + }, + { + "epoch": 0.3287736694343229, + "grad_norm": 0.6453998273855003, + "learning_rate": 1.9807660102924285e-05, + "loss": 0.4104, + "step": 688 + }, + { + "epoch": 0.32925153813989605, + "grad_norm": 0.6118435089195414, + "learning_rate": 1.980663074844716e-05, + "loss": 0.4439, + "step": 689 + }, + { + "epoch": 0.3297294068454692, + "grad_norm": 0.7137050965412601, + "learning_rate": 1.9805598673783644e-05, + "loss": 0.4051, + "step": 690 + }, + { + "epoch": 0.33020727555104235, + "grad_norm": 0.660701532438893, + "learning_rate": 1.980456387922003e-05, + "loss": 0.4247, + "step": 691 + }, + { + "epoch": 0.3306851442566155, + "grad_norm": 0.6219578421483081, + "learning_rate": 1.9803526365043342e-05, + "loss": 0.4311, + "step": 692 + }, + { + "epoch": 0.33116301296218864, + "grad_norm": 0.7398002604532392, + "learning_rate": 1.980248613154137e-05, + "loss": 0.4162, + "step": 693 + }, + { + "epoch": 0.3316408816677618, + "grad_norm": 0.594653754515913, + "learning_rate": 1.9801443179002664e-05, + "loss": 0.4291, + "step": 694 + }, + { + "epoch": 0.33211875037333494, + "grad_norm": 0.7039036720424287, + "learning_rate": 1.980039750771651e-05, + "loss": 0.4154, + "step": 695 + }, + { + "epoch": 0.3325966190789081, + "grad_norm": 0.6360396889801934, + "learning_rate": 1.9799349117972966e-05, + "loss": 0.4097, + "step": 696 + }, + { + "epoch": 0.33307448778448123, + "grad_norm": 0.650673130557327, + "learning_rate": 1.9798298010062834e-05, + "loss": 0.408, + "step": 697 + }, + { + "epoch": 0.3335523564900544, + "grad_norm": 0.6449814851092306, + "learning_rate": 1.979724418427767e-05, + "loss": 0.4237, + "step": 698 + }, + { + "epoch": 0.33403022519562753, + "grad_norm": 0.6952694171904437, + "learning_rate": 1.9796187640909793e-05, + "loss": 0.414, + "step": 699 + }, + { + "epoch": 0.3345080939012006, + "grad_norm": 0.6255490632329522, + "learning_rate": 1.9795128380252263e-05, + "loss": 0.4128, + "step": 700 + }, + { + "epoch": 0.33498596260677377, + "grad_norm": 0.7726068650204944, + "learning_rate": 1.9794066402598905e-05, + "loss": 0.3941, + "step": 701 + }, + { + "epoch": 0.3354638313123469, + "grad_norm": 0.7260875044062942, + "learning_rate": 1.9793001708244293e-05, + "loss": 0.4278, + "step": 702 + }, + { + "epoch": 0.33594170001792006, + "grad_norm": 0.6812812508438351, + "learning_rate": 1.9791934297483754e-05, + "loss": 0.4121, + "step": 703 + }, + { + "epoch": 0.3364195687234932, + "grad_norm": 0.6382204766381031, + "learning_rate": 1.9790864170613363e-05, + "loss": 0.4128, + "step": 704 + }, + { + "epoch": 0.33689743742906636, + "grad_norm": 0.7007245066237098, + "learning_rate": 1.978979132792996e-05, + "loss": 0.4226, + "step": 705 + }, + { + "epoch": 0.3373753061346395, + "grad_norm": 0.6498578233016445, + "learning_rate": 1.978871576973113e-05, + "loss": 0.4003, + "step": 706 + }, + { + "epoch": 0.33785317484021266, + "grad_norm": 0.8549420085188715, + "learning_rate": 1.9787637496315223e-05, + "loss": 0.4117, + "step": 707 + }, + { + "epoch": 0.3383310435457858, + "grad_norm": 0.7414633332196651, + "learning_rate": 1.978655650798132e-05, + "loss": 0.3992, + "step": 708 + }, + { + "epoch": 0.33880891225135895, + "grad_norm": 0.6427548458668187, + "learning_rate": 1.9785472805029274e-05, + "loss": 0.4097, + "step": 709 + }, + { + "epoch": 0.3392867809569321, + "grad_norm": 0.6277498179900414, + "learning_rate": 1.9784386387759684e-05, + "loss": 0.419, + "step": 710 + }, + { + "epoch": 0.33976464966250525, + "grad_norm": 0.7227930165728, + "learning_rate": 1.9783297256473905e-05, + "loss": 0.3995, + "step": 711 + }, + { + "epoch": 0.3402425183680784, + "grad_norm": 0.6041073332058529, + "learning_rate": 1.9782205411474042e-05, + "loss": 0.4028, + "step": 712 + }, + { + "epoch": 0.3407203870736515, + "grad_norm": 0.697676563953654, + "learning_rate": 1.978111085306295e-05, + "loss": 0.4131, + "step": 713 + }, + { + "epoch": 0.34119825577922464, + "grad_norm": 0.6626931153008405, + "learning_rate": 1.9780013581544245e-05, + "loss": 0.4364, + "step": 714 + }, + { + "epoch": 0.3416761244847978, + "grad_norm": 0.6841487019463591, + "learning_rate": 1.977891359722229e-05, + "loss": 0.423, + "step": 715 + }, + { + "epoch": 0.34215399319037093, + "grad_norm": 0.7564619275323727, + "learning_rate": 1.9777810900402203e-05, + "loss": 0.4161, + "step": 716 + }, + { + "epoch": 0.3426318618959441, + "grad_norm": 1.3658709278718753, + "learning_rate": 1.9776705491389844e-05, + "loss": 0.4109, + "step": 717 + }, + { + "epoch": 0.3431097306015172, + "grad_norm": 0.7907397360768815, + "learning_rate": 1.9775597370491838e-05, + "loss": 0.4051, + "step": 718 + }, + { + "epoch": 0.3435875993070904, + "grad_norm": 0.6514561615066782, + "learning_rate": 1.977448653801557e-05, + "loss": 0.4034, + "step": 719 + }, + { + "epoch": 0.3440654680126635, + "grad_norm": 0.6824032819742455, + "learning_rate": 1.9773372994269147e-05, + "loss": 0.4276, + "step": 720 + }, + { + "epoch": 0.34454333671823667, + "grad_norm": 0.6883595228191173, + "learning_rate": 1.9772256739561454e-05, + "loss": 0.4267, + "step": 721 + }, + { + "epoch": 0.3450212054238098, + "grad_norm": 0.7266627480718902, + "learning_rate": 1.9771137774202126e-05, + "loss": 0.4171, + "step": 722 + }, + { + "epoch": 0.34549907412938297, + "grad_norm": 0.6857743582824786, + "learning_rate": 1.9770016098501535e-05, + "loss": 0.4133, + "step": 723 + }, + { + "epoch": 0.3459769428349561, + "grad_norm": 0.8685391109702237, + "learning_rate": 1.9768891712770824e-05, + "loss": 0.394, + "step": 724 + }, + { + "epoch": 0.34645481154052926, + "grad_norm": 0.6842643433044535, + "learning_rate": 1.976776461732187e-05, + "loss": 0.4152, + "step": 725 + }, + { + "epoch": 0.3469326802461024, + "grad_norm": 0.7671231877298523, + "learning_rate": 1.976663481246731e-05, + "loss": 0.4238, + "step": 726 + }, + { + "epoch": 0.3474105489516755, + "grad_norm": 0.6577325703350719, + "learning_rate": 1.9765502298520534e-05, + "loss": 0.412, + "step": 727 + }, + { + "epoch": 0.34788841765724865, + "grad_norm": 0.6791922311294679, + "learning_rate": 1.976436707579568e-05, + "loss": 0.4071, + "step": 728 + }, + { + "epoch": 0.3483662863628218, + "grad_norm": 0.7164375380905718, + "learning_rate": 1.9763229144607643e-05, + "loss": 0.4213, + "step": 729 + }, + { + "epoch": 0.34884415506839495, + "grad_norm": 0.6448079596578629, + "learning_rate": 1.976208850527206e-05, + "loss": 0.4264, + "step": 730 + }, + { + "epoch": 0.3493220237739681, + "grad_norm": 0.6810860347020636, + "learning_rate": 1.9760945158105326e-05, + "loss": 0.4401, + "step": 731 + }, + { + "epoch": 0.34979989247954124, + "grad_norm": 0.6508629347760229, + "learning_rate": 1.975979910342458e-05, + "loss": 0.404, + "step": 732 + }, + { + "epoch": 0.3502777611851144, + "grad_norm": 0.667860785115438, + "learning_rate": 1.975865034154773e-05, + "loss": 0.4189, + "step": 733 + }, + { + "epoch": 0.35075562989068754, + "grad_norm": 0.5960633232464225, + "learning_rate": 1.975749887279341e-05, + "loss": 0.4063, + "step": 734 + }, + { + "epoch": 0.3512334985962607, + "grad_norm": 0.6676835117003757, + "learning_rate": 1.9756344697481027e-05, + "loss": 0.4168, + "step": 735 + }, + { + "epoch": 0.35171136730183383, + "grad_norm": 0.6027086678736178, + "learning_rate": 1.975518781593072e-05, + "loss": 0.3967, + "step": 736 + }, + { + "epoch": 0.352189236007407, + "grad_norm": 0.7011001055447835, + "learning_rate": 1.975402822846339e-05, + "loss": 0.409, + "step": 737 + }, + { + "epoch": 0.35266710471298013, + "grad_norm": 0.6173498609221076, + "learning_rate": 1.9752865935400692e-05, + "loss": 0.4069, + "step": 738 + }, + { + "epoch": 0.3531449734185533, + "grad_norm": 0.7500007957464343, + "learning_rate": 1.975170093706502e-05, + "loss": 0.4039, + "step": 739 + }, + { + "epoch": 0.35362284212412637, + "grad_norm": 0.586011297625854, + "learning_rate": 1.975053323377952e-05, + "loss": 0.4071, + "step": 740 + }, + { + "epoch": 0.3541007108296995, + "grad_norm": 0.749881324377058, + "learning_rate": 1.9749362825868105e-05, + "loss": 0.4236, + "step": 741 + }, + { + "epoch": 0.35457857953527266, + "grad_norm": 0.6097150903798377, + "learning_rate": 1.9748189713655414e-05, + "loss": 0.4343, + "step": 742 + }, + { + "epoch": 0.3550564482408458, + "grad_norm": 0.5888296839949839, + "learning_rate": 1.9747013897466852e-05, + "loss": 0.413, + "step": 743 + }, + { + "epoch": 0.35553431694641896, + "grad_norm": 0.6218972588587672, + "learning_rate": 1.974583537762857e-05, + "loss": 0.4002, + "step": 744 + }, + { + "epoch": 0.3560121856519921, + "grad_norm": 0.6244729217199203, + "learning_rate": 1.9744654154467468e-05, + "loss": 0.4249, + "step": 745 + }, + { + "epoch": 0.35649005435756526, + "grad_norm": 0.6019630528796382, + "learning_rate": 1.9743470228311195e-05, + "loss": 0.4108, + "step": 746 + }, + { + "epoch": 0.3569679230631384, + "grad_norm": 0.6053335199202433, + "learning_rate": 1.9742283599488154e-05, + "loss": 0.4164, + "step": 747 + }, + { + "epoch": 0.35744579176871155, + "grad_norm": 0.5390394035508936, + "learning_rate": 1.974109426832749e-05, + "loss": 0.4269, + "step": 748 + }, + { + "epoch": 0.3579236604742847, + "grad_norm": 0.6864448137729777, + "learning_rate": 1.973990223515911e-05, + "loss": 0.4151, + "step": 749 + }, + { + "epoch": 0.35840152917985785, + "grad_norm": 0.6117555520134601, + "learning_rate": 1.9738707500313655e-05, + "loss": 0.4127, + "step": 750 + }, + { + "epoch": 0.358879397885431, + "grad_norm": 0.6231362351908816, + "learning_rate": 1.973751006412253e-05, + "loss": 0.4107, + "step": 751 + }, + { + "epoch": 0.35935726659100414, + "grad_norm": 0.624929534918908, + "learning_rate": 1.973630992691788e-05, + "loss": 0.4006, + "step": 752 + }, + { + "epoch": 0.3598351352965773, + "grad_norm": 0.6397155829395167, + "learning_rate": 1.97351070890326e-05, + "loss": 0.4036, + "step": 753 + }, + { + "epoch": 0.3603130040021504, + "grad_norm": 0.6568169413470653, + "learning_rate": 1.9733901550800342e-05, + "loss": 0.3832, + "step": 754 + }, + { + "epoch": 0.36079087270772353, + "grad_norm": 0.6058849776764632, + "learning_rate": 1.9732693312555492e-05, + "loss": 0.4093, + "step": 755 + }, + { + "epoch": 0.3612687414132967, + "grad_norm": 0.6417564281631485, + "learning_rate": 1.9731482374633203e-05, + "loss": 0.4261, + "step": 756 + }, + { + "epoch": 0.3617466101188698, + "grad_norm": 0.6139727827460876, + "learning_rate": 1.973026873736936e-05, + "loss": 0.4099, + "step": 757 + }, + { + "epoch": 0.362224478824443, + "grad_norm": 0.6510466515537121, + "learning_rate": 1.972905240110061e-05, + "loss": 0.4054, + "step": 758 + }, + { + "epoch": 0.3627023475300161, + "grad_norm": 0.6367387723523807, + "learning_rate": 1.9727833366164342e-05, + "loss": 0.4288, + "step": 759 + }, + { + "epoch": 0.36318021623558927, + "grad_norm": 0.7047833872183114, + "learning_rate": 1.9726611632898693e-05, + "loss": 0.4184, + "step": 760 + }, + { + "epoch": 0.3636580849411624, + "grad_norm": 0.6625607174095874, + "learning_rate": 1.9725387201642553e-05, + "loss": 0.3984, + "step": 761 + }, + { + "epoch": 0.36413595364673557, + "grad_norm": 0.7543210649256787, + "learning_rate": 1.9724160072735553e-05, + "loss": 0.4157, + "step": 762 + }, + { + "epoch": 0.3646138223523087, + "grad_norm": 0.8409356525795596, + "learning_rate": 1.9722930246518083e-05, + "loss": 0.3997, + "step": 763 + }, + { + "epoch": 0.36509169105788186, + "grad_norm": 0.6419317088471457, + "learning_rate": 1.9721697723331273e-05, + "loss": 0.4059, + "step": 764 + }, + { + "epoch": 0.365569559763455, + "grad_norm": 0.739982349078809, + "learning_rate": 1.9720462503517e-05, + "loss": 0.4118, + "step": 765 + }, + { + "epoch": 0.36604742846902816, + "grad_norm": 0.6898565679952424, + "learning_rate": 1.9719224587417896e-05, + "loss": 0.4039, + "step": 766 + }, + { + "epoch": 0.3665252971746013, + "grad_norm": 0.7612996038765697, + "learning_rate": 1.971798397537733e-05, + "loss": 0.4153, + "step": 767 + }, + { + "epoch": 0.3670031658801744, + "grad_norm": 0.6660225921682122, + "learning_rate": 1.971674066773944e-05, + "loss": 0.4148, + "step": 768 + }, + { + "epoch": 0.36748103458574755, + "grad_norm": 0.7220884614015122, + "learning_rate": 1.9715494664849088e-05, + "loss": 0.4055, + "step": 769 + }, + { + "epoch": 0.3679589032913207, + "grad_norm": 0.677877244256832, + "learning_rate": 1.971424596705189e-05, + "loss": 0.4119, + "step": 770 + }, + { + "epoch": 0.36843677199689384, + "grad_norm": 0.7526211652684072, + "learning_rate": 1.971299457469422e-05, + "loss": 0.4177, + "step": 771 + }, + { + "epoch": 0.368914640702467, + "grad_norm": 0.6114869523814213, + "learning_rate": 1.971174048812319e-05, + "loss": 0.4101, + "step": 772 + }, + { + "epoch": 0.36939250940804014, + "grad_norm": 0.6899914780670549, + "learning_rate": 1.971048370768666e-05, + "loss": 0.4351, + "step": 773 + }, + { + "epoch": 0.3698703781136133, + "grad_norm": 0.6192089565424979, + "learning_rate": 1.9709224233733236e-05, + "loss": 0.4042, + "step": 774 + }, + { + "epoch": 0.37034824681918643, + "grad_norm": 0.7355290743652367, + "learning_rate": 1.9707962066612278e-05, + "loss": 0.4027, + "step": 775 + }, + { + "epoch": 0.3708261155247596, + "grad_norm": 0.7403645775826277, + "learning_rate": 1.9706697206673892e-05, + "loss": 0.4023, + "step": 776 + }, + { + "epoch": 0.37130398423033273, + "grad_norm": 0.6999584425787945, + "learning_rate": 1.9705429654268925e-05, + "loss": 0.4097, + "step": 777 + }, + { + "epoch": 0.3717818529359059, + "grad_norm": 0.6998503016090031, + "learning_rate": 1.9704159409748967e-05, + "loss": 0.4081, + "step": 778 + }, + { + "epoch": 0.372259721641479, + "grad_norm": 0.6443494078717972, + "learning_rate": 1.9702886473466368e-05, + "loss": 0.4077, + "step": 779 + }, + { + "epoch": 0.3727375903470522, + "grad_norm": 0.6852044402539894, + "learning_rate": 1.970161084577422e-05, + "loss": 0.4193, + "step": 780 + }, + { + "epoch": 0.37321545905262526, + "grad_norm": 0.6236384893553105, + "learning_rate": 1.970033252702636e-05, + "loss": 0.4031, + "step": 781 + }, + { + "epoch": 0.3736933277581984, + "grad_norm": 0.6469866739392119, + "learning_rate": 1.969905151757736e-05, + "loss": 0.4099, + "step": 782 + }, + { + "epoch": 0.37417119646377156, + "grad_norm": 0.6504567142773998, + "learning_rate": 1.9697767817782565e-05, + "loss": 0.4075, + "step": 783 + }, + { + "epoch": 0.3746490651693447, + "grad_norm": 0.7743161467373921, + "learning_rate": 1.969648142799804e-05, + "loss": 0.4241, + "step": 784 + }, + { + "epoch": 0.37512693387491786, + "grad_norm": 0.61791736904947, + "learning_rate": 1.9695192348580606e-05, + "loss": 0.4073, + "step": 785 + }, + { + "epoch": 0.375604802580491, + "grad_norm": 0.6551482938668043, + "learning_rate": 1.969390057988784e-05, + "loss": 0.4157, + "step": 786 + }, + { + "epoch": 0.37608267128606415, + "grad_norm": 0.6241776375892095, + "learning_rate": 1.9692606122278047e-05, + "loss": 0.3991, + "step": 787 + }, + { + "epoch": 0.3765605399916373, + "grad_norm": 0.8697779384559257, + "learning_rate": 1.9691308976110293e-05, + "loss": 0.3943, + "step": 788 + }, + { + "epoch": 0.37703840869721045, + "grad_norm": 0.6646525801024487, + "learning_rate": 1.969000914174438e-05, + "loss": 0.4011, + "step": 789 + }, + { + "epoch": 0.3775162774027836, + "grad_norm": 0.6842360997446302, + "learning_rate": 1.9688706619540863e-05, + "loss": 0.4229, + "step": 790 + }, + { + "epoch": 0.37799414610835674, + "grad_norm": 0.6269010992340172, + "learning_rate": 1.9687401409861032e-05, + "loss": 0.4165, + "step": 791 + }, + { + "epoch": 0.3784720148139299, + "grad_norm": 0.6948929157285494, + "learning_rate": 1.9686093513066933e-05, + "loss": 0.4189, + "step": 792 + }, + { + "epoch": 0.37894988351950304, + "grad_norm": 0.6938250597089114, + "learning_rate": 1.9684782929521355e-05, + "loss": 0.4382, + "step": 793 + }, + { + "epoch": 0.3794277522250762, + "grad_norm": 0.710536040353905, + "learning_rate": 1.9683469659587826e-05, + "loss": 0.4018, + "step": 794 + }, + { + "epoch": 0.3799056209306493, + "grad_norm": 0.7765930152534725, + "learning_rate": 1.968215370363063e-05, + "loss": 0.4025, + "step": 795 + }, + { + "epoch": 0.3803834896362224, + "grad_norm": 0.673786846379775, + "learning_rate": 1.9680835062014784e-05, + "loss": 0.4072, + "step": 796 + }, + { + "epoch": 0.3808613583417956, + "grad_norm": 0.6624205844915216, + "learning_rate": 1.967951373510606e-05, + "loss": 0.4029, + "step": 797 + }, + { + "epoch": 0.3813392270473687, + "grad_norm": 0.6702031183000383, + "learning_rate": 1.967818972327097e-05, + "loss": 0.3933, + "step": 798 + }, + { + "epoch": 0.38181709575294187, + "grad_norm": 0.6643830669329299, + "learning_rate": 1.967686302687677e-05, + "loss": 0.4078, + "step": 799 + }, + { + "epoch": 0.382294964458515, + "grad_norm": 0.6820114775894023, + "learning_rate": 1.9675533646291463e-05, + "loss": 0.4019, + "step": 800 + }, + { + "epoch": 0.38277283316408817, + "grad_norm": 0.6654337293955677, + "learning_rate": 1.9674201581883796e-05, + "loss": 0.4143, + "step": 801 + }, + { + "epoch": 0.3832507018696613, + "grad_norm": 0.6277257073491326, + "learning_rate": 1.9672866834023263e-05, + "loss": 0.3954, + "step": 802 + }, + { + "epoch": 0.38372857057523446, + "grad_norm": 1.181246701089062, + "learning_rate": 1.9671529403080095e-05, + "loss": 0.4115, + "step": 803 + }, + { + "epoch": 0.3842064392808076, + "grad_norm": 0.6999373807721258, + "learning_rate": 1.9670189289425273e-05, + "loss": 0.4085, + "step": 804 + }, + { + "epoch": 0.38468430798638076, + "grad_norm": 0.6351836651328577, + "learning_rate": 1.9668846493430522e-05, + "loss": 0.4024, + "step": 805 + }, + { + "epoch": 0.3851621766919539, + "grad_norm": 0.6200352215300432, + "learning_rate": 1.966750101546831e-05, + "loss": 0.4217, + "step": 806 + }, + { + "epoch": 0.38564004539752705, + "grad_norm": 0.6312607230160627, + "learning_rate": 1.9666152855911845e-05, + "loss": 0.4148, + "step": 807 + }, + { + "epoch": 0.38611791410310015, + "grad_norm": 0.6247739745978539, + "learning_rate": 1.966480201513509e-05, + "loss": 0.4129, + "step": 808 + }, + { + "epoch": 0.3865957828086733, + "grad_norm": 23.521606298262835, + "learning_rate": 1.966344849351274e-05, + "loss": 0.3985, + "step": 809 + }, + { + "epoch": 0.38707365151424644, + "grad_norm": 0.7307789504862988, + "learning_rate": 1.9662092291420233e-05, + "loss": 0.4355, + "step": 810 + }, + { + "epoch": 0.3875515202198196, + "grad_norm": 0.6338895925736729, + "learning_rate": 1.9660733409233763e-05, + "loss": 0.4114, + "step": 811 + }, + { + "epoch": 0.38802938892539274, + "grad_norm": 1.022414001164045, + "learning_rate": 1.965937184733026e-05, + "loss": 0.4177, + "step": 812 + }, + { + "epoch": 0.3885072576309659, + "grad_norm": 0.6257154243428295, + "learning_rate": 1.965800760608739e-05, + "loss": 0.4238, + "step": 813 + }, + { + "epoch": 0.38898512633653903, + "grad_norm": 0.6578526200723673, + "learning_rate": 1.965664068588358e-05, + "loss": 0.4027, + "step": 814 + }, + { + "epoch": 0.3894629950421122, + "grad_norm": 0.6394160956438139, + "learning_rate": 1.965527108709798e-05, + "loss": 0.4047, + "step": 815 + }, + { + "epoch": 0.38994086374768533, + "grad_norm": 0.6469488408202481, + "learning_rate": 1.96538988101105e-05, + "loss": 0.4035, + "step": 816 + }, + { + "epoch": 0.3904187324532585, + "grad_norm": 0.6625645754583935, + "learning_rate": 1.9652523855301783e-05, + "loss": 0.4144, + "step": 817 + }, + { + "epoch": 0.3908966011588316, + "grad_norm": 0.6841818131947363, + "learning_rate": 1.9651146223053213e-05, + "loss": 0.389, + "step": 818 + }, + { + "epoch": 0.3913744698644048, + "grad_norm": 0.6438625312526988, + "learning_rate": 1.9649765913746923e-05, + "loss": 0.4111, + "step": 819 + }, + { + "epoch": 0.3918523385699779, + "grad_norm": 1.01601163370014, + "learning_rate": 1.964838292776579e-05, + "loss": 0.4031, + "step": 820 + }, + { + "epoch": 0.39233020727555107, + "grad_norm": 0.677050070767038, + "learning_rate": 1.9646997265493426e-05, + "loss": 0.4149, + "step": 821 + }, + { + "epoch": 0.39280807598112416, + "grad_norm": 0.6738615859070755, + "learning_rate": 1.9645608927314194e-05, + "loss": 0.4123, + "step": 822 + }, + { + "epoch": 0.3932859446866973, + "grad_norm": 0.6651159143963696, + "learning_rate": 1.9644217913613187e-05, + "loss": 0.4057, + "step": 823 + }, + { + "epoch": 0.39376381339227046, + "grad_norm": 0.7691000084987812, + "learning_rate": 1.9642824224776252e-05, + "loss": 0.4046, + "step": 824 + }, + { + "epoch": 0.3942416820978436, + "grad_norm": 0.7211243084062602, + "learning_rate": 1.9641427861189973e-05, + "loss": 0.4089, + "step": 825 + }, + { + "epoch": 0.39471955080341675, + "grad_norm": 0.63694129075116, + "learning_rate": 1.964002882324168e-05, + "loss": 0.3879, + "step": 826 + }, + { + "epoch": 0.3951974195089899, + "grad_norm": 0.5886615227252643, + "learning_rate": 1.9638627111319437e-05, + "loss": 0.4098, + "step": 827 + }, + { + "epoch": 0.39567528821456305, + "grad_norm": 0.6720674176320514, + "learning_rate": 1.963722272581206e-05, + "loss": 0.4014, + "step": 828 + }, + { + "epoch": 0.3961531569201362, + "grad_norm": 0.5791735119683044, + "learning_rate": 1.963581566710909e-05, + "loss": 0.4008, + "step": 829 + }, + { + "epoch": 0.39663102562570934, + "grad_norm": 0.6464775042145513, + "learning_rate": 1.963440593560083e-05, + "loss": 0.4164, + "step": 830 + }, + { + "epoch": 0.3971088943312825, + "grad_norm": 0.6144881514988998, + "learning_rate": 1.963299353167831e-05, + "loss": 0.411, + "step": 831 + }, + { + "epoch": 0.39758676303685564, + "grad_norm": 0.670072169690497, + "learning_rate": 1.9631578455733307e-05, + "loss": 0.4089, + "step": 832 + }, + { + "epoch": 0.3980646317424288, + "grad_norm": 0.6989267984529208, + "learning_rate": 1.9630160708158343e-05, + "loss": 0.406, + "step": 833 + }, + { + "epoch": 0.39854250044800194, + "grad_norm": 1.073909712530086, + "learning_rate": 1.9628740289346668e-05, + "loss": 0.395, + "step": 834 + }, + { + "epoch": 0.3990203691535751, + "grad_norm": 0.9121845119973022, + "learning_rate": 1.9627317199692287e-05, + "loss": 0.4055, + "step": 835 + }, + { + "epoch": 0.3994982378591482, + "grad_norm": 0.6116187864851871, + "learning_rate": 1.9625891439589933e-05, + "loss": 0.3942, + "step": 836 + }, + { + "epoch": 0.3999761065647213, + "grad_norm": 0.6622950577654838, + "learning_rate": 1.9624463009435097e-05, + "loss": 0.4037, + "step": 837 + }, + { + "epoch": 0.40045397527029447, + "grad_norm": 0.7953106318152663, + "learning_rate": 1.9623031909623993e-05, + "loss": 0.4165, + "step": 838 + }, + { + "epoch": 0.4009318439758676, + "grad_norm": 0.6653838350357102, + "learning_rate": 1.962159814055358e-05, + "loss": 0.4084, + "step": 839 + }, + { + "epoch": 0.40140971268144077, + "grad_norm": 0.6890324725741804, + "learning_rate": 1.962016170262157e-05, + "loss": 0.4236, + "step": 840 + }, + { + "epoch": 0.4018875813870139, + "grad_norm": 0.6550797312828021, + "learning_rate": 1.96187225962264e-05, + "loss": 0.4096, + "step": 841 + }, + { + "epoch": 0.40236545009258706, + "grad_norm": 0.7180548191723333, + "learning_rate": 1.9617280821767253e-05, + "loss": 0.3889, + "step": 842 + }, + { + "epoch": 0.4028433187981602, + "grad_norm": 0.6990218911792896, + "learning_rate": 1.9615836379644054e-05, + "loss": 0.3876, + "step": 843 + }, + { + "epoch": 0.40332118750373336, + "grad_norm": 0.8178032329800812, + "learning_rate": 1.961438927025746e-05, + "loss": 0.3978, + "step": 844 + }, + { + "epoch": 0.4037990562093065, + "grad_norm": 0.5792010743662441, + "learning_rate": 1.961293949400888e-05, + "loss": 0.4065, + "step": 845 + }, + { + "epoch": 0.40427692491487965, + "grad_norm": 0.6641398130174404, + "learning_rate": 1.9611487051300454e-05, + "loss": 0.3914, + "step": 846 + }, + { + "epoch": 0.4047547936204528, + "grad_norm": 0.6195936771890139, + "learning_rate": 1.961003194253506e-05, + "loss": 0.402, + "step": 847 + }, + { + "epoch": 0.40523266232602595, + "grad_norm": 0.6768052082314968, + "learning_rate": 1.9608574168116324e-05, + "loss": 0.4199, + "step": 848 + }, + { + "epoch": 0.40571053103159904, + "grad_norm": 0.6082693604746662, + "learning_rate": 1.960711372844861e-05, + "loss": 0.4074, + "step": 849 + }, + { + "epoch": 0.4061883997371722, + "grad_norm": 0.6268219384593703, + "learning_rate": 1.960565062393701e-05, + "loss": 0.4157, + "step": 850 + }, + { + "epoch": 0.40666626844274534, + "grad_norm": 0.6158932362492403, + "learning_rate": 1.960418485498737e-05, + "loss": 0.3942, + "step": 851 + }, + { + "epoch": 0.4071441371483185, + "grad_norm": 0.7217988468496267, + "learning_rate": 1.9602716422006266e-05, + "loss": 0.4123, + "step": 852 + }, + { + "epoch": 0.40762200585389163, + "grad_norm": 0.6549013187083295, + "learning_rate": 1.9601245325401016e-05, + "loss": 0.3881, + "step": 853 + }, + { + "epoch": 0.4080998745594648, + "grad_norm": 0.6163590041751829, + "learning_rate": 1.9599771565579673e-05, + "loss": 0.4224, + "step": 854 + }, + { + "epoch": 0.40857774326503793, + "grad_norm": 0.8712876541825809, + "learning_rate": 1.9598295142951035e-05, + "loss": 0.4042, + "step": 855 + }, + { + "epoch": 0.4090556119706111, + "grad_norm": 0.7278398539493856, + "learning_rate": 1.959681605792464e-05, + "loss": 0.4164, + "step": 856 + }, + { + "epoch": 0.4095334806761842, + "grad_norm": 0.600386439964583, + "learning_rate": 1.9595334310910753e-05, + "loss": 0.4135, + "step": 857 + }, + { + "epoch": 0.4100113493817574, + "grad_norm": 0.7081480085926267, + "learning_rate": 1.9593849902320386e-05, + "loss": 0.4027, + "step": 858 + }, + { + "epoch": 0.4104892180873305, + "grad_norm": 0.6188620046952817, + "learning_rate": 1.9592362832565287e-05, + "loss": 0.4016, + "step": 859 + }, + { + "epoch": 0.41096708679290367, + "grad_norm": 0.682019844909097, + "learning_rate": 1.9590873102057948e-05, + "loss": 0.4071, + "step": 860 + }, + { + "epoch": 0.4114449554984768, + "grad_norm": 0.6603131181430578, + "learning_rate": 1.9589380711211588e-05, + "loss": 0.3946, + "step": 861 + }, + { + "epoch": 0.41192282420404996, + "grad_norm": 0.6577106617903441, + "learning_rate": 1.9587885660440176e-05, + "loss": 0.4133, + "step": 862 + }, + { + "epoch": 0.41240069290962306, + "grad_norm": 0.6628545122914132, + "learning_rate": 1.9586387950158406e-05, + "loss": 0.4131, + "step": 863 + }, + { + "epoch": 0.4128785616151962, + "grad_norm": 0.6190751337142328, + "learning_rate": 1.958488758078172e-05, + "loss": 0.4191, + "step": 864 + }, + { + "epoch": 0.41335643032076935, + "grad_norm": 0.7460256010176625, + "learning_rate": 1.9583384552726294e-05, + "loss": 0.3936, + "step": 865 + }, + { + "epoch": 0.4138342990263425, + "grad_norm": 0.5886477110046239, + "learning_rate": 1.9581878866409042e-05, + "loss": 0.4069, + "step": 866 + }, + { + "epoch": 0.41431216773191565, + "grad_norm": 0.6886448680334144, + "learning_rate": 1.9580370522247614e-05, + "loss": 0.397, + "step": 867 + }, + { + "epoch": 0.4147900364374888, + "grad_norm": 0.6232904922457612, + "learning_rate": 1.9578859520660396e-05, + "loss": 0.4029, + "step": 868 + }, + { + "epoch": 0.41526790514306194, + "grad_norm": 0.5941334358676716, + "learning_rate": 1.9577345862066518e-05, + "loss": 0.3878, + "step": 869 + }, + { + "epoch": 0.4157457738486351, + "grad_norm": 1.0139675059105102, + "learning_rate": 1.957582954688584e-05, + "loss": 0.3952, + "step": 870 + }, + { + "epoch": 0.41622364255420824, + "grad_norm": 3.476711388368822, + "learning_rate": 1.9574310575538956e-05, + "loss": 0.4103, + "step": 871 + }, + { + "epoch": 0.4167015112597814, + "grad_norm": 0.7611765590691459, + "learning_rate": 1.9572788948447206e-05, + "loss": 0.4179, + "step": 872 + }, + { + "epoch": 0.41717937996535454, + "grad_norm": 0.600202395984294, + "learning_rate": 1.9571264666032667e-05, + "loss": 0.4074, + "step": 873 + }, + { + "epoch": 0.4176572486709277, + "grad_norm": 0.651033950877546, + "learning_rate": 1.9569737728718143e-05, + "loss": 0.4007, + "step": 874 + }, + { + "epoch": 0.41813511737650083, + "grad_norm": 0.6518303558888998, + "learning_rate": 1.9568208136927177e-05, + "loss": 0.4078, + "step": 875 + }, + { + "epoch": 0.4186129860820739, + "grad_norm": 0.6107727192896184, + "learning_rate": 1.956667589108406e-05, + "loss": 0.4205, + "step": 876 + }, + { + "epoch": 0.41909085478764707, + "grad_norm": 0.5977988275186865, + "learning_rate": 1.95651409916138e-05, + "loss": 0.3981, + "step": 877 + }, + { + "epoch": 0.4195687234932202, + "grad_norm": 0.6053736216356521, + "learning_rate": 1.9563603438942155e-05, + "loss": 0.41, + "step": 878 + }, + { + "epoch": 0.42004659219879337, + "grad_norm": 0.7201616837003876, + "learning_rate": 1.9562063233495615e-05, + "loss": 0.4086, + "step": 879 + }, + { + "epoch": 0.4205244609043665, + "grad_norm": 0.6109033092609298, + "learning_rate": 1.9560520375701408e-05, + "loss": 0.4061, + "step": 880 + }, + { + "epoch": 0.42100232960993966, + "grad_norm": 0.6186670618629603, + "learning_rate": 1.9558974865987494e-05, + "loss": 0.4207, + "step": 881 + }, + { + "epoch": 0.4214801983155128, + "grad_norm": 0.6147640108712225, + "learning_rate": 1.9557426704782564e-05, + "loss": 0.4166, + "step": 882 + }, + { + "epoch": 0.42195806702108596, + "grad_norm": 0.6202320300282839, + "learning_rate": 1.9555875892516064e-05, + "loss": 0.4055, + "step": 883 + }, + { + "epoch": 0.4224359357266591, + "grad_norm": 0.6438580742369829, + "learning_rate": 1.955432242961815e-05, + "loss": 0.4077, + "step": 884 + }, + { + "epoch": 0.42291380443223225, + "grad_norm": 0.6294856559072706, + "learning_rate": 1.9552766316519726e-05, + "loss": 0.4149, + "step": 885 + }, + { + "epoch": 0.4233916731378054, + "grad_norm": 0.6161283088521784, + "learning_rate": 1.955120755365244e-05, + "loss": 0.411, + "step": 886 + }, + { + "epoch": 0.42386954184337855, + "grad_norm": 0.6014577535152139, + "learning_rate": 1.9549646141448657e-05, + "loss": 0.3914, + "step": 887 + }, + { + "epoch": 0.4243474105489517, + "grad_norm": 0.6848333020026872, + "learning_rate": 1.9548082080341486e-05, + "loss": 0.4001, + "step": 888 + }, + { + "epoch": 0.42482527925452485, + "grad_norm": 0.6720870979122917, + "learning_rate": 1.954651537076477e-05, + "loss": 0.4238, + "step": 889 + }, + { + "epoch": 0.42530314796009794, + "grad_norm": 0.5818304388573133, + "learning_rate": 1.9544946013153093e-05, + "loss": 0.4042, + "step": 890 + }, + { + "epoch": 0.4257810166656711, + "grad_norm": 0.7744562017315486, + "learning_rate": 1.9543374007941756e-05, + "loss": 0.4238, + "step": 891 + }, + { + "epoch": 0.42625888537124423, + "grad_norm": 0.7762559725025642, + "learning_rate": 1.9541799355566813e-05, + "loss": 0.4111, + "step": 892 + }, + { + "epoch": 0.4267367540768174, + "grad_norm": 0.6599610950235643, + "learning_rate": 1.9540222056465046e-05, + "loss": 0.411, + "step": 893 + }, + { + "epoch": 0.42721462278239053, + "grad_norm": 0.7413734398856531, + "learning_rate": 1.9538642111073966e-05, + "loss": 0.417, + "step": 894 + }, + { + "epoch": 0.4276924914879637, + "grad_norm": 0.6502451354585309, + "learning_rate": 1.9537059519831822e-05, + "loss": 0.416, + "step": 895 + }, + { + "epoch": 0.4281703601935368, + "grad_norm": 0.8797654195920014, + "learning_rate": 1.9535474283177597e-05, + "loss": 0.4156, + "step": 896 + }, + { + "epoch": 0.42864822889911, + "grad_norm": 0.5595634974263763, + "learning_rate": 1.953388640155101e-05, + "loss": 0.4082, + "step": 897 + }, + { + "epoch": 0.4291260976046831, + "grad_norm": 0.6530429716085568, + "learning_rate": 1.953229587539251e-05, + "loss": 0.4025, + "step": 898 + }, + { + "epoch": 0.42960396631025627, + "grad_norm": 0.6657647909949589, + "learning_rate": 1.953070270514328e-05, + "loss": 0.4143, + "step": 899 + }, + { + "epoch": 0.4300818350158294, + "grad_norm": 0.7463671068027646, + "learning_rate": 1.9529106891245244e-05, + "loss": 0.3887, + "step": 900 + }, + { + "epoch": 0.43055970372140256, + "grad_norm": 0.6059170836222347, + "learning_rate": 1.952750843414104e-05, + "loss": 0.4028, + "step": 901 + }, + { + "epoch": 0.4310375724269757, + "grad_norm": 0.6074415817250135, + "learning_rate": 1.9525907334274063e-05, + "loss": 0.3867, + "step": 902 + }, + { + "epoch": 0.4315154411325488, + "grad_norm": 0.6460281093432895, + "learning_rate": 1.9524303592088424e-05, + "loss": 0.403, + "step": 903 + }, + { + "epoch": 0.43199330983812195, + "grad_norm": 0.6192867647004487, + "learning_rate": 1.9522697208028975e-05, + "loss": 0.4004, + "step": 904 + }, + { + "epoch": 0.4324711785436951, + "grad_norm": 0.7247569688470655, + "learning_rate": 1.9521088182541298e-05, + "loss": 0.4167, + "step": 905 + }, + { + "epoch": 0.43294904724926825, + "grad_norm": 0.7590859276414997, + "learning_rate": 1.9519476516071706e-05, + "loss": 0.4002, + "step": 906 + }, + { + "epoch": 0.4334269159548414, + "grad_norm": 0.5841622232228564, + "learning_rate": 1.951786220906725e-05, + "loss": 0.4182, + "step": 907 + }, + { + "epoch": 0.43390478466041454, + "grad_norm": 0.6692111403784917, + "learning_rate": 1.951624526197571e-05, + "loss": 0.401, + "step": 908 + }, + { + "epoch": 0.4343826533659877, + "grad_norm": 0.6097916498450394, + "learning_rate": 1.95146256752456e-05, + "loss": 0.4001, + "step": 909 + }, + { + "epoch": 0.43486052207156084, + "grad_norm": 0.7357451300421103, + "learning_rate": 1.951300344932616e-05, + "loss": 0.4279, + "step": 910 + }, + { + "epoch": 0.435338390777134, + "grad_norm": 1.4472772781586367, + "learning_rate": 1.9511378584667372e-05, + "loss": 0.409, + "step": 911 + }, + { + "epoch": 0.43581625948270714, + "grad_norm": 0.6412845231251528, + "learning_rate": 1.950975108171994e-05, + "loss": 0.4232, + "step": 912 + }, + { + "epoch": 0.4362941281882803, + "grad_norm": 0.6815096104930907, + "learning_rate": 1.950812094093531e-05, + "loss": 0.3989, + "step": 913 + }, + { + "epoch": 0.43677199689385343, + "grad_norm": 0.6287441394449275, + "learning_rate": 1.950648816276565e-05, + "loss": 0.4141, + "step": 914 + }, + { + "epoch": 0.4372498655994266, + "grad_norm": 0.6475760769332336, + "learning_rate": 1.9504852747663862e-05, + "loss": 0.4116, + "step": 915 + }, + { + "epoch": 0.4377277343049997, + "grad_norm": 0.5697529534279385, + "learning_rate": 1.9503214696083587e-05, + "loss": 0.3941, + "step": 916 + }, + { + "epoch": 0.4382056030105728, + "grad_norm": 0.6214698860437475, + "learning_rate": 1.9501574008479188e-05, + "loss": 0.4199, + "step": 917 + }, + { + "epoch": 0.43868347171614597, + "grad_norm": 0.5978885664378845, + "learning_rate": 1.9499930685305767e-05, + "loss": 0.3966, + "step": 918 + }, + { + "epoch": 0.4391613404217191, + "grad_norm": 0.6051257671018444, + "learning_rate": 1.949828472701915e-05, + "loss": 0.4008, + "step": 919 + }, + { + "epoch": 0.43963920912729226, + "grad_norm": 0.6920182979659302, + "learning_rate": 1.9496636134075894e-05, + "loss": 0.3959, + "step": 920 + }, + { + "epoch": 0.4401170778328654, + "grad_norm": 0.650235126033297, + "learning_rate": 1.9494984906933293e-05, + "loss": 0.3951, + "step": 921 + }, + { + "epoch": 0.44059494653843856, + "grad_norm": 0.683430884067445, + "learning_rate": 1.9493331046049366e-05, + "loss": 0.398, + "step": 922 + }, + { + "epoch": 0.4410728152440117, + "grad_norm": 0.6452456126774382, + "learning_rate": 1.9491674551882867e-05, + "loss": 0.389, + "step": 923 + }, + { + "epoch": 0.44155068394958485, + "grad_norm": 0.695330236916925, + "learning_rate": 1.9490015424893277e-05, + "loss": 0.4016, + "step": 924 + }, + { + "epoch": 0.442028552655158, + "grad_norm": 0.6316596603137281, + "learning_rate": 1.9488353665540813e-05, + "loss": 0.3978, + "step": 925 + }, + { + "epoch": 0.44250642136073115, + "grad_norm": 0.8094826470406035, + "learning_rate": 1.9486689274286413e-05, + "loss": 0.3837, + "step": 926 + }, + { + "epoch": 0.4429842900663043, + "grad_norm": 0.8597357950448963, + "learning_rate": 1.9485022251591744e-05, + "loss": 0.4175, + "step": 927 + }, + { + "epoch": 0.44346215877187745, + "grad_norm": 0.6850435114555503, + "learning_rate": 1.9483352597919222e-05, + "loss": 0.416, + "step": 928 + }, + { + "epoch": 0.4439400274774506, + "grad_norm": 0.6105257300932792, + "learning_rate": 1.9481680313731973e-05, + "loss": 0.4087, + "step": 929 + }, + { + "epoch": 0.44441789618302374, + "grad_norm": 0.7783073049281799, + "learning_rate": 1.9480005399493857e-05, + "loss": 0.4084, + "step": 930 + }, + { + "epoch": 0.44489576488859683, + "grad_norm": 0.8160008351900309, + "learning_rate": 1.9478327855669468e-05, + "loss": 0.4102, + "step": 931 + }, + { + "epoch": 0.44537363359417, + "grad_norm": 0.6251532938868944, + "learning_rate": 1.9476647682724125e-05, + "loss": 0.4028, + "step": 932 + }, + { + "epoch": 0.44585150229974313, + "grad_norm": 0.7279316882881499, + "learning_rate": 1.9474964881123883e-05, + "loss": 0.4085, + "step": 933 + }, + { + "epoch": 0.4463293710053163, + "grad_norm": 0.6648782797147017, + "learning_rate": 1.9473279451335517e-05, + "loss": 0.412, + "step": 934 + }, + { + "epoch": 0.4468072397108894, + "grad_norm": 0.7245218627960338, + "learning_rate": 1.9471591393826536e-05, + "loss": 0.3952, + "step": 935 + }, + { + "epoch": 0.4472851084164626, + "grad_norm": 0.7468126769642631, + "learning_rate": 1.9469900709065176e-05, + "loss": 0.4059, + "step": 936 + }, + { + "epoch": 0.4477629771220357, + "grad_norm": 0.7157504712908894, + "learning_rate": 1.9468207397520413e-05, + "loss": 0.3994, + "step": 937 + }, + { + "epoch": 0.44824084582760887, + "grad_norm": 0.6310072705677632, + "learning_rate": 1.946651145966193e-05, + "loss": 0.406, + "step": 938 + }, + { + "epoch": 0.448718714533182, + "grad_norm": 0.6784353372400441, + "learning_rate": 1.9464812895960152e-05, + "loss": 0.4009, + "step": 939 + }, + { + "epoch": 0.44919658323875516, + "grad_norm": 0.6479984452905685, + "learning_rate": 1.9463111706886234e-05, + "loss": 0.4168, + "step": 940 + }, + { + "epoch": 0.4496744519443283, + "grad_norm": 0.6287167548848303, + "learning_rate": 1.9461407892912055e-05, + "loss": 0.4014, + "step": 941 + }, + { + "epoch": 0.45015232064990146, + "grad_norm": 0.7024078893645669, + "learning_rate": 1.9459701454510228e-05, + "loss": 0.4097, + "step": 942 + }, + { + "epoch": 0.4506301893554746, + "grad_norm": 0.6977676751624544, + "learning_rate": 1.945799239215408e-05, + "loss": 0.4026, + "step": 943 + }, + { + "epoch": 0.4511080580610477, + "grad_norm": 0.64820079330831, + "learning_rate": 1.945628070631768e-05, + "loss": 0.3986, + "step": 944 + }, + { + "epoch": 0.45158592676662085, + "grad_norm": 0.7286318274900389, + "learning_rate": 1.9454566397475813e-05, + "loss": 0.3888, + "step": 945 + }, + { + "epoch": 0.452063795472194, + "grad_norm": 1.4877125746903945, + "learning_rate": 1.9452849466104008e-05, + "loss": 0.4151, + "step": 946 + }, + { + "epoch": 0.45254166417776714, + "grad_norm": 0.6879684254386736, + "learning_rate": 1.9451129912678506e-05, + "loss": 0.3934, + "step": 947 + }, + { + "epoch": 0.4530195328833403, + "grad_norm": 0.6050674990436092, + "learning_rate": 1.9449407737676277e-05, + "loss": 0.4019, + "step": 948 + }, + { + "epoch": 0.45349740158891344, + "grad_norm": 0.6424207582798632, + "learning_rate": 1.9447682941575032e-05, + "loss": 0.4142, + "step": 949 + }, + { + "epoch": 0.4539752702944866, + "grad_norm": 0.6584464274725602, + "learning_rate": 1.944595552485319e-05, + "loss": 0.4112, + "step": 950 + }, + { + "epoch": 0.45445313900005974, + "grad_norm": 1.8201606803835755, + "learning_rate": 1.9444225487989912e-05, + "loss": 0.3986, + "step": 951 + }, + { + "epoch": 0.4549310077056329, + "grad_norm": 0.6649041164921152, + "learning_rate": 1.9442492831465075e-05, + "loss": 0.4124, + "step": 952 + }, + { + "epoch": 0.45540887641120603, + "grad_norm": 0.6103343936848789, + "learning_rate": 1.944075755575929e-05, + "loss": 0.402, + "step": 953 + }, + { + "epoch": 0.4558867451167792, + "grad_norm": 1.1183419241012187, + "learning_rate": 1.943901966135389e-05, + "loss": 0.4032, + "step": 954 + }, + { + "epoch": 0.4563646138223523, + "grad_norm": 0.6278256875792309, + "learning_rate": 1.943727914873094e-05, + "loss": 0.397, + "step": 955 + }, + { + "epoch": 0.4568424825279255, + "grad_norm": 0.5725976955845966, + "learning_rate": 1.943553601837322e-05, + "loss": 0.4117, + "step": 956 + }, + { + "epoch": 0.4573203512334986, + "grad_norm": 0.7341816058812604, + "learning_rate": 1.943379027076425e-05, + "loss": 0.389, + "step": 957 + }, + { + "epoch": 0.4577982199390717, + "grad_norm": 0.6161118906924941, + "learning_rate": 1.943204190638827e-05, + "loss": 0.402, + "step": 958 + }, + { + "epoch": 0.45827608864464486, + "grad_norm": 0.5931536750814665, + "learning_rate": 1.9430290925730245e-05, + "loss": 0.4064, + "step": 959 + }, + { + "epoch": 0.458753957350218, + "grad_norm": 0.6397321208831805, + "learning_rate": 1.9428537329275862e-05, + "loss": 0.4029, + "step": 960 + }, + { + "epoch": 0.45923182605579116, + "grad_norm": 0.5770485075045017, + "learning_rate": 1.942678111751154e-05, + "loss": 0.4154, + "step": 961 + }, + { + "epoch": 0.4597096947613643, + "grad_norm": 0.6983859402707672, + "learning_rate": 1.942502229092442e-05, + "loss": 0.3865, + "step": 962 + }, + { + "epoch": 0.46018756346693745, + "grad_norm": 0.566499169480957, + "learning_rate": 1.9423260850002375e-05, + "loss": 0.391, + "step": 963 + }, + { + "epoch": 0.4606654321725106, + "grad_norm": 0.6674856926358066, + "learning_rate": 1.9421496795233995e-05, + "loss": 0.4177, + "step": 964 + }, + { + "epoch": 0.46114330087808375, + "grad_norm": 0.6077827552367149, + "learning_rate": 1.941973012710859e-05, + "loss": 0.4064, + "step": 965 + }, + { + "epoch": 0.4616211695836569, + "grad_norm": 0.6393677236982209, + "learning_rate": 1.9417960846116214e-05, + "loss": 0.4079, + "step": 966 + }, + { + "epoch": 0.46209903828923005, + "grad_norm": 0.6681026939421527, + "learning_rate": 1.941618895274763e-05, + "loss": 0.3953, + "step": 967 + }, + { + "epoch": 0.4625769069948032, + "grad_norm": 0.6460174077865963, + "learning_rate": 1.9414414447494326e-05, + "loss": 0.4012, + "step": 968 + }, + { + "epoch": 0.46305477570037634, + "grad_norm": 0.6582432277442615, + "learning_rate": 1.9412637330848524e-05, + "loss": 0.4101, + "step": 969 + }, + { + "epoch": 0.4635326444059495, + "grad_norm": 0.6318462060701575, + "learning_rate": 1.941085760330316e-05, + "loss": 0.4059, + "step": 970 + }, + { + "epoch": 0.4640105131115226, + "grad_norm": 0.7203162939351494, + "learning_rate": 1.9409075265351904e-05, + "loss": 0.4137, + "step": 971 + }, + { + "epoch": 0.46448838181709573, + "grad_norm": 0.6894708541476223, + "learning_rate": 1.940729031748914e-05, + "loss": 0.409, + "step": 972 + }, + { + "epoch": 0.4649662505226689, + "grad_norm": 0.6082868118061796, + "learning_rate": 1.9405502760209988e-05, + "loss": 0.3811, + "step": 973 + }, + { + "epoch": 0.465444119228242, + "grad_norm": 0.575259488569313, + "learning_rate": 1.9403712594010275e-05, + "loss": 0.3959, + "step": 974 + }, + { + "epoch": 0.4659219879338152, + "grad_norm": 0.6187113385107887, + "learning_rate": 1.940191981938657e-05, + "loss": 0.3927, + "step": 975 + }, + { + "epoch": 0.4663998566393883, + "grad_norm": 0.5980885678751732, + "learning_rate": 1.9400124436836155e-05, + "loss": 0.403, + "step": 976 + }, + { + "epoch": 0.46687772534496147, + "grad_norm": 0.59607435141776, + "learning_rate": 1.9398326446857034e-05, + "loss": 0.4037, + "step": 977 + }, + { + "epoch": 0.4673555940505346, + "grad_norm": 0.6121952890654038, + "learning_rate": 1.939652584994794e-05, + "loss": 0.3928, + "step": 978 + }, + { + "epoch": 0.46783346275610777, + "grad_norm": 0.6519511555849895, + "learning_rate": 1.9394722646608332e-05, + "loss": 0.3984, + "step": 979 + }, + { + "epoch": 0.4683113314616809, + "grad_norm": 0.5807245005292517, + "learning_rate": 1.9392916837338376e-05, + "loss": 0.4097, + "step": 980 + }, + { + "epoch": 0.46878920016725406, + "grad_norm": 0.5679167625106275, + "learning_rate": 1.939110842263898e-05, + "loss": 0.3963, + "step": 981 + }, + { + "epoch": 0.4692670688728272, + "grad_norm": 0.6227411181855099, + "learning_rate": 1.9389297403011767e-05, + "loss": 0.4104, + "step": 982 + }, + { + "epoch": 0.46974493757840036, + "grad_norm": 0.5888976376216906, + "learning_rate": 1.9387483778959075e-05, + "loss": 0.3936, + "step": 983 + }, + { + "epoch": 0.4702228062839735, + "grad_norm": 1.09297261340797, + "learning_rate": 1.9385667550983974e-05, + "loss": 0.4219, + "step": 984 + }, + { + "epoch": 0.4707006749895466, + "grad_norm": 0.5838151733918109, + "learning_rate": 1.9383848719590257e-05, + "loss": 0.4059, + "step": 985 + }, + { + "epoch": 0.47117854369511974, + "grad_norm": 0.5684738684896777, + "learning_rate": 1.9382027285282437e-05, + "loss": 0.386, + "step": 986 + }, + { + "epoch": 0.4716564124006929, + "grad_norm": 0.5797945596171794, + "learning_rate": 1.9380203248565738e-05, + "loss": 0.406, + "step": 987 + }, + { + "epoch": 0.47213428110626604, + "grad_norm": 0.5735206647369371, + "learning_rate": 1.9378376609946126e-05, + "loss": 0.4109, + "step": 988 + }, + { + "epoch": 0.4726121498118392, + "grad_norm": 0.5591211466804421, + "learning_rate": 1.937654736993027e-05, + "loss": 0.4157, + "step": 989 + }, + { + "epoch": 0.47309001851741234, + "grad_norm": 0.5955292703363935, + "learning_rate": 1.9374715529025575e-05, + "loss": 0.3973, + "step": 990 + }, + { + "epoch": 0.4735678872229855, + "grad_norm": 0.967343371666392, + "learning_rate": 1.9372881087740162e-05, + "loss": 0.3997, + "step": 991 + }, + { + "epoch": 0.47404575592855863, + "grad_norm": 0.5801781650436966, + "learning_rate": 1.9371044046582867e-05, + "loss": 0.4082, + "step": 992 + }, + { + "epoch": 0.4745236246341318, + "grad_norm": 0.6145894286769649, + "learning_rate": 1.9369204406063257e-05, + "loss": 0.4004, + "step": 993 + }, + { + "epoch": 0.4750014933397049, + "grad_norm": 0.6330844181843237, + "learning_rate": 1.936736216669161e-05, + "loss": 0.3982, + "step": 994 + }, + { + "epoch": 0.4754793620452781, + "grad_norm": 0.5704791125251828, + "learning_rate": 1.9365517328978943e-05, + "loss": 0.4129, + "step": 995 + }, + { + "epoch": 0.4759572307508512, + "grad_norm": 0.5887093260184034, + "learning_rate": 1.936366989343697e-05, + "loss": 0.4071, + "step": 996 + }, + { + "epoch": 0.47643509945642437, + "grad_norm": 0.6293352936706972, + "learning_rate": 1.9361819860578143e-05, + "loss": 0.4105, + "step": 997 + }, + { + "epoch": 0.47691296816199746, + "grad_norm": 0.5368324375592416, + "learning_rate": 1.9359967230915622e-05, + "loss": 0.3939, + "step": 998 + }, + { + "epoch": 0.4773908368675706, + "grad_norm": 0.6637884556032649, + "learning_rate": 1.9358112004963304e-05, + "loss": 0.4003, + "step": 999 + }, + { + "epoch": 0.47786870557314376, + "grad_norm": 0.6269569640161438, + "learning_rate": 1.9356254183235785e-05, + "loss": 0.3977, + "step": 1000 + }, + { + "epoch": 0.4783465742787169, + "grad_norm": 0.5907094927745844, + "learning_rate": 1.93543937662484e-05, + "loss": 0.3888, + "step": 1001 + }, + { + "epoch": 0.47882444298429006, + "grad_norm": 0.6356082307658898, + "learning_rate": 1.935253075451719e-05, + "loss": 0.4009, + "step": 1002 + }, + { + "epoch": 0.4793023116898632, + "grad_norm": 0.598063943812148, + "learning_rate": 1.935066514855893e-05, + "loss": 0.3996, + "step": 1003 + }, + { + "epoch": 0.47978018039543635, + "grad_norm": 0.6878231195525895, + "learning_rate": 1.9348796948891094e-05, + "loss": 0.3917, + "step": 1004 + }, + { + "epoch": 0.4802580491010095, + "grad_norm": 0.5939172442059919, + "learning_rate": 1.93469261560319e-05, + "loss": 0.3907, + "step": 1005 + }, + { + "epoch": 0.48073591780658265, + "grad_norm": 0.7585300644554811, + "learning_rate": 1.9345052770500262e-05, + "loss": 0.3896, + "step": 1006 + }, + { + "epoch": 0.4812137865121558, + "grad_norm": 0.5717895671680949, + "learning_rate": 1.934317679281583e-05, + "loss": 0.4091, + "step": 1007 + }, + { + "epoch": 0.48169165521772894, + "grad_norm": 0.6208962871977249, + "learning_rate": 1.9341298223498973e-05, + "loss": 0.3867, + "step": 1008 + }, + { + "epoch": 0.4821695239233021, + "grad_norm": 0.66712603907002, + "learning_rate": 1.933941706307076e-05, + "loss": 0.3796, + "step": 1009 + }, + { + "epoch": 0.48264739262887524, + "grad_norm": 0.586042042111701, + "learning_rate": 1.9337533312053002e-05, + "loss": 0.4176, + "step": 1010 + }, + { + "epoch": 0.4831252613344484, + "grad_norm": 0.6269719997423575, + "learning_rate": 1.9335646970968214e-05, + "loss": 0.3991, + "step": 1011 + }, + { + "epoch": 0.4836031300400215, + "grad_norm": 0.55742694480261, + "learning_rate": 1.933375804033963e-05, + "loss": 0.3899, + "step": 1012 + }, + { + "epoch": 0.4840809987455946, + "grad_norm": 0.6669718179555711, + "learning_rate": 1.9331866520691214e-05, + "loss": 0.397, + "step": 1013 + }, + { + "epoch": 0.4845588674511678, + "grad_norm": 0.6257739428321304, + "learning_rate": 1.9329972412547637e-05, + "loss": 0.3954, + "step": 1014 + }, + { + "epoch": 0.4850367361567409, + "grad_norm": 0.6256701257508585, + "learning_rate": 1.9328075716434287e-05, + "loss": 0.3933, + "step": 1015 + }, + { + "epoch": 0.48551460486231407, + "grad_norm": 0.6720722566805102, + "learning_rate": 1.932617643287728e-05, + "loss": 0.3918, + "step": 1016 + }, + { + "epoch": 0.4859924735678872, + "grad_norm": 0.6950543974048602, + "learning_rate": 1.9324274562403446e-05, + "loss": 0.4021, + "step": 1017 + }, + { + "epoch": 0.48647034227346037, + "grad_norm": 0.6220688444979767, + "learning_rate": 1.9322370105540317e-05, + "loss": 0.4057, + "step": 1018 + }, + { + "epoch": 0.4869482109790335, + "grad_norm": 0.6165966442005212, + "learning_rate": 1.932046306281617e-05, + "loss": 0.3905, + "step": 1019 + }, + { + "epoch": 0.48742607968460666, + "grad_norm": 1.0211125046724039, + "learning_rate": 1.931855343475998e-05, + "loss": 0.4144, + "step": 1020 + }, + { + "epoch": 0.4879039483901798, + "grad_norm": 0.5733822946288798, + "learning_rate": 1.931664122190144e-05, + "loss": 0.3917, + "step": 1021 + }, + { + "epoch": 0.48838181709575296, + "grad_norm": 0.6130027608133938, + "learning_rate": 1.931472642477097e-05, + "loss": 0.413, + "step": 1022 + }, + { + "epoch": 0.4888596858013261, + "grad_norm": 0.6073794838231226, + "learning_rate": 1.93128090438997e-05, + "loss": 0.3926, + "step": 1023 + }, + { + "epoch": 0.48933755450689925, + "grad_norm": 0.6699051309493238, + "learning_rate": 1.9310889079819474e-05, + "loss": 0.3945, + "step": 1024 + }, + { + "epoch": 0.4898154232124724, + "grad_norm": 0.6983896244728203, + "learning_rate": 1.930896653306286e-05, + "loss": 0.4017, + "step": 1025 + }, + { + "epoch": 0.4902932919180455, + "grad_norm": 0.5963071167388689, + "learning_rate": 1.9307041404163135e-05, + "loss": 0.4059, + "step": 1026 + }, + { + "epoch": 0.49077116062361864, + "grad_norm": 0.6671617092464391, + "learning_rate": 1.93051136936543e-05, + "loss": 0.3939, + "step": 1027 + }, + { + "epoch": 0.4912490293291918, + "grad_norm": 0.6222549383683285, + "learning_rate": 1.9303183402071065e-05, + "loss": 0.4081, + "step": 1028 + }, + { + "epoch": 0.49172689803476494, + "grad_norm": 0.6266179396807892, + "learning_rate": 1.930125052994886e-05, + "loss": 0.4013, + "step": 1029 + }, + { + "epoch": 0.4922047667403381, + "grad_norm": 0.5868185951865348, + "learning_rate": 1.929931507782383e-05, + "loss": 0.3842, + "step": 1030 + }, + { + "epoch": 0.49268263544591123, + "grad_norm": 0.5854933919657626, + "learning_rate": 1.9297377046232833e-05, + "loss": 0.3961, + "step": 1031 + }, + { + "epoch": 0.4931605041514844, + "grad_norm": 0.5729556005202394, + "learning_rate": 1.9295436435713447e-05, + "loss": 0.3947, + "step": 1032 + }, + { + "epoch": 0.49363837285705753, + "grad_norm": 0.6238098390423348, + "learning_rate": 1.9293493246803962e-05, + "loss": 0.3816, + "step": 1033 + }, + { + "epoch": 0.4941162415626307, + "grad_norm": 0.5629873685546706, + "learning_rate": 1.9291547480043385e-05, + "loss": 0.4014, + "step": 1034 + }, + { + "epoch": 0.4945941102682038, + "grad_norm": 0.5648249030788314, + "learning_rate": 1.9289599135971437e-05, + "loss": 0.4241, + "step": 1035 + }, + { + "epoch": 0.49507197897377697, + "grad_norm": 0.6247491915063801, + "learning_rate": 1.9287648215128553e-05, + "loss": 0.4027, + "step": 1036 + }, + { + "epoch": 0.4955498476793501, + "grad_norm": 0.5182781354785916, + "learning_rate": 1.928569471805589e-05, + "loss": 0.3816, + "step": 1037 + }, + { + "epoch": 0.49602771638492327, + "grad_norm": 0.5842774907201587, + "learning_rate": 1.9283738645295304e-05, + "loss": 0.405, + "step": 1038 + }, + { + "epoch": 0.49650558509049636, + "grad_norm": 0.7006381584927244, + "learning_rate": 1.928177999738938e-05, + "loss": 0.3895, + "step": 1039 + }, + { + "epoch": 0.4969834537960695, + "grad_norm": 0.6100647649213121, + "learning_rate": 1.9279818774881418e-05, + "loss": 0.388, + "step": 1040 + }, + { + "epoch": 0.49746132250164266, + "grad_norm": 0.5485271123253042, + "learning_rate": 1.9277854978315415e-05, + "loss": 0.3972, + "step": 1041 + }, + { + "epoch": 0.4979391912072158, + "grad_norm": 0.5710883012225493, + "learning_rate": 1.9275888608236104e-05, + "loss": 0.4059, + "step": 1042 + }, + { + "epoch": 0.49841705991278895, + "grad_norm": 0.5630532049128263, + "learning_rate": 1.9273919665188913e-05, + "loss": 0.3959, + "step": 1043 + }, + { + "epoch": 0.4988949286183621, + "grad_norm": 0.5595415721679393, + "learning_rate": 1.9271948149719998e-05, + "loss": 0.4058, + "step": 1044 + }, + { + "epoch": 0.49937279732393525, + "grad_norm": 0.5890600892979772, + "learning_rate": 1.9269974062376224e-05, + "loss": 0.3893, + "step": 1045 + }, + { + "epoch": 0.4998506660295084, + "grad_norm": 0.5984090472815659, + "learning_rate": 1.926799740370516e-05, + "loss": 0.3982, + "step": 1046 + }, + { + "epoch": 0.5003285347350815, + "grad_norm": 0.6502431881696009, + "learning_rate": 1.92660181742551e-05, + "loss": 0.3988, + "step": 1047 + }, + { + "epoch": 0.5008064034406546, + "grad_norm": 0.5794688514925903, + "learning_rate": 1.926403637457505e-05, + "loss": 0.3978, + "step": 1048 + }, + { + "epoch": 0.5012842721462278, + "grad_norm": 0.5486583842322683, + "learning_rate": 1.926205200521473e-05, + "loss": 0.405, + "step": 1049 + }, + { + "epoch": 0.5017621408518009, + "grad_norm": 0.5964484082934959, + "learning_rate": 1.926006506672456e-05, + "loss": 0.4075, + "step": 1050 + }, + { + "epoch": 0.5022400095573741, + "grad_norm": 0.6015432115221289, + "learning_rate": 1.925807555965568e-05, + "loss": 0.4011, + "step": 1051 + }, + { + "epoch": 0.5027178782629472, + "grad_norm": 0.594525307065842, + "learning_rate": 1.9256083484559953e-05, + "loss": 0.3979, + "step": 1052 + }, + { + "epoch": 0.5031957469685204, + "grad_norm": 0.6660777792568852, + "learning_rate": 1.9254088841989943e-05, + "loss": 0.3944, + "step": 1053 + }, + { + "epoch": 0.5036736156740935, + "grad_norm": 0.589208348377751, + "learning_rate": 1.9252091632498926e-05, + "loss": 0.3863, + "step": 1054 + }, + { + "epoch": 0.5041514843796667, + "grad_norm": 0.635977217231809, + "learning_rate": 1.9250091856640895e-05, + "loss": 0.4113, + "step": 1055 + }, + { + "epoch": 0.5046293530852398, + "grad_norm": 0.6828960314162973, + "learning_rate": 1.9248089514970553e-05, + "loss": 0.3817, + "step": 1056 + }, + { + "epoch": 0.505107221790813, + "grad_norm": 0.5707103289605037, + "learning_rate": 1.924608460804331e-05, + "loss": 0.3892, + "step": 1057 + }, + { + "epoch": 0.5055850904963861, + "grad_norm": 0.5594636089450963, + "learning_rate": 1.9244077136415298e-05, + "loss": 0.3878, + "step": 1058 + }, + { + "epoch": 0.5060629592019593, + "grad_norm": 0.566732821838709, + "learning_rate": 1.924206710064335e-05, + "loss": 0.4012, + "step": 1059 + }, + { + "epoch": 0.5065408279075324, + "grad_norm": 0.5934091175624526, + "learning_rate": 1.9240054501285015e-05, + "loss": 0.3814, + "step": 1060 + }, + { + "epoch": 0.5070186966131055, + "grad_norm": 0.5563172693307783, + "learning_rate": 1.9238039338898554e-05, + "loss": 0.4011, + "step": 1061 + }, + { + "epoch": 0.5074965653186787, + "grad_norm": 0.5998159022961965, + "learning_rate": 1.9236021614042936e-05, + "loss": 0.3854, + "step": 1062 + }, + { + "epoch": 0.5079744340242518, + "grad_norm": 0.5778247983953256, + "learning_rate": 1.9234001327277842e-05, + "loss": 0.3855, + "step": 1063 + }, + { + "epoch": 0.508452302729825, + "grad_norm": 0.5761650392879268, + "learning_rate": 1.9231978479163666e-05, + "loss": 0.4016, + "step": 1064 + }, + { + "epoch": 0.5089301714353981, + "grad_norm": 0.5599768828396271, + "learning_rate": 1.922995307026151e-05, + "loss": 0.3971, + "step": 1065 + }, + { + "epoch": 0.5094080401409713, + "grad_norm": 0.8720261103107927, + "learning_rate": 1.9227925101133184e-05, + "loss": 0.4097, + "step": 1066 + }, + { + "epoch": 0.5098859088465444, + "grad_norm": 0.5924340309412149, + "learning_rate": 1.922589457234121e-05, + "loss": 0.3871, + "step": 1067 + }, + { + "epoch": 0.5103637775521176, + "grad_norm": 0.6004985435021857, + "learning_rate": 1.9223861484448833e-05, + "loss": 0.3951, + "step": 1068 + }, + { + "epoch": 0.5108416462576907, + "grad_norm": 0.6146759569604939, + "learning_rate": 1.922182583801998e-05, + "loss": 0.3922, + "step": 1069 + }, + { + "epoch": 0.5113195149632639, + "grad_norm": 0.5821995968641629, + "learning_rate": 1.921978763361931e-05, + "loss": 0.4043, + "step": 1070 + }, + { + "epoch": 0.511797383668837, + "grad_norm": 0.6337816029347101, + "learning_rate": 1.9217746871812184e-05, + "loss": 0.4012, + "step": 1071 + }, + { + "epoch": 0.5122752523744102, + "grad_norm": 0.5637983730800908, + "learning_rate": 1.9215703553164676e-05, + "loss": 0.4098, + "step": 1072 + }, + { + "epoch": 0.5127531210799833, + "grad_norm": 0.6159837626313385, + "learning_rate": 1.921365767824356e-05, + "loss": 0.402, + "step": 1073 + }, + { + "epoch": 0.5132309897855564, + "grad_norm": 0.6617255676511058, + "learning_rate": 1.9211609247616335e-05, + "loss": 0.4129, + "step": 1074 + }, + { + "epoch": 0.5137088584911296, + "grad_norm": 0.5934568209062026, + "learning_rate": 1.9209558261851194e-05, + "loss": 0.3833, + "step": 1075 + }, + { + "epoch": 0.5141867271967027, + "grad_norm": 0.6221409035389256, + "learning_rate": 1.9207504721517047e-05, + "loss": 0.3989, + "step": 1076 + }, + { + "epoch": 0.5146645959022759, + "grad_norm": 0.5989406482919697, + "learning_rate": 1.92054486271835e-05, + "loss": 0.3982, + "step": 1077 + }, + { + "epoch": 0.515142464607849, + "grad_norm": 0.6476320490359937, + "learning_rate": 1.9203389979420893e-05, + "loss": 0.3723, + "step": 1078 + }, + { + "epoch": 0.5156203333134222, + "grad_norm": 0.5853567195202979, + "learning_rate": 1.9201328778800247e-05, + "loss": 0.4077, + "step": 1079 + }, + { + "epoch": 0.5160982020189953, + "grad_norm": 0.5900737762201892, + "learning_rate": 1.919926502589331e-05, + "loss": 0.3866, + "step": 1080 + }, + { + "epoch": 0.5165760707245685, + "grad_norm": 0.6051998816584814, + "learning_rate": 1.9197198721272527e-05, + "loss": 0.4011, + "step": 1081 + }, + { + "epoch": 0.5170539394301416, + "grad_norm": 0.5901233973103801, + "learning_rate": 1.919512986551105e-05, + "loss": 0.3873, + "step": 1082 + }, + { + "epoch": 0.5175318081357148, + "grad_norm": 0.5723558837242273, + "learning_rate": 1.9193058459182747e-05, + "loss": 0.4008, + "step": 1083 + }, + { + "epoch": 0.5180096768412878, + "grad_norm": 0.586853172304618, + "learning_rate": 1.9190984502862196e-05, + "loss": 0.372, + "step": 1084 + }, + { + "epoch": 0.518487545546861, + "grad_norm": 0.5970057412271904, + "learning_rate": 1.9188907997124666e-05, + "loss": 0.4002, + "step": 1085 + }, + { + "epoch": 0.5189654142524341, + "grad_norm": 0.5890213702570649, + "learning_rate": 1.9186828942546148e-05, + "loss": 0.3988, + "step": 1086 + }, + { + "epoch": 0.5194432829580072, + "grad_norm": 0.5345329489947781, + "learning_rate": 1.9184747339703334e-05, + "loss": 0.3914, + "step": 1087 + }, + { + "epoch": 0.5199211516635804, + "grad_norm": 0.6791735135926099, + "learning_rate": 1.9182663189173625e-05, + "loss": 0.3988, + "step": 1088 + }, + { + "epoch": 0.5203990203691535, + "grad_norm": 0.5596329309086331, + "learning_rate": 1.9180576491535125e-05, + "loss": 0.3962, + "step": 1089 + }, + { + "epoch": 0.5208768890747267, + "grad_norm": 0.5628392832054393, + "learning_rate": 1.9178487247366652e-05, + "loss": 0.3747, + "step": 1090 + }, + { + "epoch": 0.5213547577802998, + "grad_norm": 0.569936759080529, + "learning_rate": 1.9176395457247722e-05, + "loss": 0.4074, + "step": 1091 + }, + { + "epoch": 0.521832626485873, + "grad_norm": 0.5528738761628226, + "learning_rate": 1.917430112175856e-05, + "loss": 0.3992, + "step": 1092 + }, + { + "epoch": 0.5223104951914461, + "grad_norm": 0.6109126673803167, + "learning_rate": 1.9172204241480098e-05, + "loss": 0.4046, + "step": 1093 + }, + { + "epoch": 0.5227883638970193, + "grad_norm": 0.5513494264708154, + "learning_rate": 1.9170104816993973e-05, + "loss": 0.388, + "step": 1094 + }, + { + "epoch": 0.5232662326025924, + "grad_norm": 0.5523842737345893, + "learning_rate": 1.916800284888253e-05, + "loss": 0.3811, + "step": 1095 + }, + { + "epoch": 0.5237441013081656, + "grad_norm": 0.5443431872996419, + "learning_rate": 1.9165898337728818e-05, + "loss": 0.4142, + "step": 1096 + }, + { + "epoch": 0.5242219700137387, + "grad_norm": 0.5903680509886946, + "learning_rate": 1.9163791284116592e-05, + "loss": 0.3954, + "step": 1097 + }, + { + "epoch": 0.5246998387193119, + "grad_norm": 0.7018103685542808, + "learning_rate": 1.916168168863031e-05, + "loss": 0.4088, + "step": 1098 + }, + { + "epoch": 0.525177707424885, + "grad_norm": 0.5915389193430727, + "learning_rate": 1.915956955185514e-05, + "loss": 0.4055, + "step": 1099 + }, + { + "epoch": 0.5256555761304581, + "grad_norm": 0.6388748207474322, + "learning_rate": 1.915745487437694e-05, + "loss": 0.4055, + "step": 1100 + }, + { + "epoch": 0.5261334448360313, + "grad_norm": 0.5374359808793951, + "learning_rate": 1.91553376567823e-05, + "loss": 0.4095, + "step": 1101 + }, + { + "epoch": 0.5266113135416044, + "grad_norm": 0.6253661283286657, + "learning_rate": 1.915321789965849e-05, + "loss": 0.3993, + "step": 1102 + }, + { + "epoch": 0.5270891822471776, + "grad_norm": 0.6007927062737137, + "learning_rate": 1.915109560359349e-05, + "loss": 0.3789, + "step": 1103 + }, + { + "epoch": 0.5275670509527507, + "grad_norm": 0.5773114426599819, + "learning_rate": 1.9148970769176e-05, + "loss": 0.4031, + "step": 1104 + }, + { + "epoch": 0.5280449196583239, + "grad_norm": 1.5644927863876732, + "learning_rate": 1.9146843396995396e-05, + "loss": 0.3807, + "step": 1105 + }, + { + "epoch": 0.528522788363897, + "grad_norm": 0.7068271199032458, + "learning_rate": 1.9144713487641786e-05, + "loss": 0.404, + "step": 1106 + }, + { + "epoch": 0.5290006570694702, + "grad_norm": 0.5842159117368475, + "learning_rate": 1.9142581041705964e-05, + "loss": 0.3733, + "step": 1107 + }, + { + "epoch": 0.5294785257750433, + "grad_norm": 0.629512416750542, + "learning_rate": 1.914044605977943e-05, + "loss": 0.4029, + "step": 1108 + }, + { + "epoch": 0.5299563944806165, + "grad_norm": 0.5591127726288794, + "learning_rate": 1.91383085424544e-05, + "loss": 0.3975, + "step": 1109 + }, + { + "epoch": 0.5304342631861896, + "grad_norm": 0.5508233170263688, + "learning_rate": 1.9136168490323772e-05, + "loss": 0.3975, + "step": 1110 + }, + { + "epoch": 0.5309121318917628, + "grad_norm": 0.6177805307841441, + "learning_rate": 1.9134025903981163e-05, + "loss": 0.3835, + "step": 1111 + }, + { + "epoch": 0.5313900005973359, + "grad_norm": 0.5833143529773639, + "learning_rate": 1.9131880784020893e-05, + "loss": 0.4139, + "step": 1112 + }, + { + "epoch": 0.5318678693029091, + "grad_norm": 0.5967595237608566, + "learning_rate": 1.9129733131037977e-05, + "loss": 0.4106, + "step": 1113 + }, + { + "epoch": 0.5323457380084822, + "grad_norm": 0.6172221950353266, + "learning_rate": 1.9127582945628135e-05, + "loss": 0.3902, + "step": 1114 + }, + { + "epoch": 0.5328236067140553, + "grad_norm": 0.5682174098963437, + "learning_rate": 1.9125430228387794e-05, + "loss": 0.3874, + "step": 1115 + }, + { + "epoch": 0.5333014754196285, + "grad_norm": 0.6193518812671628, + "learning_rate": 1.9123274979914076e-05, + "loss": 0.4007, + "step": 1116 + }, + { + "epoch": 0.5337793441252016, + "grad_norm": 0.604016269024644, + "learning_rate": 1.9121117200804812e-05, + "loss": 0.3932, + "step": 1117 + }, + { + "epoch": 0.5342572128307748, + "grad_norm": 0.6209761683524013, + "learning_rate": 1.9118956891658526e-05, + "loss": 0.3821, + "step": 1118 + }, + { + "epoch": 0.5347350815363479, + "grad_norm": 0.6931704696111478, + "learning_rate": 1.9116794053074458e-05, + "loss": 0.4071, + "step": 1119 + }, + { + "epoch": 0.5352129502419211, + "grad_norm": 0.632080163475699, + "learning_rate": 1.9114628685652535e-05, + "loss": 0.3839, + "step": 1120 + }, + { + "epoch": 0.5356908189474942, + "grad_norm": 0.5748984989211147, + "learning_rate": 1.9112460789993394e-05, + "loss": 0.3929, + "step": 1121 + }, + { + "epoch": 0.5361686876530674, + "grad_norm": 0.5635604927408805, + "learning_rate": 1.9110290366698373e-05, + "loss": 0.4027, + "step": 1122 + }, + { + "epoch": 0.5366465563586404, + "grad_norm": 0.7558151194826072, + "learning_rate": 1.9108117416369502e-05, + "loss": 0.3986, + "step": 1123 + }, + { + "epoch": 0.5371244250642137, + "grad_norm": 0.5980206275007937, + "learning_rate": 1.910594193960953e-05, + "loss": 0.4006, + "step": 1124 + }, + { + "epoch": 0.5376022937697867, + "grad_norm": 0.6780188301974273, + "learning_rate": 1.9103763937021887e-05, + "loss": 0.3826, + "step": 1125 + }, + { + "epoch": 0.53808016247536, + "grad_norm": 0.5790597345181424, + "learning_rate": 1.9101583409210714e-05, + "loss": 0.3835, + "step": 1126 + }, + { + "epoch": 0.538558031180933, + "grad_norm": 0.6487135859049837, + "learning_rate": 1.9099400356780857e-05, + "loss": 0.3995, + "step": 1127 + }, + { + "epoch": 0.5390358998865061, + "grad_norm": 0.5780178648506041, + "learning_rate": 1.909721478033785e-05, + "loss": 0.3925, + "step": 1128 + }, + { + "epoch": 0.5395137685920793, + "grad_norm": 0.642292787021177, + "learning_rate": 1.909502668048793e-05, + "loss": 0.4016, + "step": 1129 + }, + { + "epoch": 0.5399916372976524, + "grad_norm": 0.5939162000913498, + "learning_rate": 1.909283605783805e-05, + "loss": 0.4057, + "step": 1130 + }, + { + "epoch": 0.5404695060032256, + "grad_norm": 0.6390737833949265, + "learning_rate": 1.9090642912995836e-05, + "loss": 0.3997, + "step": 1131 + }, + { + "epoch": 0.5409473747087987, + "grad_norm": 0.5629045570791384, + "learning_rate": 1.9088447246569638e-05, + "loss": 0.3948, + "step": 1132 + }, + { + "epoch": 0.5414252434143719, + "grad_norm": 0.7058159449700776, + "learning_rate": 1.908624905916849e-05, + "loss": 0.4006, + "step": 1133 + }, + { + "epoch": 0.541903112119945, + "grad_norm": 0.5821061957075071, + "learning_rate": 1.9084048351402135e-05, + "loss": 0.4029, + "step": 1134 + }, + { + "epoch": 0.5423809808255182, + "grad_norm": 0.5745676634359547, + "learning_rate": 1.9081845123881002e-05, + "loss": 0.4067, + "step": 1135 + }, + { + "epoch": 0.5428588495310913, + "grad_norm": 0.5799046046456168, + "learning_rate": 1.9079639377216236e-05, + "loss": 0.3895, + "step": 1136 + }, + { + "epoch": 0.5433367182366645, + "grad_norm": 0.5423826236018965, + "learning_rate": 1.9077431112019666e-05, + "loss": 0.392, + "step": 1137 + }, + { + "epoch": 0.5438145869422376, + "grad_norm": 0.5614547514964748, + "learning_rate": 1.9075220328903833e-05, + "loss": 0.419, + "step": 1138 + }, + { + "epoch": 0.5442924556478108, + "grad_norm": 0.7294948459972235, + "learning_rate": 1.907300702848196e-05, + "loss": 0.3725, + "step": 1139 + }, + { + "epoch": 0.5447703243533839, + "grad_norm": 0.7211253855583865, + "learning_rate": 1.9070791211367984e-05, + "loss": 0.3958, + "step": 1140 + }, + { + "epoch": 0.545248193058957, + "grad_norm": 0.7264564481802851, + "learning_rate": 1.9068572878176535e-05, + "loss": 0.3969, + "step": 1141 + }, + { + "epoch": 0.5457260617645302, + "grad_norm": 0.5947660314640929, + "learning_rate": 1.9066352029522933e-05, + "loss": 0.4076, + "step": 1142 + }, + { + "epoch": 0.5462039304701033, + "grad_norm": 0.6646952552501058, + "learning_rate": 1.906412866602321e-05, + "loss": 0.4096, + "step": 1143 + }, + { + "epoch": 0.5466817991756765, + "grad_norm": 0.6440292968735994, + "learning_rate": 1.906190278829408e-05, + "loss": 0.3947, + "step": 1144 + }, + { + "epoch": 0.5471596678812496, + "grad_norm": 0.6531777155361128, + "learning_rate": 1.9059674396952963e-05, + "loss": 0.3921, + "step": 1145 + }, + { + "epoch": 0.5476375365868228, + "grad_norm": 0.6263069312765026, + "learning_rate": 1.9057443492617983e-05, + "loss": 0.3908, + "step": 1146 + }, + { + "epoch": 0.5481154052923959, + "grad_norm": 0.6009448091108315, + "learning_rate": 1.9055210075907946e-05, + "loss": 0.3872, + "step": 1147 + }, + { + "epoch": 0.5485932739979691, + "grad_norm": 0.7151497490060705, + "learning_rate": 1.9052974147442365e-05, + "loss": 0.4047, + "step": 1148 + }, + { + "epoch": 0.5490711427035422, + "grad_norm": 0.6210354172889246, + "learning_rate": 1.905073570784145e-05, + "loss": 0.3875, + "step": 1149 + }, + { + "epoch": 0.5495490114091154, + "grad_norm": 0.6071292868798474, + "learning_rate": 1.90484947577261e-05, + "loss": 0.3786, + "step": 1150 + }, + { + "epoch": 0.5500268801146885, + "grad_norm": 0.5393751375916709, + "learning_rate": 1.9046251297717915e-05, + "loss": 0.3815, + "step": 1151 + }, + { + "epoch": 0.5505047488202617, + "grad_norm": 0.549917579146435, + "learning_rate": 1.9044005328439197e-05, + "loss": 0.3955, + "step": 1152 + }, + { + "epoch": 0.5509826175258348, + "grad_norm": 0.7395520846508221, + "learning_rate": 1.9041756850512932e-05, + "loss": 0.4056, + "step": 1153 + }, + { + "epoch": 0.551460486231408, + "grad_norm": 0.5862288750441804, + "learning_rate": 1.9039505864562812e-05, + "loss": 0.3972, + "step": 1154 + }, + { + "epoch": 0.5519383549369811, + "grad_norm": 0.5831709652721829, + "learning_rate": 1.903725237121322e-05, + "loss": 0.3919, + "step": 1155 + }, + { + "epoch": 0.5524162236425542, + "grad_norm": 0.5916849238602208, + "learning_rate": 1.9034996371089233e-05, + "loss": 0.3856, + "step": 1156 + }, + { + "epoch": 0.5528940923481274, + "grad_norm": 0.6411973294376119, + "learning_rate": 1.9032737864816627e-05, + "loss": 0.4067, + "step": 1157 + }, + { + "epoch": 0.5533719610537005, + "grad_norm": 0.5832416797946984, + "learning_rate": 1.9030476853021875e-05, + "loss": 0.389, + "step": 1158 + }, + { + "epoch": 0.5538498297592737, + "grad_norm": 0.622158136439798, + "learning_rate": 1.9028213336332135e-05, + "loss": 0.3893, + "step": 1159 + }, + { + "epoch": 0.5543276984648468, + "grad_norm": 0.5762766222613097, + "learning_rate": 1.902594731537527e-05, + "loss": 0.394, + "step": 1160 + }, + { + "epoch": 0.55480556717042, + "grad_norm": 0.550291838034785, + "learning_rate": 1.9023678790779838e-05, + "loss": 0.3888, + "step": 1161 + }, + { + "epoch": 0.555283435875993, + "grad_norm": 0.5662654245608815, + "learning_rate": 1.9021407763175083e-05, + "loss": 0.3926, + "step": 1162 + }, + { + "epoch": 0.5557613045815663, + "grad_norm": 0.6334933581251789, + "learning_rate": 1.901913423319095e-05, + "loss": 0.4011, + "step": 1163 + }, + { + "epoch": 0.5562391732871393, + "grad_norm": 0.5575895045734839, + "learning_rate": 1.9016858201458075e-05, + "loss": 0.3972, + "step": 1164 + }, + { + "epoch": 0.5567170419927125, + "grad_norm": 0.5966619103161227, + "learning_rate": 1.901457966860779e-05, + "loss": 0.3824, + "step": 1165 + }, + { + "epoch": 0.5571949106982856, + "grad_norm": 0.5870555991442512, + "learning_rate": 1.9012298635272117e-05, + "loss": 0.4003, + "step": 1166 + }, + { + "epoch": 0.5576727794038588, + "grad_norm": 0.569548018108478, + "learning_rate": 1.9010015102083778e-05, + "loss": 0.3992, + "step": 1167 + }, + { + "epoch": 0.5581506481094319, + "grad_norm": 0.5537323419722724, + "learning_rate": 1.9007729069676185e-05, + "loss": 0.4126, + "step": 1168 + }, + { + "epoch": 0.558628516815005, + "grad_norm": 0.6214624827198821, + "learning_rate": 1.9005440538683443e-05, + "loss": 0.4034, + "step": 1169 + }, + { + "epoch": 0.5591063855205782, + "grad_norm": 0.5794512137484054, + "learning_rate": 1.9003149509740347e-05, + "loss": 0.4042, + "step": 1170 + }, + { + "epoch": 0.5595842542261513, + "grad_norm": 0.6471077379542854, + "learning_rate": 1.9000855983482386e-05, + "loss": 0.3955, + "step": 1171 + }, + { + "epoch": 0.5600621229317245, + "grad_norm": 0.7713823924796754, + "learning_rate": 1.8998559960545753e-05, + "loss": 0.3894, + "step": 1172 + }, + { + "epoch": 0.5605399916372976, + "grad_norm": 0.669563924210895, + "learning_rate": 1.8996261441567318e-05, + "loss": 0.4061, + "step": 1173 + }, + { + "epoch": 0.5610178603428708, + "grad_norm": 0.6358401107367022, + "learning_rate": 1.8993960427184647e-05, + "loss": 0.4049, + "step": 1174 + }, + { + "epoch": 0.5614957290484439, + "grad_norm": 0.632834423434781, + "learning_rate": 1.899165691803601e-05, + "loss": 0.3974, + "step": 1175 + }, + { + "epoch": 0.5619735977540171, + "grad_norm": 0.7647633906590625, + "learning_rate": 1.8989350914760348e-05, + "loss": 0.3967, + "step": 1176 + }, + { + "epoch": 0.5624514664595902, + "grad_norm": 0.6692881107694713, + "learning_rate": 1.8987042417997313e-05, + "loss": 0.4027, + "step": 1177 + }, + { + "epoch": 0.5629293351651634, + "grad_norm": 0.831620122043209, + "learning_rate": 1.898473142838724e-05, + "loss": 0.4012, + "step": 1178 + }, + { + "epoch": 0.5634072038707365, + "grad_norm": 1.1686193693749636, + "learning_rate": 1.898241794657116e-05, + "loss": 0.405, + "step": 1179 + }, + { + "epoch": 0.5638850725763097, + "grad_norm": 0.6351278759311869, + "learning_rate": 1.8980101973190787e-05, + "loss": 0.3754, + "step": 1180 + }, + { + "epoch": 0.5643629412818828, + "grad_norm": 0.5873130279029877, + "learning_rate": 1.8977783508888535e-05, + "loss": 0.3965, + "step": 1181 + }, + { + "epoch": 0.5648408099874559, + "grad_norm": 0.5989794788052755, + "learning_rate": 1.89754625543075e-05, + "loss": 0.399, + "step": 1182 + }, + { + "epoch": 0.5653186786930291, + "grad_norm": 0.5698375975862137, + "learning_rate": 1.8973139110091477e-05, + "loss": 0.4015, + "step": 1183 + }, + { + "epoch": 0.5657965473986022, + "grad_norm": 0.6737645692540567, + "learning_rate": 1.897081317688495e-05, + "loss": 0.4223, + "step": 1184 + }, + { + "epoch": 0.5662744161041754, + "grad_norm": 0.593327748620439, + "learning_rate": 1.896848475533309e-05, + "loss": 0.4066, + "step": 1185 + }, + { + "epoch": 0.5667522848097485, + "grad_norm": 0.5883337985827269, + "learning_rate": 1.896615384608176e-05, + "loss": 0.3736, + "step": 1186 + }, + { + "epoch": 0.5672301535153217, + "grad_norm": 0.9321510339697601, + "learning_rate": 1.896382044977751e-05, + "loss": 0.4119, + "step": 1187 + }, + { + "epoch": 0.5677080222208948, + "grad_norm": 0.5793456890053796, + "learning_rate": 1.896148456706759e-05, + "loss": 0.3928, + "step": 1188 + }, + { + "epoch": 0.568185890926468, + "grad_norm": 0.8624241078357229, + "learning_rate": 1.8959146198599928e-05, + "loss": 0.4138, + "step": 1189 + }, + { + "epoch": 0.5686637596320411, + "grad_norm": 0.6540030367738001, + "learning_rate": 1.8956805345023145e-05, + "loss": 0.3981, + "step": 1190 + }, + { + "epoch": 0.5691416283376143, + "grad_norm": 0.6327764655932094, + "learning_rate": 1.8954462006986557e-05, + "loss": 0.4082, + "step": 1191 + }, + { + "epoch": 0.5696194970431874, + "grad_norm": 0.6018338777252428, + "learning_rate": 1.8952116185140164e-05, + "loss": 0.3884, + "step": 1192 + }, + { + "epoch": 0.5700973657487606, + "grad_norm": 0.5645211426190209, + "learning_rate": 1.8949767880134652e-05, + "loss": 0.4008, + "step": 1193 + }, + { + "epoch": 0.5705752344543337, + "grad_norm": 0.62032674267378, + "learning_rate": 1.89474170926214e-05, + "loss": 0.387, + "step": 1194 + }, + { + "epoch": 0.5710531031599068, + "grad_norm": 0.5331504995268993, + "learning_rate": 1.894506382325248e-05, + "loss": 0.3944, + "step": 1195 + }, + { + "epoch": 0.57153097186548, + "grad_norm": 0.5571879177321089, + "learning_rate": 1.8942708072680637e-05, + "loss": 0.3873, + "step": 1196 + }, + { + "epoch": 0.5720088405710531, + "grad_norm": 0.5434082219860525, + "learning_rate": 1.8940349841559325e-05, + "loss": 0.3819, + "step": 1197 + }, + { + "epoch": 0.5724867092766263, + "grad_norm": 0.5556021619843906, + "learning_rate": 1.8937989130542672e-05, + "loss": 0.3819, + "step": 1198 + }, + { + "epoch": 0.5729645779821994, + "grad_norm": 0.6319621645757422, + "learning_rate": 1.8935625940285502e-05, + "loss": 0.3921, + "step": 1199 + }, + { + "epoch": 0.5734424466877726, + "grad_norm": 0.6282911906906646, + "learning_rate": 1.8933260271443313e-05, + "loss": 0.4087, + "step": 1200 + }, + { + "epoch": 0.5739203153933456, + "grad_norm": 0.5581191438877338, + "learning_rate": 1.8930892124672303e-05, + "loss": 0.3932, + "step": 1201 + }, + { + "epoch": 0.5743981840989189, + "grad_norm": 0.5463800521790595, + "learning_rate": 1.892852150062936e-05, + "loss": 0.394, + "step": 1202 + }, + { + "epoch": 0.5748760528044919, + "grad_norm": 0.5931390296947407, + "learning_rate": 1.8926148399972047e-05, + "loss": 0.3773, + "step": 1203 + }, + { + "epoch": 0.5753539215100651, + "grad_norm": 0.5452845931438769, + "learning_rate": 1.8923772823358624e-05, + "loss": 0.3901, + "step": 1204 + }, + { + "epoch": 0.5758317902156382, + "grad_norm": 0.5840615558705706, + "learning_rate": 1.8921394771448032e-05, + "loss": 0.4034, + "step": 1205 + }, + { + "epoch": 0.5763096589212114, + "grad_norm": 0.6291436076221341, + "learning_rate": 1.89190142448999e-05, + "loss": 0.3905, + "step": 1206 + }, + { + "epoch": 0.5767875276267845, + "grad_norm": 0.5327483631263784, + "learning_rate": 1.8916631244374548e-05, + "loss": 0.3999, + "step": 1207 + }, + { + "epoch": 0.5772653963323577, + "grad_norm": 0.6494525583362151, + "learning_rate": 1.891424577053297e-05, + "loss": 0.3893, + "step": 1208 + }, + { + "epoch": 0.5777432650379308, + "grad_norm": 0.5510447847761317, + "learning_rate": 1.8911857824036863e-05, + "loss": 0.3835, + "step": 1209 + }, + { + "epoch": 0.5782211337435039, + "grad_norm": 0.6227648730898245, + "learning_rate": 1.89094674055486e-05, + "loss": 0.3999, + "step": 1210 + }, + { + "epoch": 0.5786990024490771, + "grad_norm": 0.5541558527671858, + "learning_rate": 1.8907074515731236e-05, + "loss": 0.3994, + "step": 1211 + }, + { + "epoch": 0.5791768711546502, + "grad_norm": 0.5676146792664295, + "learning_rate": 1.8904679155248523e-05, + "loss": 0.3874, + "step": 1212 + }, + { + "epoch": 0.5796547398602234, + "grad_norm": 0.7499627866819074, + "learning_rate": 1.890228132476488e-05, + "loss": 0.3683, + "step": 1213 + }, + { + "epoch": 0.5801326085657965, + "grad_norm": 0.5526639952242319, + "learning_rate": 1.889988102494544e-05, + "loss": 0.3977, + "step": 1214 + }, + { + "epoch": 0.5806104772713697, + "grad_norm": 0.6492832611355143, + "learning_rate": 1.889747825645599e-05, + "loss": 0.3861, + "step": 1215 + }, + { + "epoch": 0.5810883459769428, + "grad_norm": 0.5411532191238587, + "learning_rate": 1.8895073019963022e-05, + "loss": 0.3934, + "step": 1216 + }, + { + "epoch": 0.581566214682516, + "grad_norm": 0.6698750283375203, + "learning_rate": 1.8892665316133706e-05, + "loss": 0.4089, + "step": 1217 + }, + { + "epoch": 0.5820440833880891, + "grad_norm": 0.604672289766661, + "learning_rate": 1.8890255145635895e-05, + "loss": 0.3985, + "step": 1218 + }, + { + "epoch": 0.5825219520936623, + "grad_norm": 0.5809198566541879, + "learning_rate": 1.888784250913813e-05, + "loss": 0.4029, + "step": 1219 + }, + { + "epoch": 0.5829998207992354, + "grad_norm": 0.5575695641227716, + "learning_rate": 1.8885427407309627e-05, + "loss": 0.3918, + "step": 1220 + }, + { + "epoch": 0.5834776895048086, + "grad_norm": 0.6012108926363768, + "learning_rate": 1.88830098408203e-05, + "loss": 0.3864, + "step": 1221 + }, + { + "epoch": 0.5839555582103817, + "grad_norm": 0.5621575791142275, + "learning_rate": 1.8880589810340734e-05, + "loss": 0.38, + "step": 1222 + }, + { + "epoch": 0.5844334269159548, + "grad_norm": 0.5831322537081107, + "learning_rate": 1.8878167316542207e-05, + "loss": 0.3894, + "step": 1223 + }, + { + "epoch": 0.584911295621528, + "grad_norm": 0.5871042303485483, + "learning_rate": 1.8875742360096675e-05, + "loss": 0.3896, + "step": 1224 + }, + { + "epoch": 0.5853891643271011, + "grad_norm": 0.5564242439821103, + "learning_rate": 1.887331494167678e-05, + "loss": 0.3933, + "step": 1225 + }, + { + "epoch": 0.5858670330326743, + "grad_norm": 0.5819982187143664, + "learning_rate": 1.887088506195584e-05, + "loss": 0.4114, + "step": 1226 + }, + { + "epoch": 0.5863449017382474, + "grad_norm": 0.5090058539216136, + "learning_rate": 1.8868452721607865e-05, + "loss": 0.3962, + "step": 1227 + }, + { + "epoch": 0.5868227704438206, + "grad_norm": 0.5875412543113175, + "learning_rate": 1.8866017921307544e-05, + "loss": 0.4056, + "step": 1228 + }, + { + "epoch": 0.5873006391493937, + "grad_norm": 0.6166139308626585, + "learning_rate": 1.886358066173024e-05, + "loss": 0.3882, + "step": 1229 + }, + { + "epoch": 0.5877785078549669, + "grad_norm": 0.5888127004124792, + "learning_rate": 1.8861140943552014e-05, + "loss": 0.3912, + "step": 1230 + }, + { + "epoch": 0.58825637656054, + "grad_norm": 0.6162981466724767, + "learning_rate": 1.8858698767449598e-05, + "loss": 0.3937, + "step": 1231 + }, + { + "epoch": 0.5887342452661132, + "grad_norm": 0.6715967270841622, + "learning_rate": 1.8856254134100408e-05, + "loss": 0.401, + "step": 1232 + }, + { + "epoch": 0.5892121139716863, + "grad_norm": 0.5366020380685378, + "learning_rate": 1.8853807044182544e-05, + "loss": 0.3875, + "step": 1233 + }, + { + "epoch": 0.5896899826772595, + "grad_norm": 0.591930559738908, + "learning_rate": 1.8851357498374785e-05, + "loss": 0.4082, + "step": 1234 + }, + { + "epoch": 0.5901678513828326, + "grad_norm": 0.5911447561891597, + "learning_rate": 1.884890549735659e-05, + "loss": 0.3904, + "step": 1235 + }, + { + "epoch": 0.5906457200884057, + "grad_norm": 0.5226862065805489, + "learning_rate": 1.8846451041808102e-05, + "loss": 0.3937, + "step": 1236 + }, + { + "epoch": 0.5911235887939789, + "grad_norm": 0.6596621571985957, + "learning_rate": 1.8843994132410143e-05, + "loss": 0.403, + "step": 1237 + }, + { + "epoch": 0.591601457499552, + "grad_norm": 0.5701923237556955, + "learning_rate": 1.884153476984422e-05, + "loss": 0.4008, + "step": 1238 + }, + { + "epoch": 0.5920793262051252, + "grad_norm": 0.5930242333237962, + "learning_rate": 1.883907295479251e-05, + "loss": 0.3795, + "step": 1239 + }, + { + "epoch": 0.5925571949106982, + "grad_norm": 0.7314294978915366, + "learning_rate": 1.8836608687937883e-05, + "loss": 0.3928, + "step": 1240 + }, + { + "epoch": 0.5930350636162715, + "grad_norm": 0.5401549968229234, + "learning_rate": 1.883414196996388e-05, + "loss": 0.3828, + "step": 1241 + }, + { + "epoch": 0.5935129323218445, + "grad_norm": 0.5649220265715015, + "learning_rate": 1.8831672801554726e-05, + "loss": 0.4082, + "step": 1242 + }, + { + "epoch": 0.5939908010274177, + "grad_norm": 0.5284937507489004, + "learning_rate": 1.882920118339533e-05, + "loss": 0.3894, + "step": 1243 + }, + { + "epoch": 0.5944686697329908, + "grad_norm": 0.5645488013407426, + "learning_rate": 1.8826727116171264e-05, + "loss": 0.3931, + "step": 1244 + }, + { + "epoch": 0.594946538438564, + "grad_norm": 0.5376862100448556, + "learning_rate": 1.8824250600568798e-05, + "loss": 0.3859, + "step": 1245 + }, + { + "epoch": 0.5954244071441371, + "grad_norm": 0.601826503242195, + "learning_rate": 1.8821771637274874e-05, + "loss": 0.3896, + "step": 1246 + }, + { + "epoch": 0.5959022758497103, + "grad_norm": 0.547959448752086, + "learning_rate": 1.881929022697711e-05, + "loss": 0.3716, + "step": 1247 + }, + { + "epoch": 0.5963801445552834, + "grad_norm": 0.5519251178713137, + "learning_rate": 1.881680637036381e-05, + "loss": 0.4019, + "step": 1248 + }, + { + "epoch": 0.5968580132608566, + "grad_norm": 0.5714630327545864, + "learning_rate": 1.8814320068123945e-05, + "loss": 0.3937, + "step": 1249 + }, + { + "epoch": 0.5973358819664297, + "grad_norm": 0.5470336630560245, + "learning_rate": 1.8811831320947177e-05, + "loss": 0.4043, + "step": 1250 + }, + { + "epoch": 0.5978137506720028, + "grad_norm": 0.5569332027496797, + "learning_rate": 1.8809340129523835e-05, + "loss": 0.3718, + "step": 1251 + }, + { + "epoch": 0.598291619377576, + "grad_norm": 0.5598645339474388, + "learning_rate": 1.880684649454494e-05, + "loss": 0.4028, + "step": 1252 + }, + { + "epoch": 0.5987694880831491, + "grad_norm": 0.5364480060048303, + "learning_rate": 1.8804350416702174e-05, + "loss": 0.3782, + "step": 1253 + }, + { + "epoch": 0.5992473567887223, + "grad_norm": 0.5873593424397728, + "learning_rate": 1.880185189668791e-05, + "loss": 0.392, + "step": 1254 + }, + { + "epoch": 0.5997252254942954, + "grad_norm": 0.5235640424479179, + "learning_rate": 1.879935093519519e-05, + "loss": 0.3805, + "step": 1255 + }, + { + "epoch": 0.6002030941998686, + "grad_norm": 0.620909525268386, + "learning_rate": 1.8796847532917743e-05, + "loss": 0.4068, + "step": 1256 + }, + { + "epoch": 0.6006809629054417, + "grad_norm": 0.5489235045804418, + "learning_rate": 1.879434169054996e-05, + "loss": 0.3848, + "step": 1257 + }, + { + "epoch": 0.6011588316110149, + "grad_norm": 0.5347491918019722, + "learning_rate": 1.8791833408786922e-05, + "loss": 0.3819, + "step": 1258 + }, + { + "epoch": 0.601636700316588, + "grad_norm": 0.5915201608837969, + "learning_rate": 1.878932268832438e-05, + "loss": 0.3951, + "step": 1259 + }, + { + "epoch": 0.6021145690221612, + "grad_norm": 0.5356326976446867, + "learning_rate": 1.878680952985877e-05, + "loss": 0.3905, + "step": 1260 + }, + { + "epoch": 0.6025924377277343, + "grad_norm": 0.6181767591406878, + "learning_rate": 1.878429393408719e-05, + "loss": 0.4024, + "step": 1261 + }, + { + "epoch": 0.6030703064333075, + "grad_norm": 0.5632469969713791, + "learning_rate": 1.8781775901707425e-05, + "loss": 0.3937, + "step": 1262 + }, + { + "epoch": 0.6035481751388806, + "grad_norm": 0.5877175455362745, + "learning_rate": 1.8779255433417935e-05, + "loss": 0.3943, + "step": 1263 + }, + { + "epoch": 0.6040260438444537, + "grad_norm": 0.5503903372236624, + "learning_rate": 1.8776732529917846e-05, + "loss": 0.3984, + "step": 1264 + }, + { + "epoch": 0.6045039125500269, + "grad_norm": 0.5649581602924505, + "learning_rate": 1.8774207191906976e-05, + "loss": 0.4017, + "step": 1265 + }, + { + "epoch": 0.6049817812556, + "grad_norm": 0.563853155134363, + "learning_rate": 1.8771679420085805e-05, + "loss": 0.3891, + "step": 1266 + }, + { + "epoch": 0.6054596499611732, + "grad_norm": 0.5651839005098936, + "learning_rate": 1.8769149215155497e-05, + "loss": 0.3913, + "step": 1267 + }, + { + "epoch": 0.6059375186667463, + "grad_norm": 0.5963210739445259, + "learning_rate": 1.8766616577817875e-05, + "loss": 0.4133, + "step": 1268 + }, + { + "epoch": 0.6064153873723195, + "grad_norm": 0.55539068455758, + "learning_rate": 1.876408150877546e-05, + "loss": 0.3797, + "step": 1269 + }, + { + "epoch": 0.6068932560778926, + "grad_norm": 0.5856042654690761, + "learning_rate": 1.8761544008731426e-05, + "loss": 0.4058, + "step": 1270 + }, + { + "epoch": 0.6073711247834658, + "grad_norm": 0.542197554269779, + "learning_rate": 1.8759004078389635e-05, + "loss": 0.39, + "step": 1271 + }, + { + "epoch": 0.6078489934890389, + "grad_norm": 0.5353927253355668, + "learning_rate": 1.8756461718454622e-05, + "loss": 0.4147, + "step": 1272 + }, + { + "epoch": 0.6083268621946121, + "grad_norm": 0.7529582857438246, + "learning_rate": 1.8753916929631586e-05, + "loss": 0.4046, + "step": 1273 + }, + { + "epoch": 0.6088047309001852, + "grad_norm": 0.5958352414182106, + "learning_rate": 1.8751369712626413e-05, + "loss": 0.385, + "step": 1274 + }, + { + "epoch": 0.6092825996057584, + "grad_norm": 0.5841015694045317, + "learning_rate": 1.874882006814565e-05, + "loss": 0.3854, + "step": 1275 + }, + { + "epoch": 0.6097604683113315, + "grad_norm": 0.6682019912149343, + "learning_rate": 1.874626799689653e-05, + "loss": 0.3775, + "step": 1276 + }, + { + "epoch": 0.6102383370169046, + "grad_norm": 0.5543315924200671, + "learning_rate": 1.874371349958695e-05, + "loss": 0.3767, + "step": 1277 + }, + { + "epoch": 0.6107162057224778, + "grad_norm": 0.5842970394229539, + "learning_rate": 1.874115657692548e-05, + "loss": 0.3992, + "step": 1278 + }, + { + "epoch": 0.6111940744280508, + "grad_norm": 0.5711349862291916, + "learning_rate": 1.8738597229621368e-05, + "loss": 0.3913, + "step": 1279 + }, + { + "epoch": 0.611671943133624, + "grad_norm": 0.6023071222272888, + "learning_rate": 1.8736035458384528e-05, + "loss": 0.3896, + "step": 1280 + }, + { + "epoch": 0.6121498118391971, + "grad_norm": 0.5777611740504066, + "learning_rate": 1.8733471263925553e-05, + "loss": 0.4059, + "step": 1281 + }, + { + "epoch": 0.6126276805447703, + "grad_norm": 0.5729042128264256, + "learning_rate": 1.8730904646955706e-05, + "loss": 0.4117, + "step": 1282 + }, + { + "epoch": 0.6131055492503434, + "grad_norm": 0.5307580555984255, + "learning_rate": 1.8728335608186923e-05, + "loss": 0.3967, + "step": 1283 + }, + { + "epoch": 0.6135834179559166, + "grad_norm": 0.5972005721716636, + "learning_rate": 1.8725764148331804e-05, + "loss": 0.3851, + "step": 1284 + }, + { + "epoch": 0.6140612866614897, + "grad_norm": 0.5486242431937549, + "learning_rate": 1.8723190268103634e-05, + "loss": 0.3921, + "step": 1285 + }, + { + "epoch": 0.6145391553670629, + "grad_norm": 0.5129417046829837, + "learning_rate": 1.8720613968216356e-05, + "loss": 0.3799, + "step": 1286 + }, + { + "epoch": 0.615017024072636, + "grad_norm": 0.5389718288002542, + "learning_rate": 1.871803524938459e-05, + "loss": 0.3784, + "step": 1287 + }, + { + "epoch": 0.6154948927782092, + "grad_norm": 0.5413932174652784, + "learning_rate": 1.871545411232363e-05, + "loss": 0.3883, + "step": 1288 + }, + { + "epoch": 0.6159727614837823, + "grad_norm": 0.5576540640582818, + "learning_rate": 1.871287055774944e-05, + "loss": 0.3725, + "step": 1289 + }, + { + "epoch": 0.6164506301893554, + "grad_norm": 0.5438843907191524, + "learning_rate": 1.8710284586378645e-05, + "loss": 0.3853, + "step": 1290 + }, + { + "epoch": 0.6169284988949286, + "grad_norm": 0.5921910838665108, + "learning_rate": 1.870769619892856e-05, + "loss": 0.3881, + "step": 1291 + }, + { + "epoch": 0.6174063676005017, + "grad_norm": 0.5281942703357103, + "learning_rate": 1.8705105396117145e-05, + "loss": 0.4026, + "step": 1292 + }, + { + "epoch": 0.6178842363060749, + "grad_norm": 0.6001108596932736, + "learning_rate": 1.870251217866305e-05, + "loss": 0.3875, + "step": 1293 + }, + { + "epoch": 0.618362105011648, + "grad_norm": 0.610945572306042, + "learning_rate": 1.8699916547285583e-05, + "loss": 0.3812, + "step": 1294 + }, + { + "epoch": 0.6188399737172212, + "grad_norm": 0.5695158823092838, + "learning_rate": 1.8697318502704734e-05, + "loss": 0.371, + "step": 1295 + }, + { + "epoch": 0.6193178424227943, + "grad_norm": 0.569180451522841, + "learning_rate": 1.869471804564115e-05, + "loss": 0.4033, + "step": 1296 + }, + { + "epoch": 0.6197957111283675, + "grad_norm": 0.5328007824386596, + "learning_rate": 1.869211517681615e-05, + "loss": 0.4016, + "step": 1297 + }, + { + "epoch": 0.6202735798339406, + "grad_norm": 0.5787431750554224, + "learning_rate": 1.868950989695173e-05, + "loss": 0.3875, + "step": 1298 + }, + { + "epoch": 0.6207514485395138, + "grad_norm": 0.7099261830662204, + "learning_rate": 1.8686902206770542e-05, + "loss": 0.3925, + "step": 1299 + }, + { + "epoch": 0.6212293172450869, + "grad_norm": 0.5775144630618253, + "learning_rate": 1.8684292106995916e-05, + "loss": 0.3876, + "step": 1300 + }, + { + "epoch": 0.6217071859506601, + "grad_norm": 0.5960304043808599, + "learning_rate": 1.868167959835185e-05, + "loss": 0.3966, + "step": 1301 + }, + { + "epoch": 0.6221850546562332, + "grad_norm": 0.6193706361120256, + "learning_rate": 1.8679064681563005e-05, + "loss": 0.4009, + "step": 1302 + }, + { + "epoch": 0.6226629233618064, + "grad_norm": 0.6051679338180642, + "learning_rate": 1.867644735735471e-05, + "loss": 0.385, + "step": 1303 + }, + { + "epoch": 0.6231407920673795, + "grad_norm": 0.6745787304908575, + "learning_rate": 1.8673827626452972e-05, + "loss": 0.4009, + "step": 1304 + }, + { + "epoch": 0.6236186607729526, + "grad_norm": 0.591651344835618, + "learning_rate": 1.8671205489584453e-05, + "loss": 0.3935, + "step": 1305 + }, + { + "epoch": 0.6240965294785258, + "grad_norm": 0.5499144339680516, + "learning_rate": 1.8668580947476487e-05, + "loss": 0.3803, + "step": 1306 + }, + { + "epoch": 0.6245743981840989, + "grad_norm": 0.5742789490169716, + "learning_rate": 1.8665954000857077e-05, + "loss": 0.3976, + "step": 1307 + }, + { + "epoch": 0.6250522668896721, + "grad_norm": 0.5342157539830956, + "learning_rate": 1.8663324650454896e-05, + "loss": 0.3966, + "step": 1308 + }, + { + "epoch": 0.6255301355952452, + "grad_norm": 0.557937880620906, + "learning_rate": 1.8660692896999272e-05, + "loss": 0.3891, + "step": 1309 + }, + { + "epoch": 0.6260080043008184, + "grad_norm": 0.6049761505152792, + "learning_rate": 1.865805874122021e-05, + "loss": 0.3903, + "step": 1310 + }, + { + "epoch": 0.6264858730063915, + "grad_norm": 0.5638630151330256, + "learning_rate": 1.865542218384838e-05, + "loss": 0.3987, + "step": 1311 + }, + { + "epoch": 0.6269637417119647, + "grad_norm": 0.5558339859479475, + "learning_rate": 1.865278322561512e-05, + "loss": 0.3838, + "step": 1312 + }, + { + "epoch": 0.6274416104175378, + "grad_norm": 0.7599896980096237, + "learning_rate": 1.8650141867252418e-05, + "loss": 0.3783, + "step": 1313 + }, + { + "epoch": 0.627919479123111, + "grad_norm": 0.5377592981486465, + "learning_rate": 1.8647498109492952e-05, + "loss": 0.3748, + "step": 1314 + }, + { + "epoch": 0.6283973478286841, + "grad_norm": 0.5747378065579319, + "learning_rate": 1.8644851953070045e-05, + "loss": 0.3797, + "step": 1315 + }, + { + "epoch": 0.6288752165342573, + "grad_norm": 0.5611211100705439, + "learning_rate": 1.8642203398717704e-05, + "loss": 0.374, + "step": 1316 + }, + { + "epoch": 0.6293530852398304, + "grad_norm": 0.5398727147760198, + "learning_rate": 1.8639552447170586e-05, + "loss": 0.4111, + "step": 1317 + }, + { + "epoch": 0.6298309539454034, + "grad_norm": 0.6478100942138757, + "learning_rate": 1.8636899099164016e-05, + "loss": 0.373, + "step": 1318 + }, + { + "epoch": 0.6303088226509767, + "grad_norm": 0.5239375476824424, + "learning_rate": 1.863424335543399e-05, + "loss": 0.3934, + "step": 1319 + }, + { + "epoch": 0.6307866913565497, + "grad_norm": 0.5594162158750057, + "learning_rate": 1.863158521671716e-05, + "loss": 0.3914, + "step": 1320 + }, + { + "epoch": 0.631264560062123, + "grad_norm": 0.5605106637954282, + "learning_rate": 1.862892468375085e-05, + "loss": 0.3907, + "step": 1321 + }, + { + "epoch": 0.631742428767696, + "grad_norm": 0.5566958695945204, + "learning_rate": 1.8626261757273047e-05, + "loss": 0.3853, + "step": 1322 + }, + { + "epoch": 0.6322202974732692, + "grad_norm": 0.5930367871951245, + "learning_rate": 1.8623596438022395e-05, + "loss": 0.3697, + "step": 1323 + }, + { + "epoch": 0.6326981661788423, + "grad_norm": 0.5744435257090056, + "learning_rate": 1.862092872673821e-05, + "loss": 0.3937, + "step": 1324 + }, + { + "epoch": 0.6331760348844155, + "grad_norm": 0.6885689441398278, + "learning_rate": 1.8618258624160465e-05, + "loss": 0.395, + "step": 1325 + }, + { + "epoch": 0.6336539035899886, + "grad_norm": 0.5363058119426725, + "learning_rate": 1.86155861310298e-05, + "loss": 0.3835, + "step": 1326 + }, + { + "epoch": 0.6341317722955618, + "grad_norm": 0.5648056274555979, + "learning_rate": 1.8612911248087523e-05, + "loss": 0.401, + "step": 1327 + }, + { + "epoch": 0.6346096410011349, + "grad_norm": 0.5517054491297217, + "learning_rate": 1.8610233976075595e-05, + "loss": 0.4094, + "step": 1328 + }, + { + "epoch": 0.6350875097067081, + "grad_norm": 0.610689280585854, + "learning_rate": 1.860755431573664e-05, + "loss": 0.4007, + "step": 1329 + }, + { + "epoch": 0.6355653784122812, + "grad_norm": 0.5856790883141125, + "learning_rate": 1.8604872267813954e-05, + "loss": 0.3868, + "step": 1330 + }, + { + "epoch": 0.6360432471178543, + "grad_norm": 0.5170823321710041, + "learning_rate": 1.8602187833051487e-05, + "loss": 0.3772, + "step": 1331 + }, + { + "epoch": 0.6365211158234275, + "grad_norm": 0.6525237313804392, + "learning_rate": 1.859950101219386e-05, + "loss": 0.3696, + "step": 1332 + }, + { + "epoch": 0.6369989845290006, + "grad_norm": 0.5357547476404161, + "learning_rate": 1.859681180598634e-05, + "loss": 0.3866, + "step": 1333 + }, + { + "epoch": 0.6374768532345738, + "grad_norm": 0.5669350371510699, + "learning_rate": 1.859412021517487e-05, + "loss": 0.3887, + "step": 1334 + }, + { + "epoch": 0.6379547219401469, + "grad_norm": 0.5936322359971947, + "learning_rate": 1.859142624050605e-05, + "loss": 0.4002, + "step": 1335 + }, + { + "epoch": 0.6384325906457201, + "grad_norm": 0.572891703767859, + "learning_rate": 1.8588729882727142e-05, + "loss": 0.3722, + "step": 1336 + }, + { + "epoch": 0.6389104593512932, + "grad_norm": 0.6958298176911252, + "learning_rate": 1.8586031142586073e-05, + "loss": 0.3872, + "step": 1337 + }, + { + "epoch": 0.6393883280568664, + "grad_norm": 0.6299358564635291, + "learning_rate": 1.858333002083141e-05, + "loss": 0.4038, + "step": 1338 + }, + { + "epoch": 0.6398661967624395, + "grad_norm": 0.5567224363270409, + "learning_rate": 1.8580626518212413e-05, + "loss": 0.3945, + "step": 1339 + }, + { + "epoch": 0.6403440654680127, + "grad_norm": 0.5356857164607767, + "learning_rate": 1.8577920635478976e-05, + "loss": 0.4022, + "step": 1340 + }, + { + "epoch": 0.6408219341735858, + "grad_norm": 0.5602058275360913, + "learning_rate": 1.8575212373381672e-05, + "loss": 0.3972, + "step": 1341 + }, + { + "epoch": 0.641299802879159, + "grad_norm": 0.5391006769979308, + "learning_rate": 1.8572501732671714e-05, + "loss": 0.4078, + "step": 1342 + }, + { + "epoch": 0.6417776715847321, + "grad_norm": 0.5274111377005911, + "learning_rate": 1.8569788714100993e-05, + "loss": 0.3887, + "step": 1343 + }, + { + "epoch": 0.6422555402903053, + "grad_norm": 0.5803845464955352, + "learning_rate": 1.8567073318422053e-05, + "loss": 0.402, + "step": 1344 + }, + { + "epoch": 0.6427334089958784, + "grad_norm": 0.5602953934235403, + "learning_rate": 1.8564355546388094e-05, + "loss": 0.4069, + "step": 1345 + }, + { + "epoch": 0.6432112777014515, + "grad_norm": 0.5353517895711432, + "learning_rate": 1.856163539875298e-05, + "loss": 0.3795, + "step": 1346 + }, + { + "epoch": 0.6436891464070247, + "grad_norm": 0.5562727303504088, + "learning_rate": 1.855891287627123e-05, + "loss": 0.388, + "step": 1347 + }, + { + "epoch": 0.6441670151125978, + "grad_norm": 0.6540276248275614, + "learning_rate": 1.8556187979698024e-05, + "loss": 0.3935, + "step": 1348 + }, + { + "epoch": 0.644644883818171, + "grad_norm": 0.5318329859411215, + "learning_rate": 1.85534607097892e-05, + "loss": 0.391, + "step": 1349 + }, + { + "epoch": 0.6451227525237441, + "grad_norm": 0.5984520705371762, + "learning_rate": 1.855073106730126e-05, + "loss": 0.3743, + "step": 1350 + }, + { + "epoch": 0.6456006212293173, + "grad_norm": 0.8656630009596894, + "learning_rate": 1.8547999052991353e-05, + "loss": 0.3831, + "step": 1351 + }, + { + "epoch": 0.6460784899348904, + "grad_norm": 0.5900719253503339, + "learning_rate": 1.854526466761729e-05, + "loss": 0.391, + "step": 1352 + }, + { + "epoch": 0.6465563586404636, + "grad_norm": 0.5874685110312172, + "learning_rate": 1.8542527911937546e-05, + "loss": 0.3891, + "step": 1353 + }, + { + "epoch": 0.6470342273460367, + "grad_norm": 0.5208727410164187, + "learning_rate": 1.8539788786711247e-05, + "loss": 0.388, + "step": 1354 + }, + { + "epoch": 0.6475120960516099, + "grad_norm": 0.5379067771394836, + "learning_rate": 1.8537047292698175e-05, + "loss": 0.3847, + "step": 1355 + }, + { + "epoch": 0.647989964757183, + "grad_norm": 0.5686849032925404, + "learning_rate": 1.853430343065878e-05, + "loss": 0.3902, + "step": 1356 + }, + { + "epoch": 0.6484678334627562, + "grad_norm": 0.5425524249952016, + "learning_rate": 1.853155720135415e-05, + "loss": 0.3964, + "step": 1357 + }, + { + "epoch": 0.6489457021683293, + "grad_norm": 0.5446815678020227, + "learning_rate": 1.8528808605546053e-05, + "loss": 0.4028, + "step": 1358 + }, + { + "epoch": 0.6494235708739023, + "grad_norm": 0.5502869577791677, + "learning_rate": 1.852605764399689e-05, + "loss": 0.4149, + "step": 1359 + }, + { + "epoch": 0.6499014395794755, + "grad_norm": 0.5356342796004244, + "learning_rate": 1.852330431746973e-05, + "loss": 0.3927, + "step": 1360 + }, + { + "epoch": 0.6503793082850486, + "grad_norm": 0.5620414346740084, + "learning_rate": 1.852054862672831e-05, + "loss": 0.3847, + "step": 1361 + }, + { + "epoch": 0.6508571769906218, + "grad_norm": 0.5494824917990645, + "learning_rate": 1.8517790572536996e-05, + "loss": 0.3893, + "step": 1362 + }, + { + "epoch": 0.6513350456961949, + "grad_norm": 0.6052620118835617, + "learning_rate": 1.851503015566083e-05, + "loss": 0.3853, + "step": 1363 + }, + { + "epoch": 0.6518129144017681, + "grad_norm": 0.5723298054745771, + "learning_rate": 1.85122673768655e-05, + "loss": 0.3837, + "step": 1364 + }, + { + "epoch": 0.6522907831073412, + "grad_norm": 1.0042668593415573, + "learning_rate": 1.8509502236917353e-05, + "loss": 0.3901, + "step": 1365 + }, + { + "epoch": 0.6527686518129144, + "grad_norm": 0.7999380988229664, + "learning_rate": 1.8506734736583388e-05, + "loss": 0.3922, + "step": 1366 + }, + { + "epoch": 0.6532465205184875, + "grad_norm": 0.583257327373365, + "learning_rate": 1.850396487663127e-05, + "loss": 0.3916, + "step": 1367 + }, + { + "epoch": 0.6537243892240607, + "grad_norm": 0.563446303201131, + "learning_rate": 1.85011926578293e-05, + "loss": 0.3973, + "step": 1368 + }, + { + "epoch": 0.6542022579296338, + "grad_norm": 0.5276619379868954, + "learning_rate": 1.8498418080946444e-05, + "loss": 0.3765, + "step": 1369 + }, + { + "epoch": 0.654680126635207, + "grad_norm": 0.5838719171402456, + "learning_rate": 1.8495641146752322e-05, + "loss": 0.4094, + "step": 1370 + }, + { + "epoch": 0.6551579953407801, + "grad_norm": 0.5690336348547724, + "learning_rate": 1.8492861856017206e-05, + "loss": 0.3861, + "step": 1371 + }, + { + "epoch": 0.6556358640463532, + "grad_norm": 0.6359126253694918, + "learning_rate": 1.8490080209512024e-05, + "loss": 0.4018, + "step": 1372 + }, + { + "epoch": 0.6561137327519264, + "grad_norm": 0.6504835153018964, + "learning_rate": 1.848729620800835e-05, + "loss": 0.4012, + "step": 1373 + }, + { + "epoch": 0.6565916014574995, + "grad_norm": 0.5305539981752121, + "learning_rate": 1.8484509852278426e-05, + "loss": 0.387, + "step": 1374 + }, + { + "epoch": 0.6570694701630727, + "grad_norm": 0.5413728281865292, + "learning_rate": 1.848172114309513e-05, + "loss": 0.3996, + "step": 1375 + }, + { + "epoch": 0.6575473388686458, + "grad_norm": 0.5770591202114015, + "learning_rate": 1.847893008123201e-05, + "loss": 0.3918, + "step": 1376 + }, + { + "epoch": 0.658025207574219, + "grad_norm": 0.5418743549514499, + "learning_rate": 1.8476136667463246e-05, + "loss": 0.3675, + "step": 1377 + }, + { + "epoch": 0.6585030762797921, + "grad_norm": 0.5384300809696159, + "learning_rate": 1.8473340902563686e-05, + "loss": 0.3804, + "step": 1378 + }, + { + "epoch": 0.6589809449853653, + "grad_norm": 0.582974175378902, + "learning_rate": 1.847054278730883e-05, + "loss": 0.3924, + "step": 1379 + }, + { + "epoch": 0.6594588136909384, + "grad_norm": 0.521915823350355, + "learning_rate": 1.8467742322474822e-05, + "loss": 0.3678, + "step": 1380 + }, + { + "epoch": 0.6599366823965116, + "grad_norm": 0.552505801390087, + "learning_rate": 1.846493950883846e-05, + "loss": 0.3673, + "step": 1381 + }, + { + "epoch": 0.6604145511020847, + "grad_norm": 0.5738503571000322, + "learning_rate": 1.84621343471772e-05, + "loss": 0.3784, + "step": 1382 + }, + { + "epoch": 0.6608924198076579, + "grad_norm": 0.543933586148958, + "learning_rate": 1.8459326838269137e-05, + "loss": 0.3883, + "step": 1383 + }, + { + "epoch": 0.661370288513231, + "grad_norm": 0.5624988585079359, + "learning_rate": 1.8456516982893036e-05, + "loss": 0.3841, + "step": 1384 + }, + { + "epoch": 0.6618481572188041, + "grad_norm": 0.8760123821285536, + "learning_rate": 1.845370478182829e-05, + "loss": 0.3789, + "step": 1385 + }, + { + "epoch": 0.6623260259243773, + "grad_norm": 0.5255149377123545, + "learning_rate": 1.8450890235854958e-05, + "loss": 0.3995, + "step": 1386 + }, + { + "epoch": 0.6628038946299504, + "grad_norm": 0.8238825404916186, + "learning_rate": 1.8448073345753746e-05, + "loss": 0.381, + "step": 1387 + }, + { + "epoch": 0.6632817633355236, + "grad_norm": 0.5368697433670593, + "learning_rate": 1.8445254112306013e-05, + "loss": 0.3901, + "step": 1388 + }, + { + "epoch": 0.6637596320410967, + "grad_norm": 0.5844500955136835, + "learning_rate": 1.8442432536293756e-05, + "loss": 0.3841, + "step": 1389 + }, + { + "epoch": 0.6642375007466699, + "grad_norm": 0.5317108098879734, + "learning_rate": 1.8439608618499637e-05, + "loss": 0.3846, + "step": 1390 + }, + { + "epoch": 0.664715369452243, + "grad_norm": 0.5486204761949643, + "learning_rate": 1.843678235970696e-05, + "loss": 0.4078, + "step": 1391 + }, + { + "epoch": 0.6651932381578162, + "grad_norm": 0.5621459280922341, + "learning_rate": 1.8433953760699678e-05, + "loss": 0.3711, + "step": 1392 + }, + { + "epoch": 0.6656711068633893, + "grad_norm": 0.626320660146386, + "learning_rate": 1.8431122822262398e-05, + "loss": 0.3721, + "step": 1393 + }, + { + "epoch": 0.6661489755689625, + "grad_norm": 0.5634692726095906, + "learning_rate": 1.8428289545180367e-05, + "loss": 0.3997, + "step": 1394 + }, + { + "epoch": 0.6666268442745356, + "grad_norm": 0.5806101182458545, + "learning_rate": 1.842545393023949e-05, + "loss": 0.3834, + "step": 1395 + }, + { + "epoch": 0.6671047129801088, + "grad_norm": 0.5508052241942929, + "learning_rate": 1.8422615978226313e-05, + "loss": 0.398, + "step": 1396 + }, + { + "epoch": 0.6675825816856819, + "grad_norm": 0.5741650592949158, + "learning_rate": 1.8419775689928035e-05, + "loss": 0.3806, + "step": 1397 + }, + { + "epoch": 0.6680604503912551, + "grad_norm": 0.5163994653334596, + "learning_rate": 1.8416933066132507e-05, + "loss": 0.3815, + "step": 1398 + }, + { + "epoch": 0.6685383190968281, + "grad_norm": 0.6173088911528165, + "learning_rate": 1.8414088107628215e-05, + "loss": 0.3758, + "step": 1399 + }, + { + "epoch": 0.6690161878024012, + "grad_norm": 0.6040108460649616, + "learning_rate": 1.841124081520431e-05, + "loss": 0.3933, + "step": 1400 + }, + { + "epoch": 0.6694940565079744, + "grad_norm": 0.6180489249655141, + "learning_rate": 1.840839118965057e-05, + "loss": 0.3931, + "step": 1401 + }, + { + "epoch": 0.6699719252135475, + "grad_norm": 0.5294472563078757, + "learning_rate": 1.8405539231757435e-05, + "loss": 0.3943, + "step": 1402 + }, + { + "epoch": 0.6704497939191207, + "grad_norm": 0.6292773643061201, + "learning_rate": 1.840268494231599e-05, + "loss": 0.3938, + "step": 1403 + }, + { + "epoch": 0.6709276626246938, + "grad_norm": 0.713996443805077, + "learning_rate": 1.839982832211796e-05, + "loss": 0.3781, + "step": 1404 + }, + { + "epoch": 0.671405531330267, + "grad_norm": 0.5799035626176681, + "learning_rate": 1.8396969371955724e-05, + "loss": 0.4023, + "step": 1405 + }, + { + "epoch": 0.6718834000358401, + "grad_norm": 0.5700183080878829, + "learning_rate": 1.8394108092622307e-05, + "loss": 0.3844, + "step": 1406 + }, + { + "epoch": 0.6723612687414133, + "grad_norm": 0.6189918262496643, + "learning_rate": 1.839124448491137e-05, + "loss": 0.3948, + "step": 1407 + }, + { + "epoch": 0.6728391374469864, + "grad_norm": 0.5222965203774697, + "learning_rate": 1.8388378549617238e-05, + "loss": 0.4008, + "step": 1408 + }, + { + "epoch": 0.6733170061525596, + "grad_norm": 0.7906792457229658, + "learning_rate": 1.838551028753486e-05, + "loss": 0.3897, + "step": 1409 + }, + { + "epoch": 0.6737948748581327, + "grad_norm": 0.6020606169233097, + "learning_rate": 1.838263969945985e-05, + "loss": 0.3839, + "step": 1410 + }, + { + "epoch": 0.6742727435637059, + "grad_norm": 0.5357579566712205, + "learning_rate": 1.8379766786188457e-05, + "loss": 0.3836, + "step": 1411 + }, + { + "epoch": 0.674750612269279, + "grad_norm": 0.5633942493212672, + "learning_rate": 1.8376891548517566e-05, + "loss": 0.3847, + "step": 1412 + }, + { + "epoch": 0.6752284809748521, + "grad_norm": 0.5798470271651702, + "learning_rate": 1.837401398724473e-05, + "loss": 0.3813, + "step": 1413 + }, + { + "epoch": 0.6757063496804253, + "grad_norm": 0.5636278137204155, + "learning_rate": 1.837113410316813e-05, + "loss": 0.3861, + "step": 1414 + }, + { + "epoch": 0.6761842183859984, + "grad_norm": 0.6008374318361374, + "learning_rate": 1.836825189708659e-05, + "loss": 0.3771, + "step": 1415 + }, + { + "epoch": 0.6766620870915716, + "grad_norm": 0.5521729571309322, + "learning_rate": 1.836536736979959e-05, + "loss": 0.3854, + "step": 1416 + }, + { + "epoch": 0.6771399557971447, + "grad_norm": 0.557361883029169, + "learning_rate": 1.8362480522107244e-05, + "loss": 0.3679, + "step": 1417 + }, + { + "epoch": 0.6776178245027179, + "grad_norm": 0.5435824654885472, + "learning_rate": 1.8359591354810313e-05, + "loss": 0.391, + "step": 1418 + }, + { + "epoch": 0.678095693208291, + "grad_norm": 0.5417613234703497, + "learning_rate": 1.8356699868710198e-05, + "loss": 0.3881, + "step": 1419 + }, + { + "epoch": 0.6785735619138642, + "grad_norm": 0.5308236388209336, + "learning_rate": 1.8353806064608953e-05, + "loss": 0.4036, + "step": 1420 + }, + { + "epoch": 0.6790514306194373, + "grad_norm": 0.5884130068215864, + "learning_rate": 1.8350909943309262e-05, + "loss": 0.3732, + "step": 1421 + }, + { + "epoch": 0.6795292993250105, + "grad_norm": 0.5409490290787906, + "learning_rate": 1.8348011505614462e-05, + "loss": 0.3896, + "step": 1422 + }, + { + "epoch": 0.6800071680305836, + "grad_norm": 0.5970144011295364, + "learning_rate": 1.8345110752328527e-05, + "loss": 0.3834, + "step": 1423 + }, + { + "epoch": 0.6804850367361568, + "grad_norm": 0.5261236913058273, + "learning_rate": 1.834220768425607e-05, + "loss": 0.389, + "step": 1424 + }, + { + "epoch": 0.6809629054417299, + "grad_norm": 0.5800849754387479, + "learning_rate": 1.833930230220236e-05, + "loss": 0.3769, + "step": 1425 + }, + { + "epoch": 0.681440774147303, + "grad_norm": 0.5513763570891704, + "learning_rate": 1.8336394606973293e-05, + "loss": 0.3755, + "step": 1426 + }, + { + "epoch": 0.6819186428528762, + "grad_norm": 0.5923393622326396, + "learning_rate": 1.8333484599375414e-05, + "loss": 0.3781, + "step": 1427 + }, + { + "epoch": 0.6823965115584493, + "grad_norm": 0.6542084688710019, + "learning_rate": 1.8330572280215904e-05, + "loss": 0.3802, + "step": 1428 + }, + { + "epoch": 0.6828743802640225, + "grad_norm": 0.5657355120408663, + "learning_rate": 1.8327657650302596e-05, + "loss": 0.3916, + "step": 1429 + }, + { + "epoch": 0.6833522489695956, + "grad_norm": 0.6676995536636485, + "learning_rate": 1.8324740710443955e-05, + "loss": 0.3841, + "step": 1430 + }, + { + "epoch": 0.6838301176751688, + "grad_norm": 0.5639564465708701, + "learning_rate": 1.8321821461449084e-05, + "loss": 0.3787, + "step": 1431 + }, + { + "epoch": 0.6843079863807419, + "grad_norm": 0.5518394477924401, + "learning_rate": 1.831889990412773e-05, + "loss": 0.389, + "step": 1432 + }, + { + "epoch": 0.6847858550863151, + "grad_norm": 0.5756892172775409, + "learning_rate": 1.831597603929029e-05, + "loss": 0.391, + "step": 1433 + }, + { + "epoch": 0.6852637237918882, + "grad_norm": 0.538534263526804, + "learning_rate": 1.8313049867747788e-05, + "loss": 0.3897, + "step": 1434 + }, + { + "epoch": 0.6857415924974614, + "grad_norm": 0.5680170895960552, + "learning_rate": 1.831012139031189e-05, + "loss": 0.3967, + "step": 1435 + }, + { + "epoch": 0.6862194612030345, + "grad_norm": 0.586463487068828, + "learning_rate": 1.830719060779491e-05, + "loss": 0.3813, + "step": 1436 + }, + { + "epoch": 0.6866973299086077, + "grad_norm": 0.6179525461545747, + "learning_rate": 1.830425752100979e-05, + "loss": 0.3918, + "step": 1437 + }, + { + "epoch": 0.6871751986141807, + "grad_norm": 0.6674306038089053, + "learning_rate": 1.8301322130770117e-05, + "loss": 0.3828, + "step": 1438 + }, + { + "epoch": 0.687653067319754, + "grad_norm": 2.332261711848657, + "learning_rate": 1.829838443789012e-05, + "loss": 0.3838, + "step": 1439 + }, + { + "epoch": 0.688130936025327, + "grad_norm": 0.6981750566847236, + "learning_rate": 1.829544444318466e-05, + "loss": 0.3946, + "step": 1440 + }, + { + "epoch": 0.6886088047309001, + "grad_norm": 0.5303694950239685, + "learning_rate": 1.829250214746924e-05, + "loss": 0.3809, + "step": 1441 + }, + { + "epoch": 0.6890866734364733, + "grad_norm": 0.7179900691839765, + "learning_rate": 1.8289557551560002e-05, + "loss": 0.3862, + "step": 1442 + }, + { + "epoch": 0.6895645421420464, + "grad_norm": 0.5345675046456023, + "learning_rate": 1.8286610656273724e-05, + "loss": 0.3658, + "step": 1443 + }, + { + "epoch": 0.6900424108476196, + "grad_norm": 0.6704734216159564, + "learning_rate": 1.828366146242782e-05, + "loss": 0.4057, + "step": 1444 + }, + { + "epoch": 0.6905202795531927, + "grad_norm": 0.6051417776741872, + "learning_rate": 1.8280709970840352e-05, + "loss": 0.4059, + "step": 1445 + }, + { + "epoch": 0.6909981482587659, + "grad_norm": 0.5450521205173916, + "learning_rate": 1.8277756182330008e-05, + "loss": 0.3914, + "step": 1446 + }, + { + "epoch": 0.691476016964339, + "grad_norm": 0.6382596888643569, + "learning_rate": 1.8274800097716113e-05, + "loss": 0.3682, + "step": 1447 + }, + { + "epoch": 0.6919538856699122, + "grad_norm": 0.5469859901894466, + "learning_rate": 1.827184171781864e-05, + "loss": 0.396, + "step": 1448 + }, + { + "epoch": 0.6924317543754853, + "grad_norm": 0.6169779108480756, + "learning_rate": 1.8268881043458183e-05, + "loss": 0.3852, + "step": 1449 + }, + { + "epoch": 0.6929096230810585, + "grad_norm": 0.5515949799511892, + "learning_rate": 1.8265918075455985e-05, + "loss": 0.3879, + "step": 1450 + }, + { + "epoch": 0.6933874917866316, + "grad_norm": 0.5852784051902299, + "learning_rate": 1.8262952814633927e-05, + "loss": 0.3777, + "step": 1451 + }, + { + "epoch": 0.6938653604922048, + "grad_norm": 0.5747324569861376, + "learning_rate": 1.8259985261814506e-05, + "loss": 0.3708, + "step": 1452 + }, + { + "epoch": 0.6943432291977779, + "grad_norm": 0.5584613448774921, + "learning_rate": 1.825701541782088e-05, + "loss": 0.3939, + "step": 1453 + }, + { + "epoch": 0.694821097903351, + "grad_norm": 0.6166551656589301, + "learning_rate": 1.825404328347683e-05, + "loss": 0.3956, + "step": 1454 + }, + { + "epoch": 0.6952989666089242, + "grad_norm": 0.5834938296403458, + "learning_rate": 1.8251068859606777e-05, + "loss": 0.3797, + "step": 1455 + }, + { + "epoch": 0.6957768353144973, + "grad_norm": 0.6153031826664653, + "learning_rate": 1.8248092147035762e-05, + "loss": 0.3873, + "step": 1456 + }, + { + "epoch": 0.6962547040200705, + "grad_norm": 0.5549036883726521, + "learning_rate": 1.8245113146589478e-05, + "loss": 0.3811, + "step": 1457 + }, + { + "epoch": 0.6967325727256436, + "grad_norm": 0.6100500222117385, + "learning_rate": 1.8242131859094253e-05, + "loss": 0.3739, + "step": 1458 + }, + { + "epoch": 0.6972104414312168, + "grad_norm": 0.5644164022778724, + "learning_rate": 1.823914828537704e-05, + "loss": 0.3805, + "step": 1459 + }, + { + "epoch": 0.6976883101367899, + "grad_norm": 0.5720320474251424, + "learning_rate": 1.823616242626542e-05, + "loss": 0.3716, + "step": 1460 + }, + { + "epoch": 0.6981661788423631, + "grad_norm": 0.5353119220152026, + "learning_rate": 1.8233174282587636e-05, + "loss": 0.3736, + "step": 1461 + }, + { + "epoch": 0.6986440475479362, + "grad_norm": 0.6632595208658023, + "learning_rate": 1.823018385517253e-05, + "loss": 0.3728, + "step": 1462 + }, + { + "epoch": 0.6991219162535094, + "grad_norm": 0.5332034499910223, + "learning_rate": 1.8227191144849606e-05, + "loss": 0.3905, + "step": 1463 + }, + { + "epoch": 0.6995997849590825, + "grad_norm": 0.6114377061257181, + "learning_rate": 1.822419615244898e-05, + "loss": 0.3729, + "step": 1464 + }, + { + "epoch": 0.7000776536646557, + "grad_norm": 0.5341668385517253, + "learning_rate": 1.8221198878801415e-05, + "loss": 0.3839, + "step": 1465 + }, + { + "epoch": 0.7005555223702288, + "grad_norm": 0.6080766324288399, + "learning_rate": 1.82181993247383e-05, + "loss": 0.3795, + "step": 1466 + }, + { + "epoch": 0.7010333910758019, + "grad_norm": 0.5588790456997228, + "learning_rate": 1.8215197491091657e-05, + "loss": 0.4014, + "step": 1467 + }, + { + "epoch": 0.7015112597813751, + "grad_norm": 0.5512763293405722, + "learning_rate": 1.8212193378694145e-05, + "loss": 0.3927, + "step": 1468 + }, + { + "epoch": 0.7019891284869482, + "grad_norm": 0.5615590012756244, + "learning_rate": 1.8209186988379053e-05, + "loss": 0.3922, + "step": 1469 + }, + { + "epoch": 0.7024669971925214, + "grad_norm": 0.5651868825673871, + "learning_rate": 1.8206178320980295e-05, + "loss": 0.3872, + "step": 1470 + }, + { + "epoch": 0.7029448658980945, + "grad_norm": 0.5672597491790865, + "learning_rate": 1.8203167377332428e-05, + "loss": 0.3755, + "step": 1471 + }, + { + "epoch": 0.7034227346036677, + "grad_norm": 0.6384822322202582, + "learning_rate": 1.820015415827063e-05, + "loss": 0.388, + "step": 1472 + }, + { + "epoch": 0.7039006033092408, + "grad_norm": 0.6101936905626903, + "learning_rate": 1.8197138664630714e-05, + "loss": 0.4005, + "step": 1473 + }, + { + "epoch": 0.704378472014814, + "grad_norm": 0.5780890528276408, + "learning_rate": 1.819412089724913e-05, + "loss": 0.3919, + "step": 1474 + }, + { + "epoch": 0.704856340720387, + "grad_norm": 0.5709336635510021, + "learning_rate": 1.819110085696295e-05, + "loss": 0.3851, + "step": 1475 + }, + { + "epoch": 0.7053342094259603, + "grad_norm": 0.5653007211064114, + "learning_rate": 1.8188078544609885e-05, + "loss": 0.3736, + "step": 1476 + }, + { + "epoch": 0.7058120781315333, + "grad_norm": 0.5489304827020616, + "learning_rate": 1.8185053961028262e-05, + "loss": 0.3939, + "step": 1477 + }, + { + "epoch": 0.7062899468371066, + "grad_norm": 0.57579168376753, + "learning_rate": 1.8182027107057054e-05, + "loss": 0.3797, + "step": 1478 + }, + { + "epoch": 0.7067678155426796, + "grad_norm": 0.5694198738113794, + "learning_rate": 1.8178997983535852e-05, + "loss": 0.3695, + "step": 1479 + }, + { + "epoch": 0.7072456842482527, + "grad_norm": 0.6271433046445659, + "learning_rate": 1.817596659130489e-05, + "loss": 0.3941, + "step": 1480 + }, + { + "epoch": 0.7077235529538259, + "grad_norm": 0.5356698848547248, + "learning_rate": 1.8172932931205018e-05, + "loss": 0.3814, + "step": 1481 + }, + { + "epoch": 0.708201421659399, + "grad_norm": 0.5350117309625678, + "learning_rate": 1.8169897004077714e-05, + "loss": 0.3684, + "step": 1482 + }, + { + "epoch": 0.7086792903649722, + "grad_norm": 0.563828736209021, + "learning_rate": 1.8166858810765093e-05, + "loss": 0.3759, + "step": 1483 + }, + { + "epoch": 0.7091571590705453, + "grad_norm": 0.5015045069229582, + "learning_rate": 1.8163818352109905e-05, + "loss": 0.3837, + "step": 1484 + }, + { + "epoch": 0.7096350277761185, + "grad_norm": 0.5494681307134424, + "learning_rate": 1.816077562895551e-05, + "loss": 0.3835, + "step": 1485 + }, + { + "epoch": 0.7101128964816916, + "grad_norm": 0.8231790789588842, + "learning_rate": 1.8157730642145912e-05, + "loss": 0.3928, + "step": 1486 + }, + { + "epoch": 0.7105907651872648, + "grad_norm": 0.5469496135283491, + "learning_rate": 1.815468339252573e-05, + "loss": 0.3996, + "step": 1487 + }, + { + "epoch": 0.7110686338928379, + "grad_norm": 0.5496003239400815, + "learning_rate": 1.8151633880940226e-05, + "loss": 0.3915, + "step": 1488 + }, + { + "epoch": 0.7115465025984111, + "grad_norm": 0.5228411620381861, + "learning_rate": 1.8148582108235274e-05, + "loss": 0.3725, + "step": 1489 + }, + { + "epoch": 0.7120243713039842, + "grad_norm": 0.5704507706283268, + "learning_rate": 1.814552807525738e-05, + "loss": 0.3919, + "step": 1490 + }, + { + "epoch": 0.7125022400095574, + "grad_norm": 0.648766640982509, + "learning_rate": 1.8142471782853686e-05, + "loss": 0.3908, + "step": 1491 + }, + { + "epoch": 0.7129801087151305, + "grad_norm": 0.8667271303998992, + "learning_rate": 1.813941323187195e-05, + "loss": 0.386, + "step": 1492 + }, + { + "epoch": 0.7134579774207037, + "grad_norm": 0.5901056863467703, + "learning_rate": 1.813635242316056e-05, + "loss": 0.3683, + "step": 1493 + }, + { + "epoch": 0.7139358461262768, + "grad_norm": 0.5222770355952134, + "learning_rate": 1.813328935756853e-05, + "loss": 0.396, + "step": 1494 + }, + { + "epoch": 0.7144137148318499, + "grad_norm": 0.6020991844265385, + "learning_rate": 1.81302240359455e-05, + "loss": 0.3642, + "step": 1495 + }, + { + "epoch": 0.7148915835374231, + "grad_norm": 0.573707434054128, + "learning_rate": 1.812715645914174e-05, + "loss": 0.3872, + "step": 1496 + }, + { + "epoch": 0.7153694522429962, + "grad_norm": 0.5132272553395804, + "learning_rate": 1.8124086628008137e-05, + "loss": 0.3992, + "step": 1497 + }, + { + "epoch": 0.7158473209485694, + "grad_norm": 0.5814015590506686, + "learning_rate": 1.812101454339621e-05, + "loss": 0.3813, + "step": 1498 + }, + { + "epoch": 0.7163251896541425, + "grad_norm": 0.5029998615126221, + "learning_rate": 1.81179402061581e-05, + "loss": 0.3932, + "step": 1499 + }, + { + "epoch": 0.7168030583597157, + "grad_norm": 0.5827864769681101, + "learning_rate": 1.8114863617146576e-05, + "loss": 0.3809, + "step": 1500 + }, + { + "epoch": 0.7172809270652888, + "grad_norm": 0.6147481989073579, + "learning_rate": 1.8111784777215028e-05, + "loss": 0.3914, + "step": 1501 + }, + { + "epoch": 0.717758795770862, + "grad_norm": 0.6022165391681865, + "learning_rate": 1.8108703687217474e-05, + "loss": 0.3926, + "step": 1502 + }, + { + "epoch": 0.7182366644764351, + "grad_norm": 0.5749879738062318, + "learning_rate": 1.8105620348008548e-05, + "loss": 0.3698, + "step": 1503 + }, + { + "epoch": 0.7187145331820083, + "grad_norm": 0.5144494515929651, + "learning_rate": 1.810253476044352e-05, + "loss": 0.3903, + "step": 1504 + }, + { + "epoch": 0.7191924018875814, + "grad_norm": 0.6680656863681617, + "learning_rate": 1.8099446925378278e-05, + "loss": 0.4031, + "step": 1505 + }, + { + "epoch": 0.7196702705931546, + "grad_norm": 0.5363554834856091, + "learning_rate": 1.809635684366933e-05, + "loss": 0.3897, + "step": 1506 + }, + { + "epoch": 0.7201481392987277, + "grad_norm": 0.57060574097619, + "learning_rate": 1.809326451617381e-05, + "loss": 0.3847, + "step": 1507 + }, + { + "epoch": 0.7206260080043008, + "grad_norm": 0.5082209282023774, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.386, + "step": 1508 + }, + { + "epoch": 0.721103876709874, + "grad_norm": 0.531721764246233, + "learning_rate": 1.8087073127254708e-05, + "loss": 0.379, + "step": 1509 + }, + { + "epoch": 0.7215817454154471, + "grad_norm": 0.5672223837229015, + "learning_rate": 1.8083974067548506e-05, + "loss": 0.3743, + "step": 1510 + }, + { + "epoch": 0.7220596141210203, + "grad_norm": 0.557363288254791, + "learning_rate": 1.80808727654905e-05, + "loss": 0.3922, + "step": 1511 + }, + { + "epoch": 0.7225374828265934, + "grad_norm": 0.5524986533091046, + "learning_rate": 1.807776922194093e-05, + "loss": 0.3828, + "step": 1512 + }, + { + "epoch": 0.7230153515321666, + "grad_norm": 0.5624030376925013, + "learning_rate": 1.807466343776067e-05, + "loss": 0.3879, + "step": 1513 + }, + { + "epoch": 0.7234932202377397, + "grad_norm": 0.5587652959424021, + "learning_rate": 1.8071555413811202e-05, + "loss": 0.3977, + "step": 1514 + }, + { + "epoch": 0.7239710889433129, + "grad_norm": 0.5243505053339486, + "learning_rate": 1.806844515095465e-05, + "loss": 0.3756, + "step": 1515 + }, + { + "epoch": 0.724448957648886, + "grad_norm": 0.5136588134139919, + "learning_rate": 1.8065332650053733e-05, + "loss": 0.386, + "step": 1516 + }, + { + "epoch": 0.7249268263544592, + "grad_norm": 0.765740907023463, + "learning_rate": 1.806221791197181e-05, + "loss": 0.3731, + "step": 1517 + }, + { + "epoch": 0.7254046950600322, + "grad_norm": 0.5765742196934072, + "learning_rate": 1.8059100937572853e-05, + "loss": 0.3859, + "step": 1518 + }, + { + "epoch": 0.7258825637656054, + "grad_norm": 0.508218622133711, + "learning_rate": 1.8055981727721454e-05, + "loss": 0.3887, + "step": 1519 + }, + { + "epoch": 0.7263604324711785, + "grad_norm": 0.5169436575801677, + "learning_rate": 1.8052860283282832e-05, + "loss": 0.3969, + "step": 1520 + }, + { + "epoch": 0.7268383011767516, + "grad_norm": 0.5627369158971857, + "learning_rate": 1.8049736605122814e-05, + "loss": 0.3957, + "step": 1521 + }, + { + "epoch": 0.7273161698823248, + "grad_norm": 0.5642116374092477, + "learning_rate": 1.804661069410786e-05, + "loss": 0.3877, + "step": 1522 + }, + { + "epoch": 0.7277940385878979, + "grad_norm": 0.5632828254864813, + "learning_rate": 1.8043482551105038e-05, + "loss": 0.3846, + "step": 1523 + }, + { + "epoch": 0.7282719072934711, + "grad_norm": 0.6720264991530787, + "learning_rate": 1.8040352176982042e-05, + "loss": 0.3742, + "step": 1524 + }, + { + "epoch": 0.7287497759990442, + "grad_norm": 0.5295580482800217, + "learning_rate": 1.8037219572607177e-05, + "loss": 0.3812, + "step": 1525 + }, + { + "epoch": 0.7292276447046174, + "grad_norm": 0.5343714521991437, + "learning_rate": 1.803408473884938e-05, + "loss": 0.3767, + "step": 1526 + }, + { + "epoch": 0.7297055134101905, + "grad_norm": 0.5713754536699026, + "learning_rate": 1.8030947676578198e-05, + "loss": 0.3829, + "step": 1527 + }, + { + "epoch": 0.7301833821157637, + "grad_norm": 0.5439449953744703, + "learning_rate": 1.802780838666379e-05, + "loss": 0.3883, + "step": 1528 + }, + { + "epoch": 0.7306612508213368, + "grad_norm": 1.4893821900130726, + "learning_rate": 1.8024666869976946e-05, + "loss": 0.3874, + "step": 1529 + }, + { + "epoch": 0.73113911952691, + "grad_norm": 0.5642566066926745, + "learning_rate": 1.8021523127389066e-05, + "loss": 0.3707, + "step": 1530 + }, + { + "epoch": 0.7316169882324831, + "grad_norm": 0.5131365637356557, + "learning_rate": 1.8018377159772163e-05, + "loss": 0.3842, + "step": 1531 + }, + { + "epoch": 0.7320948569380563, + "grad_norm": 0.6078967074642584, + "learning_rate": 1.801522896799888e-05, + "loss": 0.3802, + "step": 1532 + }, + { + "epoch": 0.7325727256436294, + "grad_norm": 0.5556930415295182, + "learning_rate": 1.801207855294247e-05, + "loss": 0.3974, + "step": 1533 + }, + { + "epoch": 0.7330505943492026, + "grad_norm": 0.5433320644721947, + "learning_rate": 1.8008925915476795e-05, + "loss": 0.3817, + "step": 1534 + }, + { + "epoch": 0.7335284630547757, + "grad_norm": 0.5725907910354026, + "learning_rate": 1.800577105647635e-05, + "loss": 0.3862, + "step": 1535 + }, + { + "epoch": 0.7340063317603488, + "grad_norm": 0.5275822555651795, + "learning_rate": 1.800261397681623e-05, + "loss": 0.3662, + "step": 1536 + }, + { + "epoch": 0.734484200465922, + "grad_norm": 0.5958868088113626, + "learning_rate": 1.799945467737216e-05, + "loss": 0.3861, + "step": 1537 + }, + { + "epoch": 0.7349620691714951, + "grad_norm": 0.528135241277979, + "learning_rate": 1.7996293159020468e-05, + "loss": 0.3633, + "step": 1538 + }, + { + "epoch": 0.7354399378770683, + "grad_norm": 0.5629034073505758, + "learning_rate": 1.799312942263811e-05, + "loss": 0.3752, + "step": 1539 + }, + { + "epoch": 0.7359178065826414, + "grad_norm": 0.5369379864183004, + "learning_rate": 1.7989963469102643e-05, + "loss": 0.3788, + "step": 1540 + }, + { + "epoch": 0.7363956752882146, + "grad_norm": 0.5181107364251343, + "learning_rate": 1.798679529929225e-05, + "loss": 0.3908, + "step": 1541 + }, + { + "epoch": 0.7368735439937877, + "grad_norm": 0.5889689370863749, + "learning_rate": 1.7983624914085726e-05, + "loss": 0.38, + "step": 1542 + }, + { + "epoch": 0.7373514126993609, + "grad_norm": 0.5087601460758195, + "learning_rate": 1.7980452314362482e-05, + "loss": 0.3816, + "step": 1543 + }, + { + "epoch": 0.737829281404934, + "grad_norm": 0.5940221012894452, + "learning_rate": 1.7977277501002538e-05, + "loss": 0.3637, + "step": 1544 + }, + { + "epoch": 0.7383071501105072, + "grad_norm": 0.5633247453068111, + "learning_rate": 1.797410047488653e-05, + "loss": 0.3969, + "step": 1545 + }, + { + "epoch": 0.7387850188160803, + "grad_norm": 0.5712343756297124, + "learning_rate": 1.797092123689571e-05, + "loss": 0.3811, + "step": 1546 + }, + { + "epoch": 0.7392628875216535, + "grad_norm": 0.7181347071469072, + "learning_rate": 1.7967739787911946e-05, + "loss": 0.3998, + "step": 1547 + }, + { + "epoch": 0.7397407562272266, + "grad_norm": 0.5637357459690837, + "learning_rate": 1.7964556128817713e-05, + "loss": 0.3927, + "step": 1548 + }, + { + "epoch": 0.7402186249327997, + "grad_norm": 0.5365699147490028, + "learning_rate": 1.79613702604961e-05, + "loss": 0.3888, + "step": 1549 + }, + { + "epoch": 0.7406964936383729, + "grad_norm": 0.5518595874098879, + "learning_rate": 1.7958182183830816e-05, + "loss": 0.3996, + "step": 1550 + }, + { + "epoch": 0.741174362343946, + "grad_norm": 0.5208148323047387, + "learning_rate": 1.795499189970617e-05, + "loss": 0.382, + "step": 1551 + }, + { + "epoch": 0.7416522310495192, + "grad_norm": 0.5007085516775489, + "learning_rate": 1.79517994090071e-05, + "loss": 0.4056, + "step": 1552 + }, + { + "epoch": 0.7421300997550923, + "grad_norm": 0.509836881110819, + "learning_rate": 1.794860471261914e-05, + "loss": 0.3774, + "step": 1553 + }, + { + "epoch": 0.7426079684606655, + "grad_norm": 0.5656747401049432, + "learning_rate": 1.794540781142844e-05, + "loss": 0.3795, + "step": 1554 + }, + { + "epoch": 0.7430858371662385, + "grad_norm": 0.5513113429693962, + "learning_rate": 1.794220870632177e-05, + "loss": 0.373, + "step": 1555 + }, + { + "epoch": 0.7435637058718118, + "grad_norm": 0.5180245500331098, + "learning_rate": 1.7939007398186507e-05, + "loss": 0.3891, + "step": 1556 + }, + { + "epoch": 0.7440415745773848, + "grad_norm": 0.5856101098872712, + "learning_rate": 1.793580388791063e-05, + "loss": 0.3906, + "step": 1557 + }, + { + "epoch": 0.744519443282958, + "grad_norm": 0.5597746606677304, + "learning_rate": 1.7932598176382735e-05, + "loss": 0.3768, + "step": 1558 + }, + { + "epoch": 0.7449973119885311, + "grad_norm": 0.5634426580173892, + "learning_rate": 1.792939026449204e-05, + "loss": 0.3708, + "step": 1559 + }, + { + "epoch": 0.7454751806941043, + "grad_norm": 0.5467128365442397, + "learning_rate": 1.7926180153128358e-05, + "loss": 0.3797, + "step": 1560 + }, + { + "epoch": 0.7459530493996774, + "grad_norm": 0.5203223948570378, + "learning_rate": 1.7922967843182113e-05, + "loss": 0.3872, + "step": 1561 + }, + { + "epoch": 0.7464309181052505, + "grad_norm": 0.5963679934080937, + "learning_rate": 1.7919753335544352e-05, + "loss": 0.3822, + "step": 1562 + }, + { + "epoch": 0.7469087868108237, + "grad_norm": 0.5959650909771645, + "learning_rate": 1.7916536631106714e-05, + "loss": 0.3906, + "step": 1563 + }, + { + "epoch": 0.7473866555163968, + "grad_norm": 0.5685204218306926, + "learning_rate": 1.7913317730761463e-05, + "loss": 0.3845, + "step": 1564 + }, + { + "epoch": 0.74786452422197, + "grad_norm": 0.6029950556254274, + "learning_rate": 1.791009663540146e-05, + "loss": 0.3794, + "step": 1565 + }, + { + "epoch": 0.7483423929275431, + "grad_norm": 0.5300367366403645, + "learning_rate": 1.790687334592018e-05, + "loss": 0.3836, + "step": 1566 + }, + { + "epoch": 0.7488202616331163, + "grad_norm": 0.5994658946073593, + "learning_rate": 1.790364786321171e-05, + "loss": 0.3881, + "step": 1567 + }, + { + "epoch": 0.7492981303386894, + "grad_norm": 0.5964406475861134, + "learning_rate": 1.7900420188170745e-05, + "loss": 0.3905, + "step": 1568 + }, + { + "epoch": 0.7497759990442626, + "grad_norm": 1.7504865632734914, + "learning_rate": 1.7897190321692578e-05, + "loss": 0.3996, + "step": 1569 + }, + { + "epoch": 0.7502538677498357, + "grad_norm": 0.5830761379235893, + "learning_rate": 1.789395826467312e-05, + "loss": 0.3859, + "step": 1570 + }, + { + "epoch": 0.7507317364554089, + "grad_norm": 0.5242952519103954, + "learning_rate": 1.7890724018008883e-05, + "loss": 0.3939, + "step": 1571 + }, + { + "epoch": 0.751209605160982, + "grad_norm": 0.543050764985698, + "learning_rate": 1.7887487582596995e-05, + "loss": 0.394, + "step": 1572 + }, + { + "epoch": 0.7516874738665552, + "grad_norm": 0.5259949143161994, + "learning_rate": 1.7884248959335186e-05, + "loss": 0.3858, + "step": 1573 + }, + { + "epoch": 0.7521653425721283, + "grad_norm": 0.5710757710219158, + "learning_rate": 1.7881008149121793e-05, + "loss": 0.3782, + "step": 1574 + }, + { + "epoch": 0.7526432112777014, + "grad_norm": 0.5437004375278933, + "learning_rate": 1.7877765152855757e-05, + "loss": 0.3744, + "step": 1575 + }, + { + "epoch": 0.7531210799832746, + "grad_norm": 0.5158132068664555, + "learning_rate": 1.7874519971436627e-05, + "loss": 0.378, + "step": 1576 + }, + { + "epoch": 0.7535989486888477, + "grad_norm": 0.5706843234145509, + "learning_rate": 1.787127260576456e-05, + "loss": 0.3843, + "step": 1577 + }, + { + "epoch": 0.7540768173944209, + "grad_norm": 0.5829854434809196, + "learning_rate": 1.7868023056740323e-05, + "loss": 0.383, + "step": 1578 + }, + { + "epoch": 0.754554686099994, + "grad_norm": 0.5491598281489602, + "learning_rate": 1.7864771325265276e-05, + "loss": 0.3843, + "step": 1579 + }, + { + "epoch": 0.7550325548055672, + "grad_norm": 0.5283360065954008, + "learning_rate": 1.78615174122414e-05, + "loss": 0.3867, + "step": 1580 + }, + { + "epoch": 0.7555104235111403, + "grad_norm": 0.5361665431481477, + "learning_rate": 1.7858261318571265e-05, + "loss": 0.3836, + "step": 1581 + }, + { + "epoch": 0.7559882922167135, + "grad_norm": 0.5773171737336307, + "learning_rate": 1.785500304515806e-05, + "loss": 0.4076, + "step": 1582 + }, + { + "epoch": 0.7564661609222866, + "grad_norm": 0.5639322568045663, + "learning_rate": 1.785174259290557e-05, + "loss": 0.3856, + "step": 1583 + }, + { + "epoch": 0.7569440296278598, + "grad_norm": 0.5659655900325912, + "learning_rate": 1.7848479962718183e-05, + "loss": 0.3686, + "step": 1584 + }, + { + "epoch": 0.7574218983334329, + "grad_norm": 0.6962795442570291, + "learning_rate": 1.78452151555009e-05, + "loss": 0.3881, + "step": 1585 + }, + { + "epoch": 0.7578997670390061, + "grad_norm": 0.5417577285230938, + "learning_rate": 1.7841948172159322e-05, + "loss": 0.3794, + "step": 1586 + }, + { + "epoch": 0.7583776357445792, + "grad_norm": 0.5211348547780497, + "learning_rate": 1.783867901359965e-05, + "loss": 0.388, + "step": 1587 + }, + { + "epoch": 0.7588555044501524, + "grad_norm": 0.5104365801361086, + "learning_rate": 1.7835407680728695e-05, + "loss": 0.3777, + "step": 1588 + }, + { + "epoch": 0.7593333731557255, + "grad_norm": 0.540783001344937, + "learning_rate": 1.783213417445386e-05, + "loss": 0.3805, + "step": 1589 + }, + { + "epoch": 0.7598112418612986, + "grad_norm": 0.5258232259033346, + "learning_rate": 1.7828858495683162e-05, + "loss": 0.3767, + "step": 1590 + }, + { + "epoch": 0.7602891105668718, + "grad_norm": 0.5222845479618173, + "learning_rate": 1.7825580645325217e-05, + "loss": 0.3866, + "step": 1591 + }, + { + "epoch": 0.7607669792724449, + "grad_norm": 0.5399407703301767, + "learning_rate": 1.7822300624289242e-05, + "loss": 0.3754, + "step": 1592 + }, + { + "epoch": 0.7612448479780181, + "grad_norm": 0.4996656893078933, + "learning_rate": 1.7819018433485055e-05, + "loss": 0.3792, + "step": 1593 + }, + { + "epoch": 0.7617227166835912, + "grad_norm": 0.5274635062067855, + "learning_rate": 1.7815734073823078e-05, + "loss": 0.3822, + "step": 1594 + }, + { + "epoch": 0.7622005853891644, + "grad_norm": 0.6480497541249455, + "learning_rate": 1.781244754621434e-05, + "loss": 0.3848, + "step": 1595 + }, + { + "epoch": 0.7626784540947374, + "grad_norm": 0.5526359451420565, + "learning_rate": 1.7809158851570463e-05, + "loss": 0.3725, + "step": 1596 + }, + { + "epoch": 0.7631563228003106, + "grad_norm": 0.49088307821668725, + "learning_rate": 1.780586799080367e-05, + "loss": 0.3823, + "step": 1597 + }, + { + "epoch": 0.7636341915058837, + "grad_norm": 0.5511921737669051, + "learning_rate": 1.780257496482679e-05, + "loss": 0.3808, + "step": 1598 + }, + { + "epoch": 0.764112060211457, + "grad_norm": 0.6775694510669721, + "learning_rate": 1.7799279774553246e-05, + "loss": 0.3802, + "step": 1599 + }, + { + "epoch": 0.76458992891703, + "grad_norm": 0.5310444266054842, + "learning_rate": 1.779598242089707e-05, + "loss": 0.3809, + "step": 1600 + }, + { + "epoch": 0.7650677976226032, + "grad_norm": 0.4885991078950156, + "learning_rate": 1.7792682904772894e-05, + "loss": 0.3891, + "step": 1601 + }, + { + "epoch": 0.7655456663281763, + "grad_norm": 0.7799415079754219, + "learning_rate": 1.7789381227095938e-05, + "loss": 0.4004, + "step": 1602 + }, + { + "epoch": 0.7660235350337494, + "grad_norm": 0.48613695514881117, + "learning_rate": 1.7786077388782034e-05, + "loss": 0.3995, + "step": 1603 + }, + { + "epoch": 0.7665014037393226, + "grad_norm": 0.5366975828343058, + "learning_rate": 1.77827713907476e-05, + "loss": 0.3757, + "step": 1604 + }, + { + "epoch": 0.7669792724448957, + "grad_norm": 0.5182970925432647, + "learning_rate": 1.7779463233909677e-05, + "loss": 0.3675, + "step": 1605 + }, + { + "epoch": 0.7674571411504689, + "grad_norm": 0.566425081373052, + "learning_rate": 1.7776152919185873e-05, + "loss": 0.3983, + "step": 1606 + }, + { + "epoch": 0.767935009856042, + "grad_norm": 0.5637827498293969, + "learning_rate": 1.777284044749442e-05, + "loss": 0.3732, + "step": 1607 + }, + { + "epoch": 0.7684128785616152, + "grad_norm": 0.6832393610992347, + "learning_rate": 1.7769525819754138e-05, + "loss": 0.3705, + "step": 1608 + }, + { + "epoch": 0.7688907472671883, + "grad_norm": 0.6312177052802135, + "learning_rate": 1.7766209036884447e-05, + "loss": 0.3672, + "step": 1609 + }, + { + "epoch": 0.7693686159727615, + "grad_norm": 0.5350057787652999, + "learning_rate": 1.7762890099805362e-05, + "loss": 0.3753, + "step": 1610 + }, + { + "epoch": 0.7698464846783346, + "grad_norm": 0.5139252783072269, + "learning_rate": 1.77595690094375e-05, + "loss": 0.3825, + "step": 1611 + }, + { + "epoch": 0.7703243533839078, + "grad_norm": 0.849473799017312, + "learning_rate": 1.7756245766702068e-05, + "loss": 0.3682, + "step": 1612 + }, + { + "epoch": 0.7708022220894809, + "grad_norm": 0.5132435684021898, + "learning_rate": 1.775292037252088e-05, + "loss": 0.3896, + "step": 1613 + }, + { + "epoch": 0.7712800907950541, + "grad_norm": 0.5945126305429924, + "learning_rate": 1.774959282781634e-05, + "loss": 0.3948, + "step": 1614 + }, + { + "epoch": 0.7717579595006272, + "grad_norm": 0.5332717557684908, + "learning_rate": 1.774626313351145e-05, + "loss": 0.3671, + "step": 1615 + }, + { + "epoch": 0.7722358282062003, + "grad_norm": 0.5596768760181904, + "learning_rate": 1.7742931290529804e-05, + "loss": 0.3519, + "step": 1616 + }, + { + "epoch": 0.7727136969117735, + "grad_norm": 0.801668548538205, + "learning_rate": 1.7739597299795606e-05, + "loss": 0.3759, + "step": 1617 + }, + { + "epoch": 0.7731915656173466, + "grad_norm": 0.5443956847817307, + "learning_rate": 1.773626116223364e-05, + "loss": 0.3701, + "step": 1618 + }, + { + "epoch": 0.7736694343229198, + "grad_norm": 0.5223788847180523, + "learning_rate": 1.7732922878769298e-05, + "loss": 0.3815, + "step": 1619 + }, + { + "epoch": 0.7741473030284929, + "grad_norm": 0.5153447344508111, + "learning_rate": 1.7729582450328547e-05, + "loss": 0.388, + "step": 1620 + }, + { + "epoch": 0.7746251717340661, + "grad_norm": 0.5399342464141584, + "learning_rate": 1.7726239877837977e-05, + "loss": 0.3771, + "step": 1621 + }, + { + "epoch": 0.7751030404396392, + "grad_norm": 0.5827130864299022, + "learning_rate": 1.7722895162224752e-05, + "loss": 0.3812, + "step": 1622 + }, + { + "epoch": 0.7755809091452124, + "grad_norm": 0.5264498651194451, + "learning_rate": 1.7719548304416638e-05, + "loss": 0.37, + "step": 1623 + }, + { + "epoch": 0.7760587778507855, + "grad_norm": 0.5525289832285949, + "learning_rate": 1.7716199305341993e-05, + "loss": 0.3641, + "step": 1624 + }, + { + "epoch": 0.7765366465563587, + "grad_norm": 0.6073361920730956, + "learning_rate": 1.771284816592978e-05, + "loss": 0.3751, + "step": 1625 + }, + { + "epoch": 0.7770145152619318, + "grad_norm": 0.5261064606183812, + "learning_rate": 1.770949488710953e-05, + "loss": 0.3831, + "step": 1626 + }, + { + "epoch": 0.777492383967505, + "grad_norm": 0.5346800888345036, + "learning_rate": 1.7706139469811395e-05, + "loss": 0.4004, + "step": 1627 + }, + { + "epoch": 0.7779702526730781, + "grad_norm": 0.585135599645527, + "learning_rate": 1.77027819149661e-05, + "loss": 0.3757, + "step": 1628 + }, + { + "epoch": 0.7784481213786513, + "grad_norm": 0.5412022951804295, + "learning_rate": 1.7699422223504983e-05, + "loss": 0.3903, + "step": 1629 + }, + { + "epoch": 0.7789259900842244, + "grad_norm": 0.5087555055370976, + "learning_rate": 1.7696060396359956e-05, + "loss": 0.3858, + "step": 1630 + }, + { + "epoch": 0.7794038587897975, + "grad_norm": 0.5392498385275896, + "learning_rate": 1.7692696434463527e-05, + "loss": 0.3908, + "step": 1631 + }, + { + "epoch": 0.7798817274953707, + "grad_norm": 0.5248886136108247, + "learning_rate": 1.768933033874881e-05, + "loss": 0.3818, + "step": 1632 + }, + { + "epoch": 0.7803595962009438, + "grad_norm": 0.6344637610053451, + "learning_rate": 1.768596211014949e-05, + "loss": 0.3705, + "step": 1633 + }, + { + "epoch": 0.780837464906517, + "grad_norm": 0.5079797518477398, + "learning_rate": 1.7682591749599856e-05, + "loss": 0.3876, + "step": 1634 + }, + { + "epoch": 0.78131533361209, + "grad_norm": 0.7193783569627261, + "learning_rate": 1.7679219258034798e-05, + "loss": 0.4143, + "step": 1635 + }, + { + "epoch": 0.7817932023176632, + "grad_norm": 0.5441633206776926, + "learning_rate": 1.767584463638977e-05, + "loss": 0.3819, + "step": 1636 + }, + { + "epoch": 0.7822710710232363, + "grad_norm": 0.5501870910998597, + "learning_rate": 1.767246788560084e-05, + "loss": 0.3675, + "step": 1637 + }, + { + "epoch": 0.7827489397288095, + "grad_norm": 1.1914603909692592, + "learning_rate": 1.766908900660466e-05, + "loss": 0.3664, + "step": 1638 + }, + { + "epoch": 0.7832268084343826, + "grad_norm": 0.5737236436900817, + "learning_rate": 1.7665708000338472e-05, + "loss": 0.3912, + "step": 1639 + }, + { + "epoch": 0.7837046771399558, + "grad_norm": 0.5111903213622228, + "learning_rate": 1.7662324867740102e-05, + "loss": 0.3729, + "step": 1640 + }, + { + "epoch": 0.7841825458455289, + "grad_norm": 0.5448282748052444, + "learning_rate": 1.7658939609747978e-05, + "loss": 0.3828, + "step": 1641 + }, + { + "epoch": 0.7846604145511021, + "grad_norm": 0.5536887303747314, + "learning_rate": 1.7655552227301105e-05, + "loss": 0.3734, + "step": 1642 + }, + { + "epoch": 0.7851382832566752, + "grad_norm": 0.5055836319468296, + "learning_rate": 1.7652162721339085e-05, + "loss": 0.3666, + "step": 1643 + }, + { + "epoch": 0.7856161519622483, + "grad_norm": 0.588666775287373, + "learning_rate": 1.764877109280211e-05, + "loss": 0.3917, + "step": 1644 + }, + { + "epoch": 0.7860940206678215, + "grad_norm": 0.5912101719269106, + "learning_rate": 1.7645377342630956e-05, + "loss": 0.3778, + "step": 1645 + }, + { + "epoch": 0.7865718893733946, + "grad_norm": 0.5355115123869113, + "learning_rate": 1.764198147176699e-05, + "loss": 0.3786, + "step": 1646 + }, + { + "epoch": 0.7870497580789678, + "grad_norm": 0.5273687503114382, + "learning_rate": 1.7638583481152164e-05, + "loss": 0.375, + "step": 1647 + }, + { + "epoch": 0.7875276267845409, + "grad_norm": 0.6354283594908688, + "learning_rate": 1.7635183371729022e-05, + "loss": 0.393, + "step": 1648 + }, + { + "epoch": 0.7880054954901141, + "grad_norm": 0.5268426064765231, + "learning_rate": 1.7631781144440697e-05, + "loss": 0.3834, + "step": 1649 + }, + { + "epoch": 0.7884833641956872, + "grad_norm": 0.5851405613750261, + "learning_rate": 1.76283768002309e-05, + "loss": 0.369, + "step": 1650 + }, + { + "epoch": 0.7889612329012604, + "grad_norm": 0.5514500060660226, + "learning_rate": 1.7624970340043948e-05, + "loss": 0.4001, + "step": 1651 + }, + { + "epoch": 0.7894391016068335, + "grad_norm": 0.5303902826647586, + "learning_rate": 1.7621561764824724e-05, + "loss": 0.383, + "step": 1652 + }, + { + "epoch": 0.7899169703124067, + "grad_norm": 0.5956286874077665, + "learning_rate": 1.7618151075518706e-05, + "loss": 0.3741, + "step": 1653 + }, + { + "epoch": 0.7903948390179798, + "grad_norm": 0.6017706875819347, + "learning_rate": 1.7614738273071963e-05, + "loss": 0.3778, + "step": 1654 + }, + { + "epoch": 0.790872707723553, + "grad_norm": 0.5218421728887426, + "learning_rate": 1.7611323358431145e-05, + "loss": 0.3719, + "step": 1655 + }, + { + "epoch": 0.7913505764291261, + "grad_norm": 0.5695441720555854, + "learning_rate": 1.7607906332543486e-05, + "loss": 0.394, + "step": 1656 + }, + { + "epoch": 0.7918284451346992, + "grad_norm": 0.5713620867531857, + "learning_rate": 1.7604487196356818e-05, + "loss": 0.3932, + "step": 1657 + }, + { + "epoch": 0.7923063138402724, + "grad_norm": 0.5142635919293859, + "learning_rate": 1.7601065950819536e-05, + "loss": 0.393, + "step": 1658 + }, + { + "epoch": 0.7927841825458455, + "grad_norm": 0.5563424936264186, + "learning_rate": 1.7597642596880642e-05, + "loss": 0.365, + "step": 1659 + }, + { + "epoch": 0.7932620512514187, + "grad_norm": 0.5538505182272977, + "learning_rate": 1.759421713548971e-05, + "loss": 0.3783, + "step": 1660 + }, + { + "epoch": 0.7937399199569918, + "grad_norm": 0.5142957210677673, + "learning_rate": 1.7590789567596908e-05, + "loss": 0.3953, + "step": 1661 + }, + { + "epoch": 0.794217788662565, + "grad_norm": 0.5688406081526746, + "learning_rate": 1.758735989415298e-05, + "loss": 0.3934, + "step": 1662 + }, + { + "epoch": 0.7946956573681381, + "grad_norm": 0.5009105683579143, + "learning_rate": 1.758392811610925e-05, + "loss": 0.3844, + "step": 1663 + }, + { + "epoch": 0.7951735260737113, + "grad_norm": 0.5603598984766094, + "learning_rate": 1.7580494234417645e-05, + "loss": 0.3952, + "step": 1664 + }, + { + "epoch": 0.7956513947792844, + "grad_norm": 0.5385081611958358, + "learning_rate": 1.757705825003065e-05, + "loss": 0.3791, + "step": 1665 + }, + { + "epoch": 0.7961292634848576, + "grad_norm": 0.550224901020255, + "learning_rate": 1.7573620163901362e-05, + "loss": 0.3723, + "step": 1666 + }, + { + "epoch": 0.7966071321904307, + "grad_norm": 0.6130708493238943, + "learning_rate": 1.7570179976983433e-05, + "loss": 0.3807, + "step": 1667 + }, + { + "epoch": 0.7970850008960039, + "grad_norm": 0.5930943169087218, + "learning_rate": 1.7566737690231113e-05, + "loss": 0.3436, + "step": 1668 + }, + { + "epoch": 0.797562869601577, + "grad_norm": 0.5666432561035308, + "learning_rate": 1.756329330459923e-05, + "loss": 0.3721, + "step": 1669 + }, + { + "epoch": 0.7980407383071502, + "grad_norm": 0.5599329725398824, + "learning_rate": 1.7559846821043205e-05, + "loss": 0.3896, + "step": 1670 + }, + { + "epoch": 0.7985186070127233, + "grad_norm": 0.520091169063164, + "learning_rate": 1.7556398240519025e-05, + "loss": 0.3766, + "step": 1671 + }, + { + "epoch": 0.7989964757182964, + "grad_norm": 0.5987277410784928, + "learning_rate": 1.755294756398326e-05, + "loss": 0.3887, + "step": 1672 + }, + { + "epoch": 0.7994743444238696, + "grad_norm": 0.4945539006204766, + "learning_rate": 1.7549494792393077e-05, + "loss": 0.3861, + "step": 1673 + }, + { + "epoch": 0.7999522131294426, + "grad_norm": 0.536193701750873, + "learning_rate": 1.7546039926706206e-05, + "loss": 0.3772, + "step": 1674 + }, + { + "epoch": 0.8004300818350158, + "grad_norm": 0.5264371388469762, + "learning_rate": 1.754258296788097e-05, + "loss": 0.3936, + "step": 1675 + }, + { + "epoch": 0.8009079505405889, + "grad_norm": 0.49365045426457704, + "learning_rate": 1.753912391687627e-05, + "loss": 0.3854, + "step": 1676 + }, + { + "epoch": 0.8013858192461621, + "grad_norm": 0.572847244678029, + "learning_rate": 1.753566277465158e-05, + "loss": 0.3892, + "step": 1677 + }, + { + "epoch": 0.8018636879517352, + "grad_norm": 0.5167500232282058, + "learning_rate": 1.7532199542166967e-05, + "loss": 0.3847, + "step": 1678 + }, + { + "epoch": 0.8023415566573084, + "grad_norm": 0.5616325554937263, + "learning_rate": 1.7528734220383065e-05, + "loss": 0.3672, + "step": 1679 + }, + { + "epoch": 0.8028194253628815, + "grad_norm": 0.5320597666482352, + "learning_rate": 1.7525266810261096e-05, + "loss": 0.387, + "step": 1680 + }, + { + "epoch": 0.8032972940684547, + "grad_norm": 0.5555593972083235, + "learning_rate": 1.7521797312762854e-05, + "loss": 0.3973, + "step": 1681 + }, + { + "epoch": 0.8037751627740278, + "grad_norm": 0.5880958847721388, + "learning_rate": 1.7518325728850722e-05, + "loss": 0.3683, + "step": 1682 + }, + { + "epoch": 0.804253031479601, + "grad_norm": 0.5565593460383242, + "learning_rate": 1.7514852059487652e-05, + "loss": 0.3751, + "step": 1683 + }, + { + "epoch": 0.8047309001851741, + "grad_norm": 0.5268326213678646, + "learning_rate": 1.7511376305637183e-05, + "loss": 0.3874, + "step": 1684 + }, + { + "epoch": 0.8052087688907472, + "grad_norm": 0.5685440803479197, + "learning_rate": 1.7507898468263422e-05, + "loss": 0.367, + "step": 1685 + }, + { + "epoch": 0.8056866375963204, + "grad_norm": 0.5404765320360659, + "learning_rate": 1.7504418548331065e-05, + "loss": 0.3781, + "step": 1686 + }, + { + "epoch": 0.8061645063018935, + "grad_norm": 0.5458851473805669, + "learning_rate": 1.750093654680538e-05, + "loss": 0.3834, + "step": 1687 + }, + { + "epoch": 0.8066423750074667, + "grad_norm": 0.53472908598489, + "learning_rate": 1.7497452464652207e-05, + "loss": 0.3931, + "step": 1688 + }, + { + "epoch": 0.8071202437130398, + "grad_norm": 0.5339306448364256, + "learning_rate": 1.7493966302837978e-05, + "loss": 0.3765, + "step": 1689 + }, + { + "epoch": 0.807598112418613, + "grad_norm": 0.5452629105131837, + "learning_rate": 1.7490478062329686e-05, + "loss": 0.3854, + "step": 1690 + }, + { + "epoch": 0.8080759811241861, + "grad_norm": 0.5007685109159807, + "learning_rate": 1.7486987744094905e-05, + "loss": 0.3822, + "step": 1691 + }, + { + "epoch": 0.8085538498297593, + "grad_norm": 0.5114501233435106, + "learning_rate": 1.74834953491018e-05, + "loss": 0.3751, + "step": 1692 + }, + { + "epoch": 0.8090317185353324, + "grad_norm": 0.5141980507523353, + "learning_rate": 1.7480000878319084e-05, + "loss": 0.3684, + "step": 1693 + }, + { + "epoch": 0.8095095872409056, + "grad_norm": 0.5034630274486521, + "learning_rate": 1.7476504332716072e-05, + "loss": 0.3758, + "step": 1694 + }, + { + "epoch": 0.8099874559464787, + "grad_norm": 0.5387147348927491, + "learning_rate": 1.7473005713262644e-05, + "loss": 0.37, + "step": 1695 + }, + { + "epoch": 0.8104653246520519, + "grad_norm": 0.5442093019130165, + "learning_rate": 1.7469505020929252e-05, + "loss": 0.3686, + "step": 1696 + }, + { + "epoch": 0.810943193357625, + "grad_norm": 0.4947860144971616, + "learning_rate": 1.7466002256686925e-05, + "loss": 0.3813, + "step": 1697 + }, + { + "epoch": 0.8114210620631981, + "grad_norm": 0.5537428519276251, + "learning_rate": 1.746249742150727e-05, + "loss": 0.3845, + "step": 1698 + }, + { + "epoch": 0.8118989307687713, + "grad_norm": 0.5074492556160554, + "learning_rate": 1.7458990516362468e-05, + "loss": 0.3706, + "step": 1699 + }, + { + "epoch": 0.8123767994743444, + "grad_norm": 0.5464251538547661, + "learning_rate": 1.7455481542225272e-05, + "loss": 0.3771, + "step": 1700 + }, + { + "epoch": 0.8128546681799176, + "grad_norm": 0.5386465641293379, + "learning_rate": 1.7451970500069007e-05, + "loss": 0.3802, + "step": 1701 + }, + { + "epoch": 0.8133325368854907, + "grad_norm": 0.6455115743390833, + "learning_rate": 1.7448457390867575e-05, + "loss": 0.3586, + "step": 1702 + }, + { + "epoch": 0.8138104055910639, + "grad_norm": 0.5305807234406511, + "learning_rate": 1.744494221559545e-05, + "loss": 0.3918, + "step": 1703 + }, + { + "epoch": 0.814288274296637, + "grad_norm": 0.5536892586879414, + "learning_rate": 1.7441424975227685e-05, + "loss": 0.3853, + "step": 1704 + }, + { + "epoch": 0.8147661430022102, + "grad_norm": 0.5262189394069378, + "learning_rate": 1.7437905670739893e-05, + "loss": 0.3738, + "step": 1705 + }, + { + "epoch": 0.8152440117077833, + "grad_norm": 0.5550027476074038, + "learning_rate": 1.7434384303108273e-05, + "loss": 0.3778, + "step": 1706 + }, + { + "epoch": 0.8157218804133565, + "grad_norm": 0.5346890732228916, + "learning_rate": 1.7430860873309586e-05, + "loss": 0.3702, + "step": 1707 + }, + { + "epoch": 0.8161997491189296, + "grad_norm": 0.5437600270856643, + "learning_rate": 1.7427335382321173e-05, + "loss": 0.3684, + "step": 1708 + }, + { + "epoch": 0.8166776178245028, + "grad_norm": 0.6463467252286066, + "learning_rate": 1.742380783112094e-05, + "loss": 0.383, + "step": 1709 + }, + { + "epoch": 0.8171554865300759, + "grad_norm": 0.5461358713423635, + "learning_rate": 1.7420278220687366e-05, + "loss": 0.3672, + "step": 1710 + }, + { + "epoch": 0.817633355235649, + "grad_norm": 0.5179296288711173, + "learning_rate": 1.7416746551999504e-05, + "loss": 0.3631, + "step": 1711 + }, + { + "epoch": 0.8181112239412222, + "grad_norm": 0.62856546922575, + "learning_rate": 1.741321282603698e-05, + "loss": 0.3587, + "step": 1712 + }, + { + "epoch": 0.8185890926467952, + "grad_norm": 0.5022602921463213, + "learning_rate": 1.7409677043779986e-05, + "loss": 0.3827, + "step": 1713 + }, + { + "epoch": 0.8190669613523685, + "grad_norm": 0.5287967693920427, + "learning_rate": 1.7406139206209283e-05, + "loss": 0.3813, + "step": 1714 + }, + { + "epoch": 0.8195448300579415, + "grad_norm": 0.5110900484564262, + "learning_rate": 1.7402599314306207e-05, + "loss": 0.3874, + "step": 1715 + }, + { + "epoch": 0.8200226987635147, + "grad_norm": 0.5781567849028013, + "learning_rate": 1.739905736905266e-05, + "loss": 0.3567, + "step": 1716 + }, + { + "epoch": 0.8205005674690878, + "grad_norm": 0.49833085488616846, + "learning_rate": 1.739551337143112e-05, + "loss": 0.3574, + "step": 1717 + }, + { + "epoch": 0.820978436174661, + "grad_norm": 0.5263629142124272, + "learning_rate": 1.739196732242462e-05, + "loss": 0.3899, + "step": 1718 + }, + { + "epoch": 0.8214563048802341, + "grad_norm": 0.5022935161884177, + "learning_rate": 1.7388419223016778e-05, + "loss": 0.3576, + "step": 1719 + }, + { + "epoch": 0.8219341735858073, + "grad_norm": 0.5254848193160996, + "learning_rate": 1.7384869074191777e-05, + "loss": 0.3779, + "step": 1720 + }, + { + "epoch": 0.8224120422913804, + "grad_norm": 0.5213123941213047, + "learning_rate": 1.738131687693436e-05, + "loss": 0.3586, + "step": 1721 + }, + { + "epoch": 0.8228899109969536, + "grad_norm": 0.5156498268841546, + "learning_rate": 1.737776263222984e-05, + "loss": 0.3724, + "step": 1722 + }, + { + "epoch": 0.8233677797025267, + "grad_norm": 0.5704476991648906, + "learning_rate": 1.737420634106411e-05, + "loss": 0.3705, + "step": 1723 + }, + { + "epoch": 0.8238456484080999, + "grad_norm": 0.5546790809955634, + "learning_rate": 1.7370648004423623e-05, + "loss": 0.3704, + "step": 1724 + }, + { + "epoch": 0.824323517113673, + "grad_norm": 0.618179020385375, + "learning_rate": 1.7367087623295394e-05, + "loss": 0.3723, + "step": 1725 + }, + { + "epoch": 0.8248013858192461, + "grad_norm": 0.5663350618454257, + "learning_rate": 1.7363525198667013e-05, + "loss": 0.3964, + "step": 1726 + }, + { + "epoch": 0.8252792545248193, + "grad_norm": 0.5530576617160258, + "learning_rate": 1.735996073152663e-05, + "loss": 0.3638, + "step": 1727 + }, + { + "epoch": 0.8257571232303924, + "grad_norm": 0.5518889071407768, + "learning_rate": 1.7356394222862966e-05, + "loss": 0.3777, + "step": 1728 + }, + { + "epoch": 0.8262349919359656, + "grad_norm": 0.5219424160449163, + "learning_rate": 1.7352825673665313e-05, + "loss": 0.3773, + "step": 1729 + }, + { + "epoch": 0.8267128606415387, + "grad_norm": 0.5416629798429636, + "learning_rate": 1.7349255084923517e-05, + "loss": 0.3728, + "step": 1730 + }, + { + "epoch": 0.8271907293471119, + "grad_norm": 0.542222397838335, + "learning_rate": 1.7345682457627998e-05, + "loss": 0.3846, + "step": 1731 + }, + { + "epoch": 0.827668598052685, + "grad_norm": 0.5564300713817316, + "learning_rate": 1.7342107792769747e-05, + "loss": 0.3854, + "step": 1732 + }, + { + "epoch": 0.8281464667582582, + "grad_norm": 0.5047965719419824, + "learning_rate": 1.7338531091340304e-05, + "loss": 0.3684, + "step": 1733 + }, + { + "epoch": 0.8286243354638313, + "grad_norm": 0.5525238950280134, + "learning_rate": 1.7334952354331783e-05, + "loss": 0.3604, + "step": 1734 + }, + { + "epoch": 0.8291022041694045, + "grad_norm": 0.5386457884764131, + "learning_rate": 1.7331371582736864e-05, + "loss": 0.382, + "step": 1735 + }, + { + "epoch": 0.8295800728749776, + "grad_norm": 0.5721485678679854, + "learning_rate": 1.7327788777548796e-05, + "loss": 0.3591, + "step": 1736 + }, + { + "epoch": 0.8300579415805508, + "grad_norm": 0.9233246568993144, + "learning_rate": 1.732420393976138e-05, + "loss": 0.3704, + "step": 1737 + }, + { + "epoch": 0.8305358102861239, + "grad_norm": 0.5345664368757062, + "learning_rate": 1.7320617070368985e-05, + "loss": 0.3895, + "step": 1738 + }, + { + "epoch": 0.831013678991697, + "grad_norm": 0.5011860668006918, + "learning_rate": 1.731702817036655e-05, + "loss": 0.3743, + "step": 1739 + }, + { + "epoch": 0.8314915476972702, + "grad_norm": 0.49769794527863426, + "learning_rate": 1.731343724074957e-05, + "loss": 0.3729, + "step": 1740 + }, + { + "epoch": 0.8319694164028433, + "grad_norm": 0.4910631974353203, + "learning_rate": 1.730984428251411e-05, + "loss": 0.3775, + "step": 1741 + }, + { + "epoch": 0.8324472851084165, + "grad_norm": 0.4981131547731618, + "learning_rate": 1.7306249296656784e-05, + "loss": 0.3819, + "step": 1742 + }, + { + "epoch": 0.8329251538139896, + "grad_norm": 0.5176839501122841, + "learning_rate": 1.7302652284174785e-05, + "loss": 0.3719, + "step": 1743 + }, + { + "epoch": 0.8334030225195628, + "grad_norm": 0.5152962449518306, + "learning_rate": 1.729905324606586e-05, + "loss": 0.3804, + "step": 1744 + }, + { + "epoch": 0.8338808912251359, + "grad_norm": 0.5558715961134091, + "learning_rate": 1.7295452183328317e-05, + "loss": 0.3828, + "step": 1745 + }, + { + "epoch": 0.8343587599307091, + "grad_norm": 0.5422705807257295, + "learning_rate": 1.7291849096961027e-05, + "loss": 0.3728, + "step": 1746 + }, + { + "epoch": 0.8348366286362822, + "grad_norm": 0.5132286453110108, + "learning_rate": 1.7288243987963423e-05, + "loss": 0.3915, + "step": 1747 + }, + { + "epoch": 0.8353144973418554, + "grad_norm": 0.5896569071524325, + "learning_rate": 1.7284636857335503e-05, + "loss": 0.3721, + "step": 1748 + }, + { + "epoch": 0.8357923660474285, + "grad_norm": 0.5315131515573899, + "learning_rate": 1.7281027706077815e-05, + "loss": 0.3916, + "step": 1749 + }, + { + "epoch": 0.8362702347530017, + "grad_norm": 0.5270182046627034, + "learning_rate": 1.7277416535191478e-05, + "loss": 0.3867, + "step": 1750 + }, + { + "epoch": 0.8367481034585748, + "grad_norm": 0.5016720673015266, + "learning_rate": 1.7273803345678163e-05, + "loss": 0.3793, + "step": 1751 + }, + { + "epoch": 0.8372259721641478, + "grad_norm": 0.538715167203939, + "learning_rate": 1.7270188138540106e-05, + "loss": 0.3835, + "step": 1752 + }, + { + "epoch": 0.837703840869721, + "grad_norm": 0.5152936944221022, + "learning_rate": 1.726657091478011e-05, + "loss": 0.3848, + "step": 1753 + }, + { + "epoch": 0.8381817095752941, + "grad_norm": 0.5144600147503787, + "learning_rate": 1.7262951675401517e-05, + "loss": 0.375, + "step": 1754 + }, + { + "epoch": 0.8386595782808673, + "grad_norm": 0.5789439256338922, + "learning_rate": 1.7259330421408247e-05, + "loss": 0.3686, + "step": 1755 + }, + { + "epoch": 0.8391374469864404, + "grad_norm": 0.5385957955102172, + "learning_rate": 1.7255707153804772e-05, + "loss": 0.3666, + "step": 1756 + }, + { + "epoch": 0.8396153156920136, + "grad_norm": 0.6176394509383132, + "learning_rate": 1.725208187359612e-05, + "loss": 0.3837, + "step": 1757 + }, + { + "epoch": 0.8400931843975867, + "grad_norm": 0.5271166709229913, + "learning_rate": 1.724845458178788e-05, + "loss": 0.3616, + "step": 1758 + }, + { + "epoch": 0.8405710531031599, + "grad_norm": 0.5785356635347916, + "learning_rate": 1.72448252793862e-05, + "loss": 0.3735, + "step": 1759 + }, + { + "epoch": 0.841048921808733, + "grad_norm": 0.5396169495403562, + "learning_rate": 1.7241193967397784e-05, + "loss": 0.3863, + "step": 1760 + }, + { + "epoch": 0.8415267905143062, + "grad_norm": 0.5968305743269035, + "learning_rate": 1.7237560646829893e-05, + "loss": 0.3844, + "step": 1761 + }, + { + "epoch": 0.8420046592198793, + "grad_norm": 0.5739568058873602, + "learning_rate": 1.723392531869035e-05, + "loss": 0.3623, + "step": 1762 + }, + { + "epoch": 0.8424825279254525, + "grad_norm": 0.5245097908307391, + "learning_rate": 1.7230287983987524e-05, + "loss": 0.3704, + "step": 1763 + }, + { + "epoch": 0.8429603966310256, + "grad_norm": 0.5908273151839591, + "learning_rate": 1.722664864373035e-05, + "loss": 0.3819, + "step": 1764 + }, + { + "epoch": 0.8434382653365988, + "grad_norm": 0.5155637558679567, + "learning_rate": 1.7223007298928322e-05, + "loss": 0.3592, + "step": 1765 + }, + { + "epoch": 0.8439161340421719, + "grad_norm": 0.5175194051496753, + "learning_rate": 1.7219363950591482e-05, + "loss": 0.3819, + "step": 1766 + }, + { + "epoch": 0.844394002747745, + "grad_norm": 0.5743671321894646, + "learning_rate": 1.7215718599730427e-05, + "loss": 0.3766, + "step": 1767 + }, + { + "epoch": 0.8448718714533182, + "grad_norm": 0.501602630302012, + "learning_rate": 1.7212071247356316e-05, + "loss": 0.3879, + "step": 1768 + }, + { + "epoch": 0.8453497401588913, + "grad_norm": 0.49054416633074915, + "learning_rate": 1.720842189448086e-05, + "loss": 0.385, + "step": 1769 + }, + { + "epoch": 0.8458276088644645, + "grad_norm": 0.5155620766796909, + "learning_rate": 1.7204770542116326e-05, + "loss": 0.3691, + "step": 1770 + }, + { + "epoch": 0.8463054775700376, + "grad_norm": 0.7473764351264286, + "learning_rate": 1.720111719127553e-05, + "loss": 0.3691, + "step": 1771 + }, + { + "epoch": 0.8467833462756108, + "grad_norm": 0.5182869320161151, + "learning_rate": 1.7197461842971854e-05, + "loss": 0.3742, + "step": 1772 + }, + { + "epoch": 0.8472612149811839, + "grad_norm": 0.539699951292783, + "learning_rate": 1.7193804498219222e-05, + "loss": 0.3665, + "step": 1773 + }, + { + "epoch": 0.8477390836867571, + "grad_norm": 0.49195363335336606, + "learning_rate": 1.719014515803212e-05, + "loss": 0.3781, + "step": 1774 + }, + { + "epoch": 0.8482169523923302, + "grad_norm": 0.5107276989908707, + "learning_rate": 1.7186483823425582e-05, + "loss": 0.3844, + "step": 1775 + }, + { + "epoch": 0.8486948210979034, + "grad_norm": 0.5454810223277834, + "learning_rate": 1.7182820495415197e-05, + "loss": 0.3615, + "step": 1776 + }, + { + "epoch": 0.8491726898034765, + "grad_norm": 0.5294996389762363, + "learning_rate": 1.7179155175017115e-05, + "loss": 0.3734, + "step": 1777 + }, + { + "epoch": 0.8496505585090497, + "grad_norm": 0.5499916031625075, + "learning_rate": 1.717548786324802e-05, + "loss": 0.3849, + "step": 1778 + }, + { + "epoch": 0.8501284272146228, + "grad_norm": 0.4973577029191277, + "learning_rate": 1.7171818561125168e-05, + "loss": 0.3731, + "step": 1779 + }, + { + "epoch": 0.8506062959201959, + "grad_norm": 0.5851851900260322, + "learning_rate": 1.7168147269666357e-05, + "loss": 0.3799, + "step": 1780 + }, + { + "epoch": 0.8510841646257691, + "grad_norm": 0.4822034404377725, + "learning_rate": 1.7164473989889937e-05, + "loss": 0.3693, + "step": 1781 + }, + { + "epoch": 0.8515620333313422, + "grad_norm": 1.2985086241604744, + "learning_rate": 1.7160798722814808e-05, + "loss": 0.3677, + "step": 1782 + }, + { + "epoch": 0.8520399020369154, + "grad_norm": 0.5822593257906487, + "learning_rate": 1.7157121469460428e-05, + "loss": 0.3575, + "step": 1783 + }, + { + "epoch": 0.8525177707424885, + "grad_norm": 0.6086802336624687, + "learning_rate": 1.7153442230846808e-05, + "loss": 0.3706, + "step": 1784 + }, + { + "epoch": 0.8529956394480617, + "grad_norm": 0.5605480795542676, + "learning_rate": 1.714976100799449e-05, + "loss": 0.3898, + "step": 1785 + }, + { + "epoch": 0.8534735081536348, + "grad_norm": 0.4972956581461137, + "learning_rate": 1.7146077801924593e-05, + "loss": 0.3694, + "step": 1786 + }, + { + "epoch": 0.853951376859208, + "grad_norm": 0.5805310199986851, + "learning_rate": 1.7142392613658764e-05, + "loss": 0.3718, + "step": 1787 + }, + { + "epoch": 0.8544292455647811, + "grad_norm": 0.5287792910272413, + "learning_rate": 1.7138705444219215e-05, + "loss": 0.3714, + "step": 1788 + }, + { + "epoch": 0.8549071142703543, + "grad_norm": 0.5101894702630763, + "learning_rate": 1.7135016294628703e-05, + "loss": 0.3621, + "step": 1789 + }, + { + "epoch": 0.8553849829759274, + "grad_norm": 0.6079137123999032, + "learning_rate": 1.713132516591053e-05, + "loss": 0.3794, + "step": 1790 + }, + { + "epoch": 0.8558628516815006, + "grad_norm": 0.5669666874135264, + "learning_rate": 1.7127632059088547e-05, + "loss": 0.3777, + "step": 1791 + }, + { + "epoch": 0.8563407203870737, + "grad_norm": 0.545562892761849, + "learning_rate": 1.7123936975187164e-05, + "loss": 0.3797, + "step": 1792 + }, + { + "epoch": 0.8568185890926467, + "grad_norm": 0.6098893498728746, + "learning_rate": 1.7120239915231326e-05, + "loss": 0.3689, + "step": 1793 + }, + { + "epoch": 0.85729645779822, + "grad_norm": 0.5161732997271268, + "learning_rate": 1.7116540880246536e-05, + "loss": 0.3894, + "step": 1794 + }, + { + "epoch": 0.857774326503793, + "grad_norm": 0.5403314394420677, + "learning_rate": 1.7112839871258838e-05, + "loss": 0.3683, + "step": 1795 + }, + { + "epoch": 0.8582521952093662, + "grad_norm": 0.5784670620134145, + "learning_rate": 1.710913688929483e-05, + "loss": 0.3815, + "step": 1796 + }, + { + "epoch": 0.8587300639149393, + "grad_norm": 0.5025205431269636, + "learning_rate": 1.710543193538165e-05, + "loss": 0.3879, + "step": 1797 + }, + { + "epoch": 0.8592079326205125, + "grad_norm": 0.5892397369469627, + "learning_rate": 1.7101725010546988e-05, + "loss": 0.3956, + "step": 1798 + }, + { + "epoch": 0.8596858013260856, + "grad_norm": 0.47462809973773024, + "learning_rate": 1.7098016115819082e-05, + "loss": 0.3783, + "step": 1799 + }, + { + "epoch": 0.8601636700316588, + "grad_norm": 0.5288631373582081, + "learning_rate": 1.7094305252226713e-05, + "loss": 0.3958, + "step": 1800 + }, + { + "epoch": 0.8606415387372319, + "grad_norm": 0.530585661371997, + "learning_rate": 1.7090592420799206e-05, + "loss": 0.3794, + "step": 1801 + }, + { + "epoch": 0.8611194074428051, + "grad_norm": 0.5288225495006263, + "learning_rate": 1.708687762256644e-05, + "loss": 0.3562, + "step": 1802 + }, + { + "epoch": 0.8615972761483782, + "grad_norm": 0.5052081248332658, + "learning_rate": 1.708316085855883e-05, + "loss": 0.3788, + "step": 1803 + }, + { + "epoch": 0.8620751448539514, + "grad_norm": 0.5037294594905739, + "learning_rate": 1.7079442129807345e-05, + "loss": 0.3726, + "step": 1804 + }, + { + "epoch": 0.8625530135595245, + "grad_norm": 0.49391286749648405, + "learning_rate": 1.7075721437343488e-05, + "loss": 0.3716, + "step": 1805 + }, + { + "epoch": 0.8630308822650976, + "grad_norm": 0.4971338729755265, + "learning_rate": 1.707199878219932e-05, + "loss": 0.3703, + "step": 1806 + }, + { + "epoch": 0.8635087509706708, + "grad_norm": 0.5233353158987447, + "learning_rate": 1.7068274165407438e-05, + "loss": 0.3863, + "step": 1807 + }, + { + "epoch": 0.8639866196762439, + "grad_norm": 0.5533231164405442, + "learning_rate": 1.706454758800099e-05, + "loss": 0.3771, + "step": 1808 + }, + { + "epoch": 0.8644644883818171, + "grad_norm": 0.5368308793227745, + "learning_rate": 1.706081905101365e-05, + "loss": 0.3606, + "step": 1809 + }, + { + "epoch": 0.8649423570873902, + "grad_norm": 0.5642371259255226, + "learning_rate": 1.705708855547966e-05, + "loss": 0.3536, + "step": 1810 + }, + { + "epoch": 0.8654202257929634, + "grad_norm": 0.5545194035776633, + "learning_rate": 1.7053356102433786e-05, + "loss": 0.3738, + "step": 1811 + }, + { + "epoch": 0.8658980944985365, + "grad_norm": 0.66942068698917, + "learning_rate": 1.704962169291135e-05, + "loss": 0.3773, + "step": 1812 + }, + { + "epoch": 0.8663759632041097, + "grad_norm": 0.5154302230933852, + "learning_rate": 1.704588532794821e-05, + "loss": 0.3703, + "step": 1813 + }, + { + "epoch": 0.8668538319096828, + "grad_norm": 0.5317590606844941, + "learning_rate": 1.7042147008580768e-05, + "loss": 0.3655, + "step": 1814 + }, + { + "epoch": 0.867331700615256, + "grad_norm": 0.5285596576650338, + "learning_rate": 1.7038406735845967e-05, + "loss": 0.3862, + "step": 1815 + }, + { + "epoch": 0.8678095693208291, + "grad_norm": 0.556947670099024, + "learning_rate": 1.7034664510781294e-05, + "loss": 0.3494, + "step": 1816 + }, + { + "epoch": 0.8682874380264023, + "grad_norm": 0.5007734114771821, + "learning_rate": 1.7030920334424774e-05, + "loss": 0.3784, + "step": 1817 + }, + { + "epoch": 0.8687653067319754, + "grad_norm": 0.5306943675714777, + "learning_rate": 1.7027174207814977e-05, + "loss": 0.3735, + "step": 1818 + }, + { + "epoch": 0.8692431754375486, + "grad_norm": 0.49311406820837095, + "learning_rate": 1.7023426131991008e-05, + "loss": 0.3745, + "step": 1819 + }, + { + "epoch": 0.8697210441431217, + "grad_norm": 2.167028523901367, + "learning_rate": 1.7019676107992523e-05, + "loss": 0.384, + "step": 1820 + }, + { + "epoch": 0.8701989128486948, + "grad_norm": 0.5774159028549396, + "learning_rate": 1.701592413685971e-05, + "loss": 0.3688, + "step": 1821 + }, + { + "epoch": 0.870676781554268, + "grad_norm": 0.5107936854972478, + "learning_rate": 1.7012170219633306e-05, + "loss": 0.3823, + "step": 1822 + }, + { + "epoch": 0.8711546502598411, + "grad_norm": 0.5153038414963254, + "learning_rate": 1.700841435735457e-05, + "loss": 0.3906, + "step": 1823 + }, + { + "epoch": 0.8716325189654143, + "grad_norm": 0.6927821263237308, + "learning_rate": 1.7004656551065317e-05, + "loss": 0.3549, + "step": 1824 + }, + { + "epoch": 0.8721103876709874, + "grad_norm": 0.5401362110735732, + "learning_rate": 1.70008968018079e-05, + "loss": 0.3759, + "step": 1825 + }, + { + "epoch": 0.8725882563765606, + "grad_norm": 0.5538300760634917, + "learning_rate": 1.6997135110625203e-05, + "loss": 0.3658, + "step": 1826 + }, + { + "epoch": 0.8730661250821337, + "grad_norm": 0.6412840566926354, + "learning_rate": 1.6993371478560652e-05, + "loss": 0.3801, + "step": 1827 + }, + { + "epoch": 0.8735439937877069, + "grad_norm": 0.5385013396738852, + "learning_rate": 1.6989605906658217e-05, + "loss": 0.3696, + "step": 1828 + }, + { + "epoch": 0.87402186249328, + "grad_norm": 0.5786447420911893, + "learning_rate": 1.6985838395962397e-05, + "loss": 0.3773, + "step": 1829 + }, + { + "epoch": 0.8744997311988532, + "grad_norm": 0.528475998356483, + "learning_rate": 1.6982068947518235e-05, + "loss": 0.3615, + "step": 1830 + }, + { + "epoch": 0.8749775999044263, + "grad_norm": 0.9317925627901069, + "learning_rate": 1.6978297562371304e-05, + "loss": 0.3869, + "step": 1831 + }, + { + "epoch": 0.8754554686099995, + "grad_norm": 0.491169996492111, + "learning_rate": 1.6974524241567726e-05, + "loss": 0.3694, + "step": 1832 + }, + { + "epoch": 0.8759333373155725, + "grad_norm": 0.5289533891513116, + "learning_rate": 1.6970748986154153e-05, + "loss": 0.3659, + "step": 1833 + }, + { + "epoch": 0.8764112060211456, + "grad_norm": 0.5541497878429626, + "learning_rate": 1.6966971797177777e-05, + "loss": 0.3617, + "step": 1834 + }, + { + "epoch": 0.8768890747267188, + "grad_norm": 0.5832036352429374, + "learning_rate": 1.6963192675686312e-05, + "loss": 0.3667, + "step": 1835 + }, + { + "epoch": 0.8773669434322919, + "grad_norm": 0.5650619500446709, + "learning_rate": 1.6959411622728034e-05, + "loss": 0.3808, + "step": 1836 + }, + { + "epoch": 0.8778448121378651, + "grad_norm": 0.5699075770705608, + "learning_rate": 1.695562863935173e-05, + "loss": 0.3656, + "step": 1837 + }, + { + "epoch": 0.8783226808434382, + "grad_norm": 0.551562318574458, + "learning_rate": 1.695184372660674e-05, + "loss": 0.3712, + "step": 1838 + }, + { + "epoch": 0.8788005495490114, + "grad_norm": 0.5240567810273391, + "learning_rate": 1.6948056885542925e-05, + "loss": 0.3543, + "step": 1839 + }, + { + "epoch": 0.8792784182545845, + "grad_norm": 0.5020036514932894, + "learning_rate": 1.694426811721069e-05, + "loss": 0.3679, + "step": 1840 + }, + { + "epoch": 0.8797562869601577, + "grad_norm": 0.556752437838373, + "learning_rate": 1.6940477422660976e-05, + "loss": 0.3704, + "step": 1841 + }, + { + "epoch": 0.8802341556657308, + "grad_norm": 0.5191567869165117, + "learning_rate": 1.6936684802945255e-05, + "loss": 0.3773, + "step": 1842 + }, + { + "epoch": 0.880712024371304, + "grad_norm": 0.5533663083233841, + "learning_rate": 1.693289025911553e-05, + "loss": 0.3779, + "step": 1843 + }, + { + "epoch": 0.8811898930768771, + "grad_norm": 0.5503898647969782, + "learning_rate": 1.692909379222434e-05, + "loss": 0.3791, + "step": 1844 + }, + { + "epoch": 0.8816677617824503, + "grad_norm": 0.5551067074917195, + "learning_rate": 1.6925295403324758e-05, + "loss": 0.3821, + "step": 1845 + }, + { + "epoch": 0.8821456304880234, + "grad_norm": 0.5622953147656727, + "learning_rate": 1.6921495093470394e-05, + "loss": 0.3873, + "step": 1846 + }, + { + "epoch": 0.8826234991935965, + "grad_norm": 0.5365186748712514, + "learning_rate": 1.6917692863715384e-05, + "loss": 0.3717, + "step": 1847 + }, + { + "epoch": 0.8831013678991697, + "grad_norm": 0.5478602830128808, + "learning_rate": 1.69138887151144e-05, + "loss": 0.3723, + "step": 1848 + }, + { + "epoch": 0.8835792366047428, + "grad_norm": 0.5427879963525947, + "learning_rate": 1.6910082648722643e-05, + "loss": 0.3716, + "step": 1849 + }, + { + "epoch": 0.884057105310316, + "grad_norm": 0.5101893372822378, + "learning_rate": 1.6906274665595854e-05, + "loss": 0.369, + "step": 1850 + }, + { + "epoch": 0.8845349740158891, + "grad_norm": 0.5434862693905707, + "learning_rate": 1.6902464766790295e-05, + "loss": 0.3731, + "step": 1851 + }, + { + "epoch": 0.8850128427214623, + "grad_norm": 0.5250634322593507, + "learning_rate": 1.6898652953362765e-05, + "loss": 0.3736, + "step": 1852 + }, + { + "epoch": 0.8854907114270354, + "grad_norm": 0.5732211434436395, + "learning_rate": 1.68948392263706e-05, + "loss": 0.3756, + "step": 1853 + }, + { + "epoch": 0.8859685801326086, + "grad_norm": 0.5470798537140039, + "learning_rate": 1.6891023586871654e-05, + "loss": 0.3834, + "step": 1854 + }, + { + "epoch": 0.8864464488381817, + "grad_norm": 0.5126373876854253, + "learning_rate": 1.688720603592432e-05, + "loss": 0.3726, + "step": 1855 + }, + { + "epoch": 0.8869243175437549, + "grad_norm": 0.574477636771307, + "learning_rate": 1.6883386574587524e-05, + "loss": 0.3726, + "step": 1856 + }, + { + "epoch": 0.887402186249328, + "grad_norm": 0.5760651136056916, + "learning_rate": 1.687956520392071e-05, + "loss": 0.3623, + "step": 1857 + }, + { + "epoch": 0.8878800549549012, + "grad_norm": 0.8682852157187648, + "learning_rate": 1.6875741924983865e-05, + "loss": 0.3741, + "step": 1858 + }, + { + "epoch": 0.8883579236604743, + "grad_norm": 0.623287286436862, + "learning_rate": 1.687191673883749e-05, + "loss": 0.359, + "step": 1859 + }, + { + "epoch": 0.8888357923660475, + "grad_norm": 0.5331585373043249, + "learning_rate": 1.6868089646542632e-05, + "loss": 0.3788, + "step": 1860 + }, + { + "epoch": 0.8893136610716206, + "grad_norm": 0.5578200837319638, + "learning_rate": 1.686426064916086e-05, + "loss": 0.3794, + "step": 1861 + }, + { + "epoch": 0.8897915297771937, + "grad_norm": 0.5422159312602259, + "learning_rate": 1.6860429747754267e-05, + "loss": 0.3772, + "step": 1862 + }, + { + "epoch": 0.8902693984827669, + "grad_norm": 0.5228366276138506, + "learning_rate": 1.685659694338548e-05, + "loss": 0.3806, + "step": 1863 + }, + { + "epoch": 0.89074726718834, + "grad_norm": 0.5358911036348268, + "learning_rate": 1.6852762237117643e-05, + "loss": 0.3834, + "step": 1864 + }, + { + "epoch": 0.8912251358939132, + "grad_norm": 0.528212445670647, + "learning_rate": 1.6848925630014445e-05, + "loss": 0.3882, + "step": 1865 + }, + { + "epoch": 0.8917030045994863, + "grad_norm": 0.5542155155507105, + "learning_rate": 1.684508712314009e-05, + "loss": 0.3753, + "step": 1866 + }, + { + "epoch": 0.8921808733050595, + "grad_norm": 0.504104107379108, + "learning_rate": 1.6841246717559316e-05, + "loss": 0.3875, + "step": 1867 + }, + { + "epoch": 0.8926587420106326, + "grad_norm": 0.5434960586896977, + "learning_rate": 1.6837404414337374e-05, + "loss": 0.3558, + "step": 1868 + }, + { + "epoch": 0.8931366107162058, + "grad_norm": 0.5567958899249101, + "learning_rate": 1.683356021454006e-05, + "loss": 0.3672, + "step": 1869 + }, + { + "epoch": 0.8936144794217789, + "grad_norm": 0.5022263534465929, + "learning_rate": 1.6829714119233688e-05, + "loss": 0.3537, + "step": 1870 + }, + { + "epoch": 0.894092348127352, + "grad_norm": 0.549284213737301, + "learning_rate": 1.6825866129485088e-05, + "loss": 0.3574, + "step": 1871 + }, + { + "epoch": 0.8945702168329251, + "grad_norm": 0.4705829377264607, + "learning_rate": 1.6822016246361633e-05, + "loss": 0.3851, + "step": 1872 + }, + { + "epoch": 0.8950480855384984, + "grad_norm": 0.5062114611657378, + "learning_rate": 1.681816447093121e-05, + "loss": 0.3609, + "step": 1873 + }, + { + "epoch": 0.8955259542440714, + "grad_norm": 0.5826140193721359, + "learning_rate": 1.6814310804262225e-05, + "loss": 0.3726, + "step": 1874 + }, + { + "epoch": 0.8960038229496445, + "grad_norm": 0.5520132821306097, + "learning_rate": 1.6810455247423634e-05, + "loss": 0.3782, + "step": 1875 + }, + { + "epoch": 0.8964816916552177, + "grad_norm": 0.5170660835598903, + "learning_rate": 1.680659780148489e-05, + "loss": 0.357, + "step": 1876 + }, + { + "epoch": 0.8969595603607908, + "grad_norm": 0.583946284953352, + "learning_rate": 1.680273846751598e-05, + "loss": 0.3776, + "step": 1877 + }, + { + "epoch": 0.897437429066364, + "grad_norm": 0.5723420342490975, + "learning_rate": 1.6798877246587418e-05, + "loss": 0.3895, + "step": 1878 + }, + { + "epoch": 0.8979152977719371, + "grad_norm": 0.5082366243810379, + "learning_rate": 1.679501413977024e-05, + "loss": 0.3808, + "step": 1879 + }, + { + "epoch": 0.8983931664775103, + "grad_norm": 0.507443343885867, + "learning_rate": 1.6791149148136003e-05, + "loss": 0.3536, + "step": 1880 + }, + { + "epoch": 0.8988710351830834, + "grad_norm": 0.5703634285838638, + "learning_rate": 1.6787282272756784e-05, + "loss": 0.3682, + "step": 1881 + }, + { + "epoch": 0.8993489038886566, + "grad_norm": 0.5158030787718338, + "learning_rate": 1.6783413514705186e-05, + "loss": 0.3813, + "step": 1882 + }, + { + "epoch": 0.8998267725942297, + "grad_norm": 0.5015158481243369, + "learning_rate": 1.677954287505434e-05, + "loss": 0.3775, + "step": 1883 + }, + { + "epoch": 0.9003046412998029, + "grad_norm": 0.5098825854109337, + "learning_rate": 1.6775670354877888e-05, + "loss": 0.3526, + "step": 1884 + }, + { + "epoch": 0.900782510005376, + "grad_norm": 0.5242066060684257, + "learning_rate": 1.677179595525e-05, + "loss": 0.4053, + "step": 1885 + }, + { + "epoch": 0.9012603787109492, + "grad_norm": 0.5351980918146104, + "learning_rate": 1.6767919677245367e-05, + "loss": 0.3714, + "step": 1886 + }, + { + "epoch": 0.9017382474165223, + "grad_norm": 0.5328925764874355, + "learning_rate": 1.6764041521939194e-05, + "loss": 0.3675, + "step": 1887 + }, + { + "epoch": 0.9022161161220954, + "grad_norm": 0.5494270094887949, + "learning_rate": 1.6760161490407227e-05, + "loss": 0.388, + "step": 1888 + }, + { + "epoch": 0.9026939848276686, + "grad_norm": 0.5031329675709398, + "learning_rate": 1.675627958372571e-05, + "loss": 0.3811, + "step": 1889 + }, + { + "epoch": 0.9031718535332417, + "grad_norm": 0.5262746250301709, + "learning_rate": 1.675239580297141e-05, + "loss": 0.3935, + "step": 1890 + }, + { + "epoch": 0.9036497222388149, + "grad_norm": 0.5069957733853212, + "learning_rate": 1.6748510149221623e-05, + "loss": 0.3777, + "step": 1891 + }, + { + "epoch": 0.904127590944388, + "grad_norm": 0.5428615459596646, + "learning_rate": 1.6744622623554166e-05, + "loss": 0.3802, + "step": 1892 + }, + { + "epoch": 0.9046054596499612, + "grad_norm": 0.5266303210825704, + "learning_rate": 1.6740733227047365e-05, + "loss": 0.3776, + "step": 1893 + }, + { + "epoch": 0.9050833283555343, + "grad_norm": 0.5348979320077515, + "learning_rate": 1.673684196078007e-05, + "loss": 0.3848, + "step": 1894 + }, + { + "epoch": 0.9055611970611075, + "grad_norm": 0.5314119413401441, + "learning_rate": 1.6732948825831657e-05, + "loss": 0.364, + "step": 1895 + }, + { + "epoch": 0.9060390657666806, + "grad_norm": 0.5516947593383675, + "learning_rate": 1.6729053823282e-05, + "loss": 0.3553, + "step": 1896 + }, + { + "epoch": 0.9065169344722538, + "grad_norm": 0.5090831476794752, + "learning_rate": 1.6725156954211516e-05, + "loss": 0.3713, + "step": 1897 + }, + { + "epoch": 0.9069948031778269, + "grad_norm": 0.6262571396693599, + "learning_rate": 1.672125821970112e-05, + "loss": 0.3689, + "step": 1898 + }, + { + "epoch": 0.9074726718834001, + "grad_norm": 0.5193279263946532, + "learning_rate": 1.6717357620832256e-05, + "loss": 0.3783, + "step": 1899 + }, + { + "epoch": 0.9079505405889732, + "grad_norm": 0.503064317241693, + "learning_rate": 1.671345515868688e-05, + "loss": 0.3793, + "step": 1900 + }, + { + "epoch": 0.9084284092945463, + "grad_norm": 0.5130059203403388, + "learning_rate": 1.6709550834347463e-05, + "loss": 0.3715, + "step": 1901 + }, + { + "epoch": 0.9089062780001195, + "grad_norm": 0.5369725030475303, + "learning_rate": 1.6705644648897004e-05, + "loss": 0.3592, + "step": 1902 + }, + { + "epoch": 0.9093841467056926, + "grad_norm": 0.5150663544165971, + "learning_rate": 1.6701736603419002e-05, + "loss": 0.3637, + "step": 1903 + }, + { + "epoch": 0.9098620154112658, + "grad_norm": 0.5399311902064715, + "learning_rate": 1.6697826698997483e-05, + "loss": 0.3594, + "step": 1904 + }, + { + "epoch": 0.9103398841168389, + "grad_norm": 0.5518934019533673, + "learning_rate": 1.6693914936716983e-05, + "loss": 0.3682, + "step": 1905 + }, + { + "epoch": 0.9108177528224121, + "grad_norm": 0.5209140835926345, + "learning_rate": 1.6690001317662563e-05, + "loss": 0.382, + "step": 1906 + }, + { + "epoch": 0.9112956215279852, + "grad_norm": 0.5660288276754546, + "learning_rate": 1.6686085842919784e-05, + "loss": 0.3801, + "step": 1907 + }, + { + "epoch": 0.9117734902335584, + "grad_norm": 0.47006676537993314, + "learning_rate": 1.668216851357473e-05, + "loss": 0.3724, + "step": 1908 + }, + { + "epoch": 0.9122513589391315, + "grad_norm": 0.5577644040191538, + "learning_rate": 1.667824933071401e-05, + "loss": 0.3622, + "step": 1909 + }, + { + "epoch": 0.9127292276447047, + "grad_norm": 0.5340230250631273, + "learning_rate": 1.6674328295424723e-05, + "loss": 0.369, + "step": 1910 + }, + { + "epoch": 0.9132070963502777, + "grad_norm": 0.5079231886115417, + "learning_rate": 1.6670405408794498e-05, + "loss": 0.3715, + "step": 1911 + }, + { + "epoch": 0.913684965055851, + "grad_norm": 0.5325193692349207, + "learning_rate": 1.666648067191148e-05, + "loss": 0.3789, + "step": 1912 + }, + { + "epoch": 0.914162833761424, + "grad_norm": 0.5323003539826137, + "learning_rate": 1.666255408586432e-05, + "loss": 0.3818, + "step": 1913 + }, + { + "epoch": 0.9146407024669972, + "grad_norm": 0.4869815217201792, + "learning_rate": 1.6658625651742178e-05, + "loss": 0.3733, + "step": 1914 + }, + { + "epoch": 0.9151185711725703, + "grad_norm": 0.5576240488695627, + "learning_rate": 1.6654695370634738e-05, + "loss": 0.3589, + "step": 1915 + }, + { + "epoch": 0.9155964398781434, + "grad_norm": 0.49247551368135417, + "learning_rate": 1.6650763243632187e-05, + "loss": 0.3585, + "step": 1916 + }, + { + "epoch": 0.9160743085837166, + "grad_norm": 0.5148567366006771, + "learning_rate": 1.664682927182523e-05, + "loss": 0.3838, + "step": 1917 + }, + { + "epoch": 0.9165521772892897, + "grad_norm": 0.49717329992717463, + "learning_rate": 1.6642893456305086e-05, + "loss": 0.3915, + "step": 1918 + }, + { + "epoch": 0.9170300459948629, + "grad_norm": 0.5628249235481696, + "learning_rate": 1.663895579816347e-05, + "loss": 0.3689, + "step": 1919 + }, + { + "epoch": 0.917507914700436, + "grad_norm": 0.5136903278787316, + "learning_rate": 1.6635016298492628e-05, + "loss": 0.3838, + "step": 1920 + }, + { + "epoch": 0.9179857834060092, + "grad_norm": 0.5334625808445861, + "learning_rate": 1.6631074958385304e-05, + "loss": 0.374, + "step": 1921 + }, + { + "epoch": 0.9184636521115823, + "grad_norm": 0.5386195590160799, + "learning_rate": 1.6627131778934755e-05, + "loss": 0.3751, + "step": 1922 + }, + { + "epoch": 0.9189415208171555, + "grad_norm": 0.5379052116006212, + "learning_rate": 1.662318676123475e-05, + "loss": 0.3809, + "step": 1923 + }, + { + "epoch": 0.9194193895227286, + "grad_norm": 0.5100602264626746, + "learning_rate": 1.6619239906379574e-05, + "loss": 0.3701, + "step": 1924 + }, + { + "epoch": 0.9198972582283018, + "grad_norm": 0.5239150729526496, + "learning_rate": 1.6615291215464005e-05, + "loss": 0.3877, + "step": 1925 + }, + { + "epoch": 0.9203751269338749, + "grad_norm": 0.6375609494234671, + "learning_rate": 1.6611340689583343e-05, + "loss": 0.3844, + "step": 1926 + }, + { + "epoch": 0.9208529956394481, + "grad_norm": 0.5123037134022694, + "learning_rate": 1.66073883298334e-05, + "loss": 0.3651, + "step": 1927 + }, + { + "epoch": 0.9213308643450212, + "grad_norm": 0.5110576525109256, + "learning_rate": 1.6603434137310482e-05, + "loss": 0.3707, + "step": 1928 + }, + { + "epoch": 0.9218087330505943, + "grad_norm": 0.5021261687202652, + "learning_rate": 1.6599478113111424e-05, + "loss": 0.3682, + "step": 1929 + }, + { + "epoch": 0.9222866017561675, + "grad_norm": 0.5371433965125264, + "learning_rate": 1.6595520258333545e-05, + "loss": 0.3624, + "step": 1930 + }, + { + "epoch": 0.9227644704617406, + "grad_norm": 0.5366454399722713, + "learning_rate": 1.659156057407469e-05, + "loss": 0.3809, + "step": 1931 + }, + { + "epoch": 0.9232423391673138, + "grad_norm": 0.9136617720639402, + "learning_rate": 1.6587599061433207e-05, + "loss": 0.3738, + "step": 1932 + }, + { + "epoch": 0.9237202078728869, + "grad_norm": 0.8488713857773934, + "learning_rate": 1.6583635721507944e-05, + "loss": 0.3637, + "step": 1933 + }, + { + "epoch": 0.9241980765784601, + "grad_norm": 0.49495459546373427, + "learning_rate": 1.6579670555398268e-05, + "loss": 0.367, + "step": 1934 + }, + { + "epoch": 0.9246759452840332, + "grad_norm": 0.5422350097681802, + "learning_rate": 1.657570356420404e-05, + "loss": 0.3747, + "step": 1935 + }, + { + "epoch": 0.9251538139896064, + "grad_norm": 0.5485293026148379, + "learning_rate": 1.657173474902564e-05, + "loss": 0.3767, + "step": 1936 + }, + { + "epoch": 0.9256316826951795, + "grad_norm": 0.7216669697310142, + "learning_rate": 1.6567764110963948e-05, + "loss": 0.3761, + "step": 1937 + }, + { + "epoch": 0.9261095514007527, + "grad_norm": 0.4879951676041884, + "learning_rate": 1.6563791651120336e-05, + "loss": 0.3637, + "step": 1938 + }, + { + "epoch": 0.9265874201063258, + "grad_norm": 0.49428473376773735, + "learning_rate": 1.6559817370596708e-05, + "loss": 0.3843, + "step": 1939 + }, + { + "epoch": 0.927065288811899, + "grad_norm": 0.48309373277907747, + "learning_rate": 1.6555841270495456e-05, + "loss": 0.3765, + "step": 1940 + }, + { + "epoch": 0.9275431575174721, + "grad_norm": 0.5013632030999634, + "learning_rate": 1.6551863351919478e-05, + "loss": 0.3694, + "step": 1941 + }, + { + "epoch": 0.9280210262230452, + "grad_norm": 0.5110638525672886, + "learning_rate": 1.6547883615972176e-05, + "loss": 0.363, + "step": 1942 + }, + { + "epoch": 0.9284988949286184, + "grad_norm": 0.5185504025173149, + "learning_rate": 1.6543902063757462e-05, + "loss": 0.3801, + "step": 1943 + }, + { + "epoch": 0.9289767636341915, + "grad_norm": 0.5049387315294666, + "learning_rate": 1.653991869637975e-05, + "loss": 0.369, + "step": 1944 + }, + { + "epoch": 0.9294546323397647, + "grad_norm": 0.4912665471357074, + "learning_rate": 1.6535933514943955e-05, + "loss": 0.3702, + "step": 1945 + }, + { + "epoch": 0.9299325010453378, + "grad_norm": 0.5118031125606555, + "learning_rate": 1.653194652055549e-05, + "loss": 0.3732, + "step": 1946 + }, + { + "epoch": 0.930410369750911, + "grad_norm": 0.6431555235235055, + "learning_rate": 1.6527957714320283e-05, + "loss": 0.3725, + "step": 1947 + }, + { + "epoch": 0.930888238456484, + "grad_norm": 0.5097523807572212, + "learning_rate": 1.6523967097344763e-05, + "loss": 0.3712, + "step": 1948 + }, + { + "epoch": 0.9313661071620573, + "grad_norm": 0.5046812127135673, + "learning_rate": 1.6519974670735846e-05, + "loss": 0.3737, + "step": 1949 + }, + { + "epoch": 0.9318439758676303, + "grad_norm": 0.5194776247965222, + "learning_rate": 1.6515980435600965e-05, + "loss": 0.3742, + "step": 1950 + }, + { + "epoch": 0.9323218445732036, + "grad_norm": 0.5404940696555603, + "learning_rate": 1.6511984393048055e-05, + "loss": 0.381, + "step": 1951 + }, + { + "epoch": 0.9327997132787766, + "grad_norm": 0.5553063994108789, + "learning_rate": 1.6507986544185543e-05, + "loss": 0.3649, + "step": 1952 + }, + { + "epoch": 0.9332775819843498, + "grad_norm": 0.5499668048968661, + "learning_rate": 1.650398689012236e-05, + "loss": 0.3793, + "step": 1953 + }, + { + "epoch": 0.9337554506899229, + "grad_norm": 0.563687647978333, + "learning_rate": 1.649998543196794e-05, + "loss": 0.3808, + "step": 1954 + }, + { + "epoch": 0.9342333193954961, + "grad_norm": 0.545135564437546, + "learning_rate": 1.6495982170832224e-05, + "loss": 0.3706, + "step": 1955 + }, + { + "epoch": 0.9347111881010692, + "grad_norm": 0.5303943311116299, + "learning_rate": 1.6491977107825642e-05, + "loss": 0.3685, + "step": 1956 + }, + { + "epoch": 0.9351890568066423, + "grad_norm": 0.5619908532760816, + "learning_rate": 1.648797024405912e-05, + "loss": 0.3883, + "step": 1957 + }, + { + "epoch": 0.9356669255122155, + "grad_norm": 0.5038859263688038, + "learning_rate": 1.64839615806441e-05, + "loss": 0.3766, + "step": 1958 + }, + { + "epoch": 0.9361447942177886, + "grad_norm": 0.521422602845323, + "learning_rate": 1.6479951118692515e-05, + "loss": 0.3895, + "step": 1959 + }, + { + "epoch": 0.9366226629233618, + "grad_norm": 0.5282657221332856, + "learning_rate": 1.6475938859316795e-05, + "loss": 0.3605, + "step": 1960 + }, + { + "epoch": 0.9371005316289349, + "grad_norm": 0.6351039349256459, + "learning_rate": 1.6471924803629867e-05, + "loss": 0.3651, + "step": 1961 + }, + { + "epoch": 0.9375784003345081, + "grad_norm": 0.5764981472703029, + "learning_rate": 1.6467908952745163e-05, + "loss": 0.38, + "step": 1962 + }, + { + "epoch": 0.9380562690400812, + "grad_norm": 8.536515562815161, + "learning_rate": 1.6463891307776606e-05, + "loss": 0.3956, + "step": 1963 + }, + { + "epoch": 0.9385341377456544, + "grad_norm": 0.6278724935607395, + "learning_rate": 1.645987186983862e-05, + "loss": 0.3739, + "step": 1964 + }, + { + "epoch": 0.9390120064512275, + "grad_norm": 0.48387133354028933, + "learning_rate": 1.6455850640046134e-05, + "loss": 0.3888, + "step": 1965 + }, + { + "epoch": 0.9394898751568007, + "grad_norm": 0.531275176529521, + "learning_rate": 1.6451827619514552e-05, + "loss": 0.3649, + "step": 1966 + }, + { + "epoch": 0.9399677438623738, + "grad_norm": 0.569449658310809, + "learning_rate": 1.6447802809359802e-05, + "loss": 0.3699, + "step": 1967 + }, + { + "epoch": 0.940445612567947, + "grad_norm": 0.5120339336372176, + "learning_rate": 1.6443776210698288e-05, + "loss": 0.3846, + "step": 1968 + }, + { + "epoch": 0.9409234812735201, + "grad_norm": 0.5460326477017806, + "learning_rate": 1.643974782464692e-05, + "loss": 0.3465, + "step": 1969 + }, + { + "epoch": 0.9414013499790932, + "grad_norm": 0.518882665215977, + "learning_rate": 1.6435717652323097e-05, + "loss": 0.3839, + "step": 1970 + }, + { + "epoch": 0.9418792186846664, + "grad_norm": 0.49852258072808264, + "learning_rate": 1.6431685694844725e-05, + "loss": 0.3816, + "step": 1971 + }, + { + "epoch": 0.9423570873902395, + "grad_norm": 0.5815555518919842, + "learning_rate": 1.6427651953330196e-05, + "loss": 0.381, + "step": 1972 + }, + { + "epoch": 0.9428349560958127, + "grad_norm": 0.48319880339598503, + "learning_rate": 1.6423616428898392e-05, + "loss": 0.3701, + "step": 1973 + }, + { + "epoch": 0.9433128248013858, + "grad_norm": 0.5167840266697357, + "learning_rate": 1.6419579122668704e-05, + "loss": 0.3556, + "step": 1974 + }, + { + "epoch": 0.943790693506959, + "grad_norm": 0.5618010439515246, + "learning_rate": 1.6415540035761008e-05, + "loss": 0.3744, + "step": 1975 + }, + { + "epoch": 0.9442685622125321, + "grad_norm": 0.5051139249357881, + "learning_rate": 1.641149916929567e-05, + "loss": 0.3876, + "step": 1976 + }, + { + "epoch": 0.9447464309181053, + "grad_norm": 0.5611312662809763, + "learning_rate": 1.6407456524393562e-05, + "loss": 0.3686, + "step": 1977 + }, + { + "epoch": 0.9452242996236784, + "grad_norm": 0.5732842999649458, + "learning_rate": 1.640341210217604e-05, + "loss": 0.3865, + "step": 1978 + }, + { + "epoch": 0.9457021683292516, + "grad_norm": 0.550740123702534, + "learning_rate": 1.6399365903764956e-05, + "loss": 0.3743, + "step": 1979 + }, + { + "epoch": 0.9461800370348247, + "grad_norm": 0.5379350100432237, + "learning_rate": 1.639531793028265e-05, + "loss": 0.3808, + "step": 1980 + }, + { + "epoch": 0.9466579057403979, + "grad_norm": 0.5315866561065092, + "learning_rate": 1.6391268182851963e-05, + "loss": 0.3734, + "step": 1981 + }, + { + "epoch": 0.947135774445971, + "grad_norm": 0.5224017338900245, + "learning_rate": 1.638721666259622e-05, + "loss": 0.3859, + "step": 1982 + }, + { + "epoch": 0.9476136431515441, + "grad_norm": 0.5967691599781494, + "learning_rate": 1.638316337063925e-05, + "loss": 0.3581, + "step": 1983 + }, + { + "epoch": 0.9480915118571173, + "grad_norm": 0.589119592504554, + "learning_rate": 1.6379108308105354e-05, + "loss": 0.3835, + "step": 1984 + }, + { + "epoch": 0.9485693805626904, + "grad_norm": 0.51383408409613, + "learning_rate": 1.637505147611934e-05, + "loss": 0.3586, + "step": 1985 + }, + { + "epoch": 0.9490472492682636, + "grad_norm": 0.4994797959754618, + "learning_rate": 1.63709928758065e-05, + "loss": 0.377, + "step": 1986 + }, + { + "epoch": 0.9495251179738367, + "grad_norm": 0.5048066876195055, + "learning_rate": 1.6366932508292618e-05, + "loss": 0.3731, + "step": 1987 + }, + { + "epoch": 0.9500029866794099, + "grad_norm": 0.524811235817121, + "learning_rate": 1.6362870374703967e-05, + "loss": 0.3793, + "step": 1988 + }, + { + "epoch": 0.950480855384983, + "grad_norm": 0.5735930473974293, + "learning_rate": 1.6358806476167316e-05, + "loss": 0.3656, + "step": 1989 + }, + { + "epoch": 0.9509587240905562, + "grad_norm": 0.4832184523655007, + "learning_rate": 1.6354740813809917e-05, + "loss": 0.373, + "step": 1990 + }, + { + "epoch": 0.9514365927961292, + "grad_norm": 0.5128768000065408, + "learning_rate": 1.635067338875951e-05, + "loss": 0.3821, + "step": 1991 + }, + { + "epoch": 0.9519144615017024, + "grad_norm": 0.5022274034535443, + "learning_rate": 1.6346604202144326e-05, + "loss": 0.3901, + "step": 1992 + }, + { + "epoch": 0.9523923302072755, + "grad_norm": 0.5489970781237143, + "learning_rate": 1.634253325509309e-05, + "loss": 0.3858, + "step": 1993 + }, + { + "epoch": 0.9528701989128487, + "grad_norm": 0.4975450932340785, + "learning_rate": 1.6338460548735015e-05, + "loss": 0.3686, + "step": 1994 + }, + { + "epoch": 0.9533480676184218, + "grad_norm": 0.48127408958137835, + "learning_rate": 1.6334386084199787e-05, + "loss": 0.3745, + "step": 1995 + }, + { + "epoch": 0.9538259363239949, + "grad_norm": 0.5704221304645912, + "learning_rate": 1.6330309862617598e-05, + "loss": 0.3613, + "step": 1996 + }, + { + "epoch": 0.9543038050295681, + "grad_norm": 0.5148160729331526, + "learning_rate": 1.6326231885119117e-05, + "loss": 0.3827, + "step": 1997 + }, + { + "epoch": 0.9547816737351412, + "grad_norm": 0.6284728203646394, + "learning_rate": 1.63221521528355e-05, + "loss": 0.3824, + "step": 1998 + }, + { + "epoch": 0.9552595424407144, + "grad_norm": 0.49271041299162915, + "learning_rate": 1.63180706668984e-05, + "loss": 0.3691, + "step": 1999 + }, + { + "epoch": 0.9557374111462875, + "grad_norm": 0.5390149320147591, + "learning_rate": 1.631398742843995e-05, + "loss": 0.3652, + "step": 2000 + }, + { + "epoch": 0.9562152798518607, + "grad_norm": 0.5025205155225639, + "learning_rate": 1.6309902438592762e-05, + "loss": 0.3694, + "step": 2001 + }, + { + "epoch": 0.9566931485574338, + "grad_norm": 0.5222794351472518, + "learning_rate": 1.6305815698489938e-05, + "loss": 0.3681, + "step": 2002 + }, + { + "epoch": 0.957171017263007, + "grad_norm": 0.5145118353547241, + "learning_rate": 1.6301727209265077e-05, + "loss": 0.3719, + "step": 2003 + }, + { + "epoch": 0.9576488859685801, + "grad_norm": 0.4908469670116591, + "learning_rate": 1.629763697205225e-05, + "loss": 0.3705, + "step": 2004 + }, + { + "epoch": 0.9581267546741533, + "grad_norm": 0.5012664663409069, + "learning_rate": 1.629354498798601e-05, + "loss": 0.3719, + "step": 2005 + }, + { + "epoch": 0.9586046233797264, + "grad_norm": 0.518588891339538, + "learning_rate": 1.628945125820141e-05, + "loss": 0.3709, + "step": 2006 + }, + { + "epoch": 0.9590824920852996, + "grad_norm": 0.5117699235235205, + "learning_rate": 1.628535578383397e-05, + "loss": 0.3737, + "step": 2007 + }, + { + "epoch": 0.9595603607908727, + "grad_norm": 0.5356226048939458, + "learning_rate": 1.6281258566019712e-05, + "loss": 0.3764, + "step": 2008 + }, + { + "epoch": 0.9600382294964459, + "grad_norm": 0.5083134780109303, + "learning_rate": 1.6277159605895124e-05, + "loss": 0.3723, + "step": 2009 + }, + { + "epoch": 0.960516098202019, + "grad_norm": 0.5023579232084872, + "learning_rate": 1.627305890459719e-05, + "loss": 0.3704, + "step": 2010 + }, + { + "epoch": 0.9609939669075921, + "grad_norm": 0.46631779465683365, + "learning_rate": 1.6268956463263372e-05, + "loss": 0.3526, + "step": 2011 + }, + { + "epoch": 0.9614718356131653, + "grad_norm": 0.547788682791615, + "learning_rate": 1.6264852283031614e-05, + "loss": 0.3779, + "step": 2012 + }, + { + "epoch": 0.9619497043187384, + "grad_norm": 0.5104177809427011, + "learning_rate": 1.6260746365040342e-05, + "loss": 0.3729, + "step": 2013 + }, + { + "epoch": 0.9624275730243116, + "grad_norm": 0.5421693818087101, + "learning_rate": 1.6256638710428468e-05, + "loss": 0.3698, + "step": 2014 + }, + { + "epoch": 0.9629054417298847, + "grad_norm": 0.5878685955211844, + "learning_rate": 1.625252932033538e-05, + "loss": 0.3702, + "step": 2015 + }, + { + "epoch": 0.9633833104354579, + "grad_norm": 0.5392634273182241, + "learning_rate": 1.6248418195900944e-05, + "loss": 0.3714, + "step": 2016 + }, + { + "epoch": 0.963861179141031, + "grad_norm": 0.5041453371523026, + "learning_rate": 1.6244305338265528e-05, + "loss": 0.3757, + "step": 2017 + }, + { + "epoch": 0.9643390478466042, + "grad_norm": 0.5611684018900728, + "learning_rate": 1.6240190748569958e-05, + "loss": 0.3615, + "step": 2018 + }, + { + "epoch": 0.9648169165521773, + "grad_norm": 0.4958027277870733, + "learning_rate": 1.6236074427955547e-05, + "loss": 0.3643, + "step": 2019 + }, + { + "epoch": 0.9652947852577505, + "grad_norm": 0.5065207954211904, + "learning_rate": 1.6231956377564095e-05, + "loss": 0.372, + "step": 2020 + }, + { + "epoch": 0.9657726539633236, + "grad_norm": 0.48383262645888736, + "learning_rate": 1.6227836598537874e-05, + "loss": 0.3924, + "step": 2021 + }, + { + "epoch": 0.9662505226688968, + "grad_norm": 0.5200139461232003, + "learning_rate": 1.6223715092019636e-05, + "loss": 0.3671, + "step": 2022 + }, + { + "epoch": 0.9667283913744699, + "grad_norm": 0.4953988372654815, + "learning_rate": 1.6219591859152618e-05, + "loss": 0.3812, + "step": 2023 + }, + { + "epoch": 0.967206260080043, + "grad_norm": 0.5102837388791924, + "learning_rate": 1.621546690108053e-05, + "loss": 0.3652, + "step": 2024 + }, + { + "epoch": 0.9676841287856162, + "grad_norm": 0.5629208320653784, + "learning_rate": 1.621134021894756e-05, + "loss": 0.3611, + "step": 2025 + }, + { + "epoch": 0.9681619974911893, + "grad_norm": 0.4952840329409754, + "learning_rate": 1.6207211813898377e-05, + "loss": 0.3972, + "step": 2026 + }, + { + "epoch": 0.9686398661967625, + "grad_norm": 0.5037099311393208, + "learning_rate": 1.6203081687078136e-05, + "loss": 0.373, + "step": 2027 + }, + { + "epoch": 0.9691177349023355, + "grad_norm": 0.5838992353089708, + "learning_rate": 1.6198949839632453e-05, + "loss": 0.3793, + "step": 2028 + }, + { + "epoch": 0.9695956036079088, + "grad_norm": 0.49580068754399503, + "learning_rate": 1.619481627270743e-05, + "loss": 0.3613, + "step": 2029 + }, + { + "epoch": 0.9700734723134818, + "grad_norm": 0.532208761854008, + "learning_rate": 1.619068098744965e-05, + "loss": 0.3866, + "step": 2030 + }, + { + "epoch": 0.970551341019055, + "grad_norm": 0.5224695569046617, + "learning_rate": 1.6186543985006164e-05, + "loss": 0.3699, + "step": 2031 + }, + { + "epoch": 0.9710292097246281, + "grad_norm": 0.5227062263015988, + "learning_rate": 1.6182405266524507e-05, + "loss": 0.3912, + "step": 2032 + }, + { + "epoch": 0.9715070784302013, + "grad_norm": 0.52945280171711, + "learning_rate": 1.6178264833152688e-05, + "loss": 0.3403, + "step": 2033 + }, + { + "epoch": 0.9719849471357744, + "grad_norm": 0.5629792708553427, + "learning_rate": 1.6174122686039182e-05, + "loss": 0.3784, + "step": 2034 + }, + { + "epoch": 0.9724628158413476, + "grad_norm": 0.5163999238740223, + "learning_rate": 1.6169978826332955e-05, + "loss": 0.3659, + "step": 2035 + }, + { + "epoch": 0.9729406845469207, + "grad_norm": 0.5133995557530827, + "learning_rate": 1.6165833255183438e-05, + "loss": 0.3694, + "step": 2036 + }, + { + "epoch": 0.9734185532524938, + "grad_norm": 0.5617728810922827, + "learning_rate": 1.616168597374054e-05, + "loss": 0.363, + "step": 2037 + }, + { + "epoch": 0.973896421958067, + "grad_norm": 0.5237674641752671, + "learning_rate": 1.615753698315464e-05, + "loss": 0.3803, + "step": 2038 + }, + { + "epoch": 0.9743742906636401, + "grad_norm": 0.5228216583365605, + "learning_rate": 1.61533862845766e-05, + "loss": 0.3681, + "step": 2039 + }, + { + "epoch": 0.9748521593692133, + "grad_norm": 0.4805229482972204, + "learning_rate": 1.6149233879157747e-05, + "loss": 0.3658, + "step": 2040 + }, + { + "epoch": 0.9753300280747864, + "grad_norm": 0.5534895159460543, + "learning_rate": 1.614507976804989e-05, + "loss": 0.3784, + "step": 2041 + }, + { + "epoch": 0.9758078967803596, + "grad_norm": 0.5486724592299036, + "learning_rate": 1.6140923952405302e-05, + "loss": 0.3786, + "step": 2042 + }, + { + "epoch": 0.9762857654859327, + "grad_norm": 0.8080884579911872, + "learning_rate": 1.6136766433376728e-05, + "loss": 0.3723, + "step": 2043 + }, + { + "epoch": 0.9767636341915059, + "grad_norm": 0.5359191028098362, + "learning_rate": 1.6132607212117404e-05, + "loss": 0.3526, + "step": 2044 + }, + { + "epoch": 0.977241502897079, + "grad_norm": 0.5053369417965282, + "learning_rate": 1.6128446289781012e-05, + "loss": 0.3772, + "step": 2045 + }, + { + "epoch": 0.9777193716026522, + "grad_norm": 0.5211994750741029, + "learning_rate": 1.6124283667521727e-05, + "loss": 0.3808, + "step": 2046 + }, + { + "epoch": 0.9781972403082253, + "grad_norm": 0.5676346296630453, + "learning_rate": 1.612011934649418e-05, + "loss": 0.3715, + "step": 2047 + }, + { + "epoch": 0.9786751090137985, + "grad_norm": 0.5248006849417738, + "learning_rate": 1.611595332785348e-05, + "loss": 0.3553, + "step": 2048 + }, + { + "epoch": 0.9791529777193716, + "grad_norm": 0.508623732364023, + "learning_rate": 1.6111785612755214e-05, + "loss": 0.3754, + "step": 2049 + }, + { + "epoch": 0.9796308464249448, + "grad_norm": 0.5266899133269225, + "learning_rate": 1.610761620235543e-05, + "loss": 0.3859, + "step": 2050 + }, + { + "epoch": 0.9801087151305179, + "grad_norm": 0.4751459644310755, + "learning_rate": 1.610344509781065e-05, + "loss": 0.387, + "step": 2051 + }, + { + "epoch": 0.980586583836091, + "grad_norm": 0.5316863726383205, + "learning_rate": 1.609927230027786e-05, + "loss": 0.3959, + "step": 2052 + }, + { + "epoch": 0.9810644525416642, + "grad_norm": 0.5453039434436993, + "learning_rate": 1.609509781091452e-05, + "loss": 0.3697, + "step": 2053 + }, + { + "epoch": 0.9815423212472373, + "grad_norm": 0.4825716401466539, + "learning_rate": 1.6090921630878568e-05, + "loss": 0.3835, + "step": 2054 + }, + { + "epoch": 0.9820201899528105, + "grad_norm": 0.5178510139332422, + "learning_rate": 1.60867437613284e-05, + "loss": 0.3717, + "step": 2055 + }, + { + "epoch": 0.9824980586583836, + "grad_norm": 0.524887791800861, + "learning_rate": 1.6082564203422876e-05, + "loss": 0.3653, + "step": 2056 + }, + { + "epoch": 0.9829759273639568, + "grad_norm": 0.5213297178508581, + "learning_rate": 1.6078382958321336e-05, + "loss": 0.3725, + "step": 2057 + }, + { + "epoch": 0.9834537960695299, + "grad_norm": 0.5282681790101221, + "learning_rate": 1.6074200027183584e-05, + "loss": 0.3695, + "step": 2058 + }, + { + "epoch": 0.9839316647751031, + "grad_norm": 0.489145805606624, + "learning_rate": 1.6070015411169896e-05, + "loss": 0.3623, + "step": 2059 + }, + { + "epoch": 0.9844095334806762, + "grad_norm": 0.5331793559039849, + "learning_rate": 1.6065829111441e-05, + "loss": 0.3556, + "step": 2060 + }, + { + "epoch": 0.9848874021862494, + "grad_norm": 0.5443653643393922, + "learning_rate": 1.6061641129158112e-05, + "loss": 0.3988, + "step": 2061 + }, + { + "epoch": 0.9853652708918225, + "grad_norm": 0.8351330747470432, + "learning_rate": 1.60574514654829e-05, + "loss": 0.3804, + "step": 2062 + }, + { + "epoch": 0.9858431395973957, + "grad_norm": 0.5030615466689651, + "learning_rate": 1.6053260121577503e-05, + "loss": 0.3618, + "step": 2063 + }, + { + "epoch": 0.9863210083029688, + "grad_norm": 0.5471550674629175, + "learning_rate": 1.6049067098604523e-05, + "loss": 0.3557, + "step": 2064 + }, + { + "epoch": 0.9867988770085419, + "grad_norm": 0.48069192998466753, + "learning_rate": 1.6044872397727037e-05, + "loss": 0.3612, + "step": 2065 + }, + { + "epoch": 0.9872767457141151, + "grad_norm": 0.5253037063797116, + "learning_rate": 1.6040676020108577e-05, + "loss": 0.3655, + "step": 2066 + }, + { + "epoch": 0.9877546144196881, + "grad_norm": 0.5275280865811858, + "learning_rate": 1.6036477966913143e-05, + "loss": 0.3852, + "step": 2067 + }, + { + "epoch": 0.9882324831252614, + "grad_norm": 0.47485994365355333, + "learning_rate": 1.6032278239305204e-05, + "loss": 0.3638, + "step": 2068 + }, + { + "epoch": 0.9887103518308344, + "grad_norm": 0.5362179099114655, + "learning_rate": 1.6028076838449692e-05, + "loss": 0.3725, + "step": 2069 + }, + { + "epoch": 0.9891882205364076, + "grad_norm": 0.5119867606693408, + "learning_rate": 1.6023873765511993e-05, + "loss": 0.3757, + "step": 2070 + }, + { + "epoch": 0.9896660892419807, + "grad_norm": 0.5570349801753108, + "learning_rate": 1.6019669021657972e-05, + "loss": 0.362, + "step": 2071 + }, + { + "epoch": 0.9901439579475539, + "grad_norm": 0.5023908521667267, + "learning_rate": 1.601546260805395e-05, + "loss": 0.3664, + "step": 2072 + }, + { + "epoch": 0.990621826653127, + "grad_norm": 0.4919070804327344, + "learning_rate": 1.6011254525866715e-05, + "loss": 0.3746, + "step": 2073 + }, + { + "epoch": 0.9910996953587002, + "grad_norm": 0.5274265284295131, + "learning_rate": 1.600704477626351e-05, + "loss": 0.3554, + "step": 2074 + }, + { + "epoch": 0.9915775640642733, + "grad_norm": 0.4973674393943186, + "learning_rate": 1.6002833360412044e-05, + "loss": 0.3734, + "step": 2075 + }, + { + "epoch": 0.9920554327698465, + "grad_norm": 0.6994137247566886, + "learning_rate": 1.599862027948049e-05, + "loss": 0.3785, + "step": 2076 + }, + { + "epoch": 0.9925333014754196, + "grad_norm": 0.5005262403669962, + "learning_rate": 1.5994405534637487e-05, + "loss": 0.3508, + "step": 2077 + }, + { + "epoch": 0.9930111701809927, + "grad_norm": 0.48923707105340497, + "learning_rate": 1.5990189127052128e-05, + "loss": 0.3767, + "step": 2078 + }, + { + "epoch": 0.9934890388865659, + "grad_norm": 0.5287580848621545, + "learning_rate": 1.5985971057893973e-05, + "loss": 0.3754, + "step": 2079 + }, + { + "epoch": 0.993966907592139, + "grad_norm": 0.4820940737829678, + "learning_rate": 1.5981751328333036e-05, + "loss": 0.3631, + "step": 2080 + }, + { + "epoch": 0.9944447762977122, + "grad_norm": 0.47169419878350755, + "learning_rate": 1.5977529939539794e-05, + "loss": 0.3737, + "step": 2081 + }, + { + "epoch": 0.9949226450032853, + "grad_norm": 0.4819615935924389, + "learning_rate": 1.597330689268519e-05, + "loss": 0.3776, + "step": 2082 + }, + { + "epoch": 0.9954005137088585, + "grad_norm": 0.4915767151527447, + "learning_rate": 1.5969082188940623e-05, + "loss": 0.3855, + "step": 2083 + }, + { + "epoch": 0.9958783824144316, + "grad_norm": 0.5283871739026111, + "learning_rate": 1.5964855829477946e-05, + "loss": 0.3643, + "step": 2084 + }, + { + "epoch": 0.9963562511200048, + "grad_norm": 0.5657962366230493, + "learning_rate": 1.5960627815469486e-05, + "loss": 0.3671, + "step": 2085 + }, + { + "epoch": 0.9968341198255779, + "grad_norm": 0.47929648182468015, + "learning_rate": 1.5956398148088007e-05, + "loss": 0.3733, + "step": 2086 + }, + { + "epoch": 0.9973119885311511, + "grad_norm": 0.5282895558309355, + "learning_rate": 1.5952166828506754e-05, + "loss": 0.3546, + "step": 2087 + }, + { + "epoch": 0.9977898572367242, + "grad_norm": 0.49273301808521663, + "learning_rate": 1.5947933857899418e-05, + "loss": 0.3794, + "step": 2088 + }, + { + "epoch": 0.9982677259422974, + "grad_norm": 0.5307309337337829, + "learning_rate": 1.594369923744015e-05, + "loss": 0.367, + "step": 2089 + }, + { + "epoch": 0.9987455946478705, + "grad_norm": 0.5584015875131911, + "learning_rate": 1.5939462968303554e-05, + "loss": 0.3699, + "step": 2090 + }, + { + "epoch": 0.9992234633534436, + "grad_norm": 0.5866978710305609, + "learning_rate": 1.5935225051664708e-05, + "loss": 0.3759, + "step": 2091 + }, + { + "epoch": 0.9997013320590168, + "grad_norm": 0.48873807272784037, + "learning_rate": 1.593098548869912e-05, + "loss": 0.3608, + "step": 2092 + }, + { + "epoch": 1.0, + "grad_norm": 0.5723334542119868, + "learning_rate": 1.5926744280582786e-05, + "loss": 0.3643, + "step": 2093 + }, + { + "epoch": 1.000477868705573, + "grad_norm": 0.6300673986894956, + "learning_rate": 1.5922501428492126e-05, + "loss": 0.333, + "step": 2094 + }, + { + "epoch": 1.0009557374111462, + "grad_norm": 0.5750968585044649, + "learning_rate": 1.5918256933604047e-05, + "loss": 0.3407, + "step": 2095 + }, + { + "epoch": 1.0014336061167195, + "grad_norm": 0.5649190535406365, + "learning_rate": 1.591401079709589e-05, + "loss": 0.3313, + "step": 2096 + }, + { + "epoch": 1.0019114748222926, + "grad_norm": 0.6470098562888061, + "learning_rate": 1.590976302014546e-05, + "loss": 0.3193, + "step": 2097 + }, + { + "epoch": 1.0023893435278657, + "grad_norm": 0.6124457488598071, + "learning_rate": 1.5905513603931013e-05, + "loss": 0.3453, + "step": 2098 + }, + { + "epoch": 1.0028672122334388, + "grad_norm": 0.5569653564403705, + "learning_rate": 1.5901262549631266e-05, + "loss": 0.3308, + "step": 2099 + }, + { + "epoch": 1.003345080939012, + "grad_norm": 0.5630136553754124, + "learning_rate": 1.589700985842538e-05, + "loss": 0.3401, + "step": 2100 + }, + { + "epoch": 1.0038229496445852, + "grad_norm": 0.5491390499825274, + "learning_rate": 1.5892755531492986e-05, + "loss": 0.3368, + "step": 2101 + }, + { + "epoch": 1.0043008183501583, + "grad_norm": 0.5802336855773184, + "learning_rate": 1.5888499570014152e-05, + "loss": 0.3379, + "step": 2102 + }, + { + "epoch": 1.0047786870557314, + "grad_norm": 0.5027412997145168, + "learning_rate": 1.5884241975169406e-05, + "loss": 0.3239, + "step": 2103 + }, + { + "epoch": 1.0052565557613047, + "grad_norm": 0.5219023282328462, + "learning_rate": 1.5879982748139738e-05, + "loss": 0.3316, + "step": 2104 + }, + { + "epoch": 1.0057344244668778, + "grad_norm": 0.6198051403881502, + "learning_rate": 1.5875721890106574e-05, + "loss": 0.3355, + "step": 2105 + }, + { + "epoch": 1.0062122931724509, + "grad_norm": 0.5823835508633309, + "learning_rate": 1.58714594022518e-05, + "loss": 0.3421, + "step": 2106 + }, + { + "epoch": 1.006690161878024, + "grad_norm": 0.6314654286023943, + "learning_rate": 1.586719528575776e-05, + "loss": 0.3492, + "step": 2107 + }, + { + "epoch": 1.007168030583597, + "grad_norm": 0.5494826948922337, + "learning_rate": 1.5862929541807247e-05, + "loss": 0.3187, + "step": 2108 + }, + { + "epoch": 1.0076458992891704, + "grad_norm": 0.49367247951130727, + "learning_rate": 1.5858662171583495e-05, + "loss": 0.343, + "step": 2109 + }, + { + "epoch": 1.0081237679947435, + "grad_norm": 0.5867003697639408, + "learning_rate": 1.5854393176270205e-05, + "loss": 0.3222, + "step": 2110 + }, + { + "epoch": 1.0086016367003166, + "grad_norm": 0.5202276726676689, + "learning_rate": 1.585012255705152e-05, + "loss": 0.3395, + "step": 2111 + }, + { + "epoch": 1.0090795054058896, + "grad_norm": 0.5331521180511377, + "learning_rate": 1.5845850315112025e-05, + "loss": 0.304, + "step": 2112 + }, + { + "epoch": 1.009557374111463, + "grad_norm": 0.5838843873042685, + "learning_rate": 1.5841576451636777e-05, + "loss": 0.3392, + "step": 2113 + }, + { + "epoch": 1.010035242817036, + "grad_norm": 0.5162331359042389, + "learning_rate": 1.5837300967811258e-05, + "loss": 0.343, + "step": 2114 + }, + { + "epoch": 1.0105131115226091, + "grad_norm": 0.6115190735592178, + "learning_rate": 1.5833023864821427e-05, + "loss": 0.321, + "step": 2115 + }, + { + "epoch": 1.0109909802281822, + "grad_norm": 0.5925028855716654, + "learning_rate": 1.5828745143853665e-05, + "loss": 0.3393, + "step": 2116 + }, + { + "epoch": 1.0114688489337555, + "grad_norm": 0.6132879239893447, + "learning_rate": 1.5824464806094817e-05, + "loss": 0.3292, + "step": 2117 + }, + { + "epoch": 1.0119467176393286, + "grad_norm": 0.5969110143142136, + "learning_rate": 1.5820182852732177e-05, + "loss": 0.3234, + "step": 2118 + }, + { + "epoch": 1.0124245863449017, + "grad_norm": 0.6131274616744073, + "learning_rate": 1.5815899284953477e-05, + "loss": 0.3437, + "step": 2119 + }, + { + "epoch": 1.0129024550504748, + "grad_norm": 0.5161682873500958, + "learning_rate": 1.5811614103946905e-05, + "loss": 0.3366, + "step": 2120 + }, + { + "epoch": 1.013380323756048, + "grad_norm": 0.5311306907505919, + "learning_rate": 1.5807327310901096e-05, + "loss": 0.338, + "step": 2121 + }, + { + "epoch": 1.0138581924616212, + "grad_norm": 0.5252428625482294, + "learning_rate": 1.580303890700513e-05, + "loss": 0.3215, + "step": 2122 + }, + { + "epoch": 1.0143360611671943, + "grad_norm": 0.5329298201469379, + "learning_rate": 1.579874889344854e-05, + "loss": 0.3231, + "step": 2123 + }, + { + "epoch": 1.0148139298727674, + "grad_norm": 0.8494224841188093, + "learning_rate": 1.579445727142129e-05, + "loss": 0.3209, + "step": 2124 + }, + { + "epoch": 1.0152917985783405, + "grad_norm": 0.575938313407013, + "learning_rate": 1.5790164042113805e-05, + "loss": 0.314, + "step": 2125 + }, + { + "epoch": 1.0157696672839138, + "grad_norm": 0.658256081712634, + "learning_rate": 1.5785869206716957e-05, + "loss": 0.3192, + "step": 2126 + }, + { + "epoch": 1.016247535989487, + "grad_norm": 0.7068653729035548, + "learning_rate": 1.578157276642205e-05, + "loss": 0.3306, + "step": 2127 + }, + { + "epoch": 1.01672540469506, + "grad_norm": 0.5411767755265546, + "learning_rate": 1.577727472242084e-05, + "loss": 0.3233, + "step": 2128 + }, + { + "epoch": 1.017203273400633, + "grad_norm": 0.6005936988154575, + "learning_rate": 1.577297507590553e-05, + "loss": 0.3284, + "step": 2129 + }, + { + "epoch": 1.0176811421062064, + "grad_norm": 0.5499614192701662, + "learning_rate": 1.576867382806877e-05, + "loss": 0.32, + "step": 2130 + }, + { + "epoch": 1.0181590108117795, + "grad_norm": 0.5390243829826628, + "learning_rate": 1.5764370980103652e-05, + "loss": 0.3355, + "step": 2131 + }, + { + "epoch": 1.0186368795173526, + "grad_norm": 0.5867439844608485, + "learning_rate": 1.57600665332037e-05, + "loss": 0.3323, + "step": 2132 + }, + { + "epoch": 1.0191147482229257, + "grad_norm": 0.567986991755531, + "learning_rate": 1.5755760488562898e-05, + "loss": 0.336, + "step": 2133 + }, + { + "epoch": 1.0195926169284988, + "grad_norm": 0.5275519852089237, + "learning_rate": 1.575145284737567e-05, + "loss": 0.3272, + "step": 2134 + }, + { + "epoch": 1.020070485634072, + "grad_norm": 0.5116807564883972, + "learning_rate": 1.5747143610836873e-05, + "loss": 0.3287, + "step": 2135 + }, + { + "epoch": 1.0205483543396452, + "grad_norm": 0.5309021872208404, + "learning_rate": 1.5742832780141816e-05, + "loss": 0.3517, + "step": 2136 + }, + { + "epoch": 1.0210262230452183, + "grad_norm": 0.5056039849696519, + "learning_rate": 1.573852035648625e-05, + "loss": 0.3317, + "step": 2137 + }, + { + "epoch": 1.0215040917507914, + "grad_norm": 0.5168796985969982, + "learning_rate": 1.5734206341066363e-05, + "loss": 0.3287, + "step": 2138 + }, + { + "epoch": 1.0219819604563647, + "grad_norm": 0.4971819258293254, + "learning_rate": 1.5729890735078782e-05, + "loss": 0.3465, + "step": 2139 + }, + { + "epoch": 1.0224598291619378, + "grad_norm": 0.5362308867880675, + "learning_rate": 1.5725573539720592e-05, + "loss": 0.3321, + "step": 2140 + }, + { + "epoch": 1.0229376978675109, + "grad_norm": 0.5409439946440039, + "learning_rate": 1.5721254756189293e-05, + "loss": 0.3385, + "step": 2141 + }, + { + "epoch": 1.023415566573084, + "grad_norm": 0.5435540704296682, + "learning_rate": 1.5716934385682847e-05, + "loss": 0.3117, + "step": 2142 + }, + { + "epoch": 1.0238934352786573, + "grad_norm": 0.5353171577649578, + "learning_rate": 1.5712612429399648e-05, + "loss": 0.3213, + "step": 2143 + }, + { + "epoch": 1.0243713039842304, + "grad_norm": 0.5572982587837408, + "learning_rate": 1.570828888853853e-05, + "loss": 0.325, + "step": 2144 + }, + { + "epoch": 1.0248491726898035, + "grad_norm": 0.5032264715461846, + "learning_rate": 1.570396376429877e-05, + "loss": 0.3253, + "step": 2145 + }, + { + "epoch": 1.0253270413953766, + "grad_norm": 0.5157482930173531, + "learning_rate": 1.569963705788007e-05, + "loss": 0.3426, + "step": 2146 + }, + { + "epoch": 1.0258049101009497, + "grad_norm": 0.5008263600349416, + "learning_rate": 1.56953087704826e-05, + "loss": 0.343, + "step": 2147 + }, + { + "epoch": 1.026282778806523, + "grad_norm": 0.5392376218710371, + "learning_rate": 1.5690978903306936e-05, + "loss": 0.333, + "step": 2148 + }, + { + "epoch": 1.026760647512096, + "grad_norm": 0.49909224648803696, + "learning_rate": 1.5686647457554108e-05, + "loss": 0.3223, + "step": 2149 + }, + { + "epoch": 1.0272385162176692, + "grad_norm": 0.5514337816765721, + "learning_rate": 1.5682314434425593e-05, + "loss": 0.3257, + "step": 2150 + }, + { + "epoch": 1.0277163849232422, + "grad_norm": 0.5560498032013678, + "learning_rate": 1.5677979835123282e-05, + "loss": 0.3172, + "step": 2151 + }, + { + "epoch": 1.0281942536288156, + "grad_norm": 0.497633056500836, + "learning_rate": 1.5673643660849525e-05, + "loss": 0.3472, + "step": 2152 + }, + { + "epoch": 1.0286721223343886, + "grad_norm": 0.5090101129547484, + "learning_rate": 1.5669305912807095e-05, + "loss": 0.3431, + "step": 2153 + }, + { + "epoch": 1.0291499910399617, + "grad_norm": 0.5115208530872307, + "learning_rate": 1.5664966592199213e-05, + "loss": 0.3179, + "step": 2154 + }, + { + "epoch": 1.0296278597455348, + "grad_norm": 0.4941887725586954, + "learning_rate": 1.5660625700229526e-05, + "loss": 0.3199, + "step": 2155 + }, + { + "epoch": 1.0301057284511081, + "grad_norm": 0.5051050314513967, + "learning_rate": 1.5656283238102125e-05, + "loss": 0.3503, + "step": 2156 + }, + { + "epoch": 1.0305835971566812, + "grad_norm": 0.47274872720440475, + "learning_rate": 1.5651939207021522e-05, + "loss": 0.3417, + "step": 2157 + }, + { + "epoch": 1.0310614658622543, + "grad_norm": 0.5026973690751055, + "learning_rate": 1.5647593608192685e-05, + "loss": 0.3307, + "step": 2158 + }, + { + "epoch": 1.0315393345678274, + "grad_norm": 0.506764397545058, + "learning_rate": 1.5643246442821004e-05, + "loss": 0.3446, + "step": 2159 + }, + { + "epoch": 1.0320172032734007, + "grad_norm": 0.5727967952694278, + "learning_rate": 1.5638897712112303e-05, + "loss": 0.3459, + "step": 2160 + }, + { + "epoch": 1.0324950719789738, + "grad_norm": 0.59780425526287, + "learning_rate": 1.5634547417272847e-05, + "loss": 0.3292, + "step": 2161 + }, + { + "epoch": 1.032972940684547, + "grad_norm": 0.5022372242667101, + "learning_rate": 1.5630195559509326e-05, + "loss": 0.3218, + "step": 2162 + }, + { + "epoch": 1.03345080939012, + "grad_norm": 0.5536484199445086, + "learning_rate": 1.562584214002887e-05, + "loss": 0.3037, + "step": 2163 + }, + { + "epoch": 1.033928678095693, + "grad_norm": 0.5211875586311786, + "learning_rate": 1.562148716003905e-05, + "loss": 0.347, + "step": 2164 + }, + { + "epoch": 1.0344065468012664, + "grad_norm": 0.5504469873275528, + "learning_rate": 1.561713062074785e-05, + "loss": 0.3189, + "step": 2165 + }, + { + "epoch": 1.0348844155068395, + "grad_norm": 0.5578520136556673, + "learning_rate": 1.56127725233637e-05, + "loss": 0.3397, + "step": 2166 + }, + { + "epoch": 1.0353622842124126, + "grad_norm": 0.547653966471447, + "learning_rate": 1.560841286909546e-05, + "loss": 0.3355, + "step": 2167 + }, + { + "epoch": 1.0358401529179857, + "grad_norm": 0.5470544888308998, + "learning_rate": 1.5604051659152418e-05, + "loss": 0.3302, + "step": 2168 + }, + { + "epoch": 1.036318021623559, + "grad_norm": 0.5805024891652473, + "learning_rate": 1.5599688894744304e-05, + "loss": 0.3335, + "step": 2169 + }, + { + "epoch": 1.036795890329132, + "grad_norm": 0.5136197878985662, + "learning_rate": 1.5595324577081265e-05, + "loss": 0.3295, + "step": 2170 + }, + { + "epoch": 1.0372737590347052, + "grad_norm": 0.56863395023198, + "learning_rate": 1.5590958707373886e-05, + "loss": 0.3175, + "step": 2171 + }, + { + "epoch": 1.0377516277402783, + "grad_norm": 0.5502967372343314, + "learning_rate": 1.558659128683319e-05, + "loss": 0.337, + "step": 2172 + }, + { + "epoch": 1.0382294964458516, + "grad_norm": 0.5135053364771395, + "learning_rate": 1.558222231667061e-05, + "loss": 0.3324, + "step": 2173 + }, + { + "epoch": 1.0387073651514247, + "grad_norm": 0.537644706467191, + "learning_rate": 1.5577851798098032e-05, + "loss": 0.3261, + "step": 2174 + }, + { + "epoch": 1.0391852338569978, + "grad_norm": 0.5382510505587826, + "learning_rate": 1.5573479732327758e-05, + "loss": 0.338, + "step": 2175 + }, + { + "epoch": 1.0396631025625709, + "grad_norm": 0.49204890495941894, + "learning_rate": 1.556910612057252e-05, + "loss": 0.3477, + "step": 2176 + }, + { + "epoch": 1.040140971268144, + "grad_norm": 0.523151317823713, + "learning_rate": 1.5564730964045476e-05, + "loss": 0.352, + "step": 2177 + }, + { + "epoch": 1.0406188399737173, + "grad_norm": 0.4894794188961839, + "learning_rate": 1.556035426396023e-05, + "loss": 0.3135, + "step": 2178 + }, + { + "epoch": 1.0410967086792904, + "grad_norm": 0.6266176917586536, + "learning_rate": 1.555597602153079e-05, + "loss": 0.3143, + "step": 2179 + }, + { + "epoch": 1.0415745773848635, + "grad_norm": 0.5309556876141732, + "learning_rate": 1.555159623797161e-05, + "loss": 0.3403, + "step": 2180 + }, + { + "epoch": 1.0420524460904366, + "grad_norm": 0.4931662085144718, + "learning_rate": 1.554721491449756e-05, + "loss": 0.3397, + "step": 2181 + }, + { + "epoch": 1.0425303147960099, + "grad_norm": 0.5645039752023514, + "learning_rate": 1.5542832052323943e-05, + "loss": 0.3396, + "step": 2182 + }, + { + "epoch": 1.043008183501583, + "grad_norm": 0.49990356604600616, + "learning_rate": 1.553844765266649e-05, + "loss": 0.339, + "step": 2183 + }, + { + "epoch": 1.043486052207156, + "grad_norm": 0.5468317171758479, + "learning_rate": 1.5534061716741358e-05, + "loss": 0.3219, + "step": 2184 + }, + { + "epoch": 1.0439639209127292, + "grad_norm": 1.455991273998016, + "learning_rate": 1.552967424576512e-05, + "loss": 0.3444, + "step": 2185 + }, + { + "epoch": 1.0444417896183025, + "grad_norm": 0.5138914378146582, + "learning_rate": 1.5525285240954793e-05, + "loss": 0.3437, + "step": 2186 + }, + { + "epoch": 1.0449196583238756, + "grad_norm": 0.4771708845133923, + "learning_rate": 1.55208947035278e-05, + "loss": 0.3401, + "step": 2187 + }, + { + "epoch": 1.0453975270294487, + "grad_norm": 17.211331468923103, + "learning_rate": 1.5516502634702003e-05, + "loss": 0.3233, + "step": 2188 + }, + { + "epoch": 1.0458753957350218, + "grad_norm": 0.6438223472017501, + "learning_rate": 1.5512109035695688e-05, + "loss": 0.3215, + "step": 2189 + }, + { + "epoch": 1.0463532644405948, + "grad_norm": 0.5071979054947092, + "learning_rate": 1.5507713907727557e-05, + "loss": 0.3381, + "step": 2190 + }, + { + "epoch": 1.0468311331461682, + "grad_norm": 0.5360652181387802, + "learning_rate": 1.550331725201674e-05, + "loss": 0.3309, + "step": 2191 + }, + { + "epoch": 1.0473090018517413, + "grad_norm": 0.5811188677891611, + "learning_rate": 1.54989190697828e-05, + "loss": 0.3271, + "step": 2192 + }, + { + "epoch": 1.0477868705573143, + "grad_norm": 0.49109998644777375, + "learning_rate": 1.5494519362245702e-05, + "loss": 0.3217, + "step": 2193 + }, + { + "epoch": 1.0482647392628874, + "grad_norm": 0.5249085574500518, + "learning_rate": 1.549011813062586e-05, + "loss": 0.3363, + "step": 2194 + }, + { + "epoch": 1.0487426079684607, + "grad_norm": 0.5450121620506873, + "learning_rate": 1.5485715376144087e-05, + "loss": 0.3368, + "step": 2195 + }, + { + "epoch": 1.0492204766740338, + "grad_norm": 0.5333106429783062, + "learning_rate": 1.5481311100021642e-05, + "loss": 0.3482, + "step": 2196 + }, + { + "epoch": 1.049698345379607, + "grad_norm": 0.5132325689081423, + "learning_rate": 1.5476905303480183e-05, + "loss": 0.3235, + "step": 2197 + }, + { + "epoch": 1.05017621408518, + "grad_norm": 0.5603270076262434, + "learning_rate": 1.5472497987741803e-05, + "loss": 0.3245, + "step": 2198 + }, + { + "epoch": 1.0506540827907533, + "grad_norm": 0.5083807526273509, + "learning_rate": 1.5468089154029016e-05, + "loss": 0.3382, + "step": 2199 + }, + { + "epoch": 1.0511319514963264, + "grad_norm": 0.5338317283874571, + "learning_rate": 1.5463678803564753e-05, + "loss": 0.3225, + "step": 2200 + }, + { + "epoch": 1.0516098202018995, + "grad_norm": 0.5268204769201611, + "learning_rate": 1.5459266937572367e-05, + "loss": 0.3419, + "step": 2201 + }, + { + "epoch": 1.0520876889074726, + "grad_norm": 0.4795433149023876, + "learning_rate": 1.5454853557275632e-05, + "loss": 0.3464, + "step": 2202 + }, + { + "epoch": 1.0525655576130457, + "grad_norm": 0.5808022832179441, + "learning_rate": 1.5450438663898743e-05, + "loss": 0.3177, + "step": 2203 + }, + { + "epoch": 1.053043426318619, + "grad_norm": 0.5145172026904812, + "learning_rate": 1.5446022258666313e-05, + "loss": 0.3361, + "step": 2204 + }, + { + "epoch": 1.0535212950241921, + "grad_norm": 0.47386368364154696, + "learning_rate": 1.5441604342803374e-05, + "loss": 0.3188, + "step": 2205 + }, + { + "epoch": 1.0539991637297652, + "grad_norm": 0.6038585929683551, + "learning_rate": 1.5437184917535377e-05, + "loss": 0.3415, + "step": 2206 + }, + { + "epoch": 1.0544770324353383, + "grad_norm": 0.51095983032144, + "learning_rate": 1.5432763984088195e-05, + "loss": 0.328, + "step": 2207 + }, + { + "epoch": 1.0549549011409116, + "grad_norm": 0.49778348332654354, + "learning_rate": 1.5428341543688116e-05, + "loss": 0.3249, + "step": 2208 + }, + { + "epoch": 1.0554327698464847, + "grad_norm": 0.5813291087757343, + "learning_rate": 1.542391759756185e-05, + "loss": 0.3324, + "step": 2209 + }, + { + "epoch": 1.0559106385520578, + "grad_norm": 0.5194247870224709, + "learning_rate": 1.5419492146936518e-05, + "loss": 0.3412, + "step": 2210 + }, + { + "epoch": 1.056388507257631, + "grad_norm": 0.49251753974846363, + "learning_rate": 1.5415065193039658e-05, + "loss": 0.3357, + "step": 2211 + }, + { + "epoch": 1.0568663759632042, + "grad_norm": 0.5290839901414063, + "learning_rate": 1.5410636737099238e-05, + "loss": 0.327, + "step": 2212 + }, + { + "epoch": 1.0573442446687773, + "grad_norm": 0.5447923991162895, + "learning_rate": 1.5406206780343626e-05, + "loss": 0.3332, + "step": 2213 + }, + { + "epoch": 1.0578221133743504, + "grad_norm": 0.563508010318956, + "learning_rate": 1.540177532400162e-05, + "loss": 0.3398, + "step": 2214 + }, + { + "epoch": 1.0582999820799235, + "grad_norm": 0.5608591424899783, + "learning_rate": 1.5397342369302425e-05, + "loss": 0.3121, + "step": 2215 + }, + { + "epoch": 1.0587778507854968, + "grad_norm": 0.5390541825320886, + "learning_rate": 1.5392907917475662e-05, + "loss": 0.3263, + "step": 2216 + }, + { + "epoch": 1.05925571949107, + "grad_norm": 0.6375454291641146, + "learning_rate": 1.5388471969751373e-05, + "loss": 0.3286, + "step": 2217 + }, + { + "epoch": 1.059733588196643, + "grad_norm": 0.6197563874990852, + "learning_rate": 1.538403452736001e-05, + "loss": 0.3375, + "step": 2218 + }, + { + "epoch": 1.060211456902216, + "grad_norm": 0.6485799859247667, + "learning_rate": 1.5379595591532442e-05, + "loss": 0.3498, + "step": 2219 + }, + { + "epoch": 1.0606893256077892, + "grad_norm": 0.5016640951997636, + "learning_rate": 1.5375155163499953e-05, + "loss": 0.3103, + "step": 2220 + }, + { + "epoch": 1.0611671943133625, + "grad_norm": 0.505528792203374, + "learning_rate": 1.5370713244494235e-05, + "loss": 0.3398, + "step": 2221 + }, + { + "epoch": 1.0616450630189356, + "grad_norm": 0.6607061590143123, + "learning_rate": 1.53662698357474e-05, + "loss": 0.3382, + "step": 2222 + }, + { + "epoch": 1.0621229317245087, + "grad_norm": 0.5195925433144004, + "learning_rate": 1.536182493849198e-05, + "loss": 0.3231, + "step": 2223 + }, + { + "epoch": 1.0626008004300818, + "grad_norm": 0.5789827056376714, + "learning_rate": 1.5357378553960892e-05, + "loss": 0.3313, + "step": 2224 + }, + { + "epoch": 1.063078669135655, + "grad_norm": 0.5365461420949971, + "learning_rate": 1.5352930683387502e-05, + "loss": 0.3299, + "step": 2225 + }, + { + "epoch": 1.0635565378412282, + "grad_norm": 0.512379234100911, + "learning_rate": 1.5348481328005566e-05, + "loss": 0.321, + "step": 2226 + }, + { + "epoch": 1.0640344065468013, + "grad_norm": 0.5335894256345767, + "learning_rate": 1.534403048904925e-05, + "loss": 0.3451, + "step": 2227 + }, + { + "epoch": 1.0645122752523744, + "grad_norm": 0.5464542705568193, + "learning_rate": 1.5339578167753145e-05, + "loss": 0.3389, + "step": 2228 + }, + { + "epoch": 1.0649901439579477, + "grad_norm": 0.5232741311541103, + "learning_rate": 1.5335124365352246e-05, + "loss": 0.3342, + "step": 2229 + }, + { + "epoch": 1.0654680126635208, + "grad_norm": 0.5900396821697615, + "learning_rate": 1.5330669083081956e-05, + "loss": 0.3391, + "step": 2230 + }, + { + "epoch": 1.0659458813690939, + "grad_norm": 0.5093924605652755, + "learning_rate": 1.5326212322178097e-05, + "loss": 0.3286, + "step": 2231 + }, + { + "epoch": 1.066423750074667, + "grad_norm": 0.5716841100121078, + "learning_rate": 1.5321754083876893e-05, + "loss": 0.323, + "step": 2232 + }, + { + "epoch": 1.06690161878024, + "grad_norm": 0.4842785242623549, + "learning_rate": 1.5317294369414975e-05, + "loss": 0.3408, + "step": 2233 + }, + { + "epoch": 1.0673794874858133, + "grad_norm": 0.513994095767651, + "learning_rate": 1.53128331800294e-05, + "loss": 0.3466, + "step": 2234 + }, + { + "epoch": 1.0678573561913864, + "grad_norm": 0.5758288986174495, + "learning_rate": 1.5308370516957617e-05, + "loss": 0.3217, + "step": 2235 + }, + { + "epoch": 1.0683352248969595, + "grad_norm": 0.5200971785900195, + "learning_rate": 1.5303906381437487e-05, + "loss": 0.3251, + "step": 2236 + }, + { + "epoch": 1.0688130936025326, + "grad_norm": 0.5033891597260092, + "learning_rate": 1.529944077470729e-05, + "loss": 0.3236, + "step": 2237 + }, + { + "epoch": 1.069290962308106, + "grad_norm": 0.49510942051055734, + "learning_rate": 1.52949736980057e-05, + "loss": 0.3232, + "step": 2238 + }, + { + "epoch": 1.069768831013679, + "grad_norm": 0.49918931484754825, + "learning_rate": 1.529050515257181e-05, + "loss": 0.3292, + "step": 2239 + }, + { + "epoch": 1.0702466997192521, + "grad_norm": 0.5263649316343764, + "learning_rate": 1.528603513964511e-05, + "loss": 0.3186, + "step": 2240 + }, + { + "epoch": 1.0707245684248252, + "grad_norm": 0.506050997373646, + "learning_rate": 1.528156366046551e-05, + "loss": 0.3386, + "step": 2241 + }, + { + "epoch": 1.0712024371303985, + "grad_norm": 0.4759187913918262, + "learning_rate": 1.5277090716273313e-05, + "loss": 0.3244, + "step": 2242 + }, + { + "epoch": 1.0716803058359716, + "grad_norm": 0.5170077950073158, + "learning_rate": 1.527261630830924e-05, + "loss": 0.34, + "step": 2243 + }, + { + "epoch": 1.0721581745415447, + "grad_norm": 0.5371316049130486, + "learning_rate": 1.526814043781441e-05, + "loss": 0.3139, + "step": 2244 + }, + { + "epoch": 1.0726360432471178, + "grad_norm": 0.4661135714534529, + "learning_rate": 1.5263663106030347e-05, + "loss": 0.3431, + "step": 2245 + }, + { + "epoch": 1.073113911952691, + "grad_norm": 0.5050579846579514, + "learning_rate": 1.5259184314198995e-05, + "loss": 0.3182, + "step": 2246 + }, + { + "epoch": 1.0735917806582642, + "grad_norm": 0.49161171778636, + "learning_rate": 1.5254704063562678e-05, + "loss": 0.3662, + "step": 2247 + }, + { + "epoch": 1.0740696493638373, + "grad_norm": 0.6153694810824738, + "learning_rate": 1.5250222355364149e-05, + "loss": 0.3144, + "step": 2248 + }, + { + "epoch": 1.0745475180694104, + "grad_norm": 0.4888202707715523, + "learning_rate": 1.5245739190846549e-05, + "loss": 0.3398, + "step": 2249 + }, + { + "epoch": 1.0750253867749835, + "grad_norm": 0.5124355341785777, + "learning_rate": 1.5241254571253433e-05, + "loss": 0.3548, + "step": 2250 + }, + { + "epoch": 1.0755032554805568, + "grad_norm": 0.5077616262970712, + "learning_rate": 1.5236768497828753e-05, + "loss": 0.3502, + "step": 2251 + }, + { + "epoch": 1.07598112418613, + "grad_norm": 0.53688126358671, + "learning_rate": 1.5232280971816864e-05, + "loss": 0.3308, + "step": 2252 + }, + { + "epoch": 1.076458992891703, + "grad_norm": 0.49623504126840645, + "learning_rate": 1.5227791994462529e-05, + "loss": 0.3179, + "step": 2253 + }, + { + "epoch": 1.076936861597276, + "grad_norm": 0.5575295987434887, + "learning_rate": 1.5223301567010916e-05, + "loss": 0.321, + "step": 2254 + }, + { + "epoch": 1.0774147303028494, + "grad_norm": 0.5298944158079708, + "learning_rate": 1.5218809690707583e-05, + "loss": 0.3341, + "step": 2255 + }, + { + "epoch": 1.0778925990084225, + "grad_norm": 0.5077709934710473, + "learning_rate": 1.5214316366798498e-05, + "loss": 0.3521, + "step": 2256 + }, + { + "epoch": 1.0783704677139956, + "grad_norm": 0.5439955998090592, + "learning_rate": 1.5209821596530035e-05, + "loss": 0.3184, + "step": 2257 + }, + { + "epoch": 1.0788483364195687, + "grad_norm": 0.5084561135875706, + "learning_rate": 1.5205325381148958e-05, + "loss": 0.345, + "step": 2258 + }, + { + "epoch": 1.0793262051251418, + "grad_norm": 0.5025281601561041, + "learning_rate": 1.5200827721902443e-05, + "loss": 0.3134, + "step": 2259 + }, + { + "epoch": 1.079804073830715, + "grad_norm": 0.5229250055944893, + "learning_rate": 1.5196328620038059e-05, + "loss": 0.3437, + "step": 2260 + }, + { + "epoch": 1.0802819425362882, + "grad_norm": 0.5010024267654524, + "learning_rate": 1.5191828076803776e-05, + "loss": 0.3161, + "step": 2261 + }, + { + "epoch": 1.0807598112418613, + "grad_norm": 0.5752421274767535, + "learning_rate": 1.5187326093447965e-05, + "loss": 0.346, + "step": 2262 + }, + { + "epoch": 1.0812376799474344, + "grad_norm": 0.5513552600942755, + "learning_rate": 1.5182822671219404e-05, + "loss": 0.3368, + "step": 2263 + }, + { + "epoch": 1.0817155486530077, + "grad_norm": 0.49596162792066967, + "learning_rate": 1.5178317811367254e-05, + "loss": 0.325, + "step": 2264 + }, + { + "epoch": 1.0821934173585808, + "grad_norm": 0.5479700565884935, + "learning_rate": 1.5173811515141083e-05, + "loss": 0.3346, + "step": 2265 + }, + { + "epoch": 1.0826712860641539, + "grad_norm": 0.6256543187747687, + "learning_rate": 1.516930378379087e-05, + "loss": 0.3251, + "step": 2266 + }, + { + "epoch": 1.083149154769727, + "grad_norm": 0.5689155161140599, + "learning_rate": 1.516479461856697e-05, + "loss": 0.3363, + "step": 2267 + }, + { + "epoch": 1.0836270234753003, + "grad_norm": 0.7023942415790342, + "learning_rate": 1.5160284020720144e-05, + "loss": 0.3256, + "step": 2268 + }, + { + "epoch": 1.0841048921808734, + "grad_norm": 0.4944938967568079, + "learning_rate": 1.5155771991501562e-05, + "loss": 0.3397, + "step": 2269 + }, + { + "epoch": 1.0845827608864465, + "grad_norm": 0.7545407502316881, + "learning_rate": 1.5151258532162771e-05, + "loss": 0.3239, + "step": 2270 + }, + { + "epoch": 1.0850606295920195, + "grad_norm": 0.5318090108055559, + "learning_rate": 1.5146743643955732e-05, + "loss": 0.3243, + "step": 2271 + }, + { + "epoch": 1.0855384982975926, + "grad_norm": 0.5221147872004993, + "learning_rate": 1.5142227328132797e-05, + "loss": 0.3364, + "step": 2272 + }, + { + "epoch": 1.086016367003166, + "grad_norm": 0.5704820936647907, + "learning_rate": 1.5137709585946705e-05, + "loss": 0.3423, + "step": 2273 + }, + { + "epoch": 1.086494235708739, + "grad_norm": 0.5013932264970669, + "learning_rate": 1.51331904186506e-05, + "loss": 0.3258, + "step": 2274 + }, + { + "epoch": 1.0869721044143121, + "grad_norm": 0.5257762249096876, + "learning_rate": 1.5128669827498024e-05, + "loss": 0.3246, + "step": 2275 + }, + { + "epoch": 1.0874499731198852, + "grad_norm": 0.5139561282311831, + "learning_rate": 1.5124147813742904e-05, + "loss": 0.3293, + "step": 2276 + }, + { + "epoch": 1.0879278418254585, + "grad_norm": 0.543726884049005, + "learning_rate": 1.5119624378639568e-05, + "loss": 0.3555, + "step": 2277 + }, + { + "epoch": 1.0884057105310316, + "grad_norm": 0.512178392507077, + "learning_rate": 1.511509952344274e-05, + "loss": 0.3254, + "step": 2278 + }, + { + "epoch": 1.0888835792366047, + "grad_norm": 0.5176231019242892, + "learning_rate": 1.511057324940753e-05, + "loss": 0.3253, + "step": 2279 + }, + { + "epoch": 1.0893614479421778, + "grad_norm": 0.5278862653717663, + "learning_rate": 1.5106045557789453e-05, + "loss": 0.3346, + "step": 2280 + }, + { + "epoch": 1.0898393166477511, + "grad_norm": 0.47608912937489073, + "learning_rate": 1.5101516449844407e-05, + "loss": 0.3325, + "step": 2281 + }, + { + "epoch": 1.0903171853533242, + "grad_norm": 0.5309476770511793, + "learning_rate": 1.5096985926828684e-05, + "loss": 0.351, + "step": 2282 + }, + { + "epoch": 1.0907950540588973, + "grad_norm": 0.5133648196697228, + "learning_rate": 1.5092453989998976e-05, + "loss": 0.3369, + "step": 2283 + }, + { + "epoch": 1.0912729227644704, + "grad_norm": 0.5748784259519858, + "learning_rate": 1.5087920640612361e-05, + "loss": 0.342, + "step": 2284 + }, + { + "epoch": 1.0917507914700435, + "grad_norm": 0.51205359618062, + "learning_rate": 1.5083385879926309e-05, + "loss": 0.3304, + "step": 2285 + }, + { + "epoch": 1.0922286601756168, + "grad_norm": 0.5640393858567604, + "learning_rate": 1.5078849709198687e-05, + "loss": 0.3529, + "step": 2286 + }, + { + "epoch": 1.09270652888119, + "grad_norm": 0.5141124022477992, + "learning_rate": 1.5074312129687741e-05, + "loss": 0.3425, + "step": 2287 + }, + { + "epoch": 1.093184397586763, + "grad_norm": 0.5187378970830254, + "learning_rate": 1.5069773142652119e-05, + "loss": 0.3378, + "step": 2288 + }, + { + "epoch": 1.093662266292336, + "grad_norm": 0.520566150228239, + "learning_rate": 1.506523274935086e-05, + "loss": 0.3461, + "step": 2289 + }, + { + "epoch": 1.0941401349979094, + "grad_norm": 0.5296392819928428, + "learning_rate": 1.5060690951043385e-05, + "loss": 0.3417, + "step": 2290 + }, + { + "epoch": 1.0946180037034825, + "grad_norm": 0.5195069878439671, + "learning_rate": 1.5056147748989505e-05, + "loss": 0.3234, + "step": 2291 + }, + { + "epoch": 1.0950958724090556, + "grad_norm": 0.5634053365773759, + "learning_rate": 1.5051603144449431e-05, + "loss": 0.3476, + "step": 2292 + }, + { + "epoch": 1.0955737411146287, + "grad_norm": 0.5457090574500167, + "learning_rate": 1.5047057138683753e-05, + "loss": 0.3214, + "step": 2293 + }, + { + "epoch": 1.096051609820202, + "grad_norm": 0.49911077929816133, + "learning_rate": 1.5042509732953454e-05, + "loss": 0.3414, + "step": 2294 + }, + { + "epoch": 1.096529478525775, + "grad_norm": 2.702449341602149, + "learning_rate": 1.5037960928519902e-05, + "loss": 0.3377, + "step": 2295 + }, + { + "epoch": 1.0970073472313482, + "grad_norm": 0.5565135853631065, + "learning_rate": 1.5033410726644859e-05, + "loss": 0.3343, + "step": 2296 + }, + { + "epoch": 1.0974852159369213, + "grad_norm": 0.5073307379974941, + "learning_rate": 1.5028859128590468e-05, + "loss": 0.327, + "step": 2297 + }, + { + "epoch": 1.0979630846424944, + "grad_norm": 0.5160051255941388, + "learning_rate": 1.502430613561926e-05, + "loss": 0.344, + "step": 2298 + }, + { + "epoch": 1.0984409533480677, + "grad_norm": 0.5348498573334795, + "learning_rate": 1.5019751748994158e-05, + "loss": 0.3494, + "step": 2299 + }, + { + "epoch": 1.0989188220536408, + "grad_norm": 0.4865115046264663, + "learning_rate": 1.501519596997847e-05, + "loss": 0.3435, + "step": 2300 + }, + { + "epoch": 1.0993966907592139, + "grad_norm": 0.5216309305575126, + "learning_rate": 1.5010638799835884e-05, + "loss": 0.33, + "step": 2301 + }, + { + "epoch": 1.099874559464787, + "grad_norm": 0.539223989848975, + "learning_rate": 1.5006080239830483e-05, + "loss": 0.3361, + "step": 2302 + }, + { + "epoch": 1.1003524281703603, + "grad_norm": 0.5001901728249805, + "learning_rate": 1.5001520291226727e-05, + "loss": 0.3236, + "step": 2303 + }, + { + "epoch": 1.1008302968759334, + "grad_norm": 0.515833509744522, + "learning_rate": 1.499695895528947e-05, + "loss": 0.3476, + "step": 2304 + }, + { + "epoch": 1.1013081655815065, + "grad_norm": 0.5136798035818348, + "learning_rate": 1.499239623328394e-05, + "loss": 0.346, + "step": 2305 + }, + { + "epoch": 1.1017860342870796, + "grad_norm": 0.5181371149654749, + "learning_rate": 1.4987832126475763e-05, + "loss": 0.3427, + "step": 2306 + }, + { + "epoch": 1.1022639029926529, + "grad_norm": 0.5071439491474284, + "learning_rate": 1.4983266636130935e-05, + "loss": 0.3404, + "step": 2307 + }, + { + "epoch": 1.102741771698226, + "grad_norm": 0.5024865933279972, + "learning_rate": 1.4978699763515848e-05, + "loss": 0.3408, + "step": 2308 + }, + { + "epoch": 1.103219640403799, + "grad_norm": 0.6901876193242537, + "learning_rate": 1.4974131509897269e-05, + "loss": 0.3246, + "step": 2309 + }, + { + "epoch": 1.1036975091093721, + "grad_norm": 0.5498690262019805, + "learning_rate": 1.4969561876542348e-05, + "loss": 0.339, + "step": 2310 + }, + { + "epoch": 1.1041753778149452, + "grad_norm": 0.5305378391568346, + "learning_rate": 1.4964990864718627e-05, + "loss": 0.324, + "step": 2311 + }, + { + "epoch": 1.1046532465205186, + "grad_norm": 0.6920996981731619, + "learning_rate": 1.496041847569402e-05, + "loss": 0.327, + "step": 2312 + }, + { + "epoch": 1.1051311152260916, + "grad_norm": 0.5051871088425068, + "learning_rate": 1.4955844710736829e-05, + "loss": 0.324, + "step": 2313 + }, + { + "epoch": 1.1056089839316647, + "grad_norm": 0.5231668443052977, + "learning_rate": 1.4951269571115735e-05, + "loss": 0.3192, + "step": 2314 + }, + { + "epoch": 1.1060868526372378, + "grad_norm": 0.5107337355608277, + "learning_rate": 1.4946693058099802e-05, + "loss": 0.3187, + "step": 2315 + }, + { + "epoch": 1.1065647213428111, + "grad_norm": 0.504994234754607, + "learning_rate": 1.494211517295847e-05, + "loss": 0.3274, + "step": 2316 + }, + { + "epoch": 1.1070425900483842, + "grad_norm": 0.6160822307451901, + "learning_rate": 1.4937535916961568e-05, + "loss": 0.3296, + "step": 2317 + }, + { + "epoch": 1.1075204587539573, + "grad_norm": 0.4975876476425877, + "learning_rate": 1.49329552913793e-05, + "loss": 0.3389, + "step": 2318 + }, + { + "epoch": 1.1079983274595304, + "grad_norm": 0.5403670488971715, + "learning_rate": 1.4928373297482249e-05, + "loss": 0.3455, + "step": 2319 + }, + { + "epoch": 1.1084761961651037, + "grad_norm": 0.529726563264269, + "learning_rate": 1.4923789936541378e-05, + "loss": 0.3406, + "step": 2320 + }, + { + "epoch": 1.1089540648706768, + "grad_norm": 2.35075832261799, + "learning_rate": 1.4919205209828037e-05, + "loss": 0.3349, + "step": 2321 + }, + { + "epoch": 1.10943193357625, + "grad_norm": 0.5425366907719269, + "learning_rate": 1.4914619118613942e-05, + "loss": 0.3317, + "step": 2322 + }, + { + "epoch": 1.109909802281823, + "grad_norm": 0.5093459744924752, + "learning_rate": 1.4910031664171195e-05, + "loss": 0.3486, + "step": 2323 + }, + { + "epoch": 1.110387670987396, + "grad_norm": 0.5160824382283055, + "learning_rate": 1.4905442847772278e-05, + "loss": 0.3339, + "step": 2324 + }, + { + "epoch": 1.1108655396929694, + "grad_norm": 0.5457445322763627, + "learning_rate": 1.4900852670690044e-05, + "loss": 0.3391, + "step": 2325 + }, + { + "epoch": 1.1113434083985425, + "grad_norm": 0.5181476585227104, + "learning_rate": 1.4896261134197729e-05, + "loss": 0.3454, + "step": 2326 + }, + { + "epoch": 1.1118212771041156, + "grad_norm": 0.5151616532814238, + "learning_rate": 1.4891668239568943e-05, + "loss": 0.3323, + "step": 2327 + }, + { + "epoch": 1.1122991458096887, + "grad_norm": 0.48685718879651513, + "learning_rate": 1.4887073988077678e-05, + "loss": 0.3325, + "step": 2328 + }, + { + "epoch": 1.112777014515262, + "grad_norm": 0.6039961072105209, + "learning_rate": 1.4882478380998291e-05, + "loss": 0.3354, + "step": 2329 + }, + { + "epoch": 1.113254883220835, + "grad_norm": 0.49114939066967817, + "learning_rate": 1.487788141960553e-05, + "loss": 0.3262, + "step": 2330 + }, + { + "epoch": 1.1137327519264082, + "grad_norm": 0.46862190557531513, + "learning_rate": 1.4873283105174504e-05, + "loss": 0.3366, + "step": 2331 + }, + { + "epoch": 1.1142106206319813, + "grad_norm": 0.47383978484154615, + "learning_rate": 1.4868683438980714e-05, + "loss": 0.3241, + "step": 2332 + }, + { + "epoch": 1.1146884893375546, + "grad_norm": 0.48438365346141576, + "learning_rate": 1.4864082422300015e-05, + "loss": 0.3239, + "step": 2333 + }, + { + "epoch": 1.1151663580431277, + "grad_norm": 0.49902567827730593, + "learning_rate": 1.4859480056408653e-05, + "loss": 0.3343, + "step": 2334 + }, + { + "epoch": 1.1156442267487008, + "grad_norm": 0.5240825256150454, + "learning_rate": 1.4854876342583246e-05, + "loss": 0.3525, + "step": 2335 + }, + { + "epoch": 1.1161220954542739, + "grad_norm": 0.6398611930145781, + "learning_rate": 1.4850271282100779e-05, + "loss": 0.3283, + "step": 2336 + }, + { + "epoch": 1.116599964159847, + "grad_norm": 0.5213749367846112, + "learning_rate": 1.4845664876238615e-05, + "loss": 0.3461, + "step": 2337 + }, + { + "epoch": 1.1170778328654203, + "grad_norm": 0.5017747641723227, + "learning_rate": 1.4841057126274491e-05, + "loss": 0.3476, + "step": 2338 + }, + { + "epoch": 1.1175557015709934, + "grad_norm": 0.5121891657183757, + "learning_rate": 1.4836448033486513e-05, + "loss": 0.3116, + "step": 2339 + }, + { + "epoch": 1.1180335702765665, + "grad_norm": 0.490973436785604, + "learning_rate": 1.4831837599153165e-05, + "loss": 0.3479, + "step": 2340 + }, + { + "epoch": 1.1185114389821396, + "grad_norm": 0.4995891989120494, + "learning_rate": 1.4827225824553302e-05, + "loss": 0.3216, + "step": 2341 + }, + { + "epoch": 1.1189893076877129, + "grad_norm": 0.48709022056044177, + "learning_rate": 1.4822612710966143e-05, + "loss": 0.3325, + "step": 2342 + }, + { + "epoch": 1.119467176393286, + "grad_norm": 0.5110741917418703, + "learning_rate": 1.4817998259671286e-05, + "loss": 0.3411, + "step": 2343 + }, + { + "epoch": 1.119945045098859, + "grad_norm": 0.526551969506569, + "learning_rate": 1.4813382471948705e-05, + "loss": 0.3283, + "step": 2344 + }, + { + "epoch": 1.1204229138044322, + "grad_norm": 0.487810166808925, + "learning_rate": 1.4808765349078729e-05, + "loss": 0.3299, + "step": 2345 + }, + { + "epoch": 1.1209007825100055, + "grad_norm": 0.5296951175479421, + "learning_rate": 1.4804146892342071e-05, + "loss": 0.3379, + "step": 2346 + }, + { + "epoch": 1.1213786512155786, + "grad_norm": 0.5101831769474524, + "learning_rate": 1.4799527103019808e-05, + "loss": 0.3271, + "step": 2347 + }, + { + "epoch": 1.1218565199211517, + "grad_norm": 0.4890714139873493, + "learning_rate": 1.4794905982393388e-05, + "loss": 0.3395, + "step": 2348 + }, + { + "epoch": 1.1223343886267247, + "grad_norm": 0.5450992295068559, + "learning_rate": 1.4790283531744634e-05, + "loss": 0.3446, + "step": 2349 + }, + { + "epoch": 1.1228122573322978, + "grad_norm": 0.5031828458579483, + "learning_rate": 1.4785659752355724e-05, + "loss": 0.3466, + "step": 2350 + }, + { + "epoch": 1.1232901260378712, + "grad_norm": 0.4811659005185453, + "learning_rate": 1.4781034645509216e-05, + "loss": 0.3485, + "step": 2351 + }, + { + "epoch": 1.1237679947434442, + "grad_norm": 0.5448333215238423, + "learning_rate": 1.4776408212488035e-05, + "loss": 0.3263, + "step": 2352 + }, + { + "epoch": 1.1242458634490173, + "grad_norm": 0.4953819912341556, + "learning_rate": 1.4771780454575468e-05, + "loss": 0.3272, + "step": 2353 + }, + { + "epoch": 1.1247237321545904, + "grad_norm": 0.5175483590354045, + "learning_rate": 1.4767151373055178e-05, + "loss": 0.3269, + "step": 2354 + }, + { + "epoch": 1.1252016008601637, + "grad_norm": 0.5840296576158932, + "learning_rate": 1.4762520969211186e-05, + "loss": 0.3311, + "step": 2355 + }, + { + "epoch": 1.1256794695657368, + "grad_norm": 0.5464923552621009, + "learning_rate": 1.4757889244327888e-05, + "loss": 0.3181, + "step": 2356 + }, + { + "epoch": 1.12615733827131, + "grad_norm": 0.5188633658259454, + "learning_rate": 1.475325619969004e-05, + "loss": 0.3477, + "step": 2357 + }, + { + "epoch": 1.126635206976883, + "grad_norm": 0.4888981161848857, + "learning_rate": 1.4748621836582772e-05, + "loss": 0.3253, + "step": 2358 + }, + { + "epoch": 1.1271130756824563, + "grad_norm": 0.5231748369329128, + "learning_rate": 1.4743986156291568e-05, + "loss": 0.3304, + "step": 2359 + }, + { + "epoch": 1.1275909443880294, + "grad_norm": 0.4988116880762236, + "learning_rate": 1.4739349160102285e-05, + "loss": 0.3313, + "step": 2360 + }, + { + "epoch": 1.1280688130936025, + "grad_norm": 0.5036551455737499, + "learning_rate": 1.4734710849301151e-05, + "loss": 0.3258, + "step": 2361 + }, + { + "epoch": 1.1285466817991756, + "grad_norm": 0.5138628273300756, + "learning_rate": 1.473007122517474e-05, + "loss": 0.3305, + "step": 2362 + }, + { + "epoch": 1.1290245505047487, + "grad_norm": 0.5138521817190567, + "learning_rate": 1.472543028901001e-05, + "loss": 0.3438, + "step": 2363 + }, + { + "epoch": 1.129502419210322, + "grad_norm": 0.5299139873711323, + "learning_rate": 1.4720788042094273e-05, + "loss": 0.3243, + "step": 2364 + }, + { + "epoch": 1.129980287915895, + "grad_norm": 0.5461978922038623, + "learning_rate": 1.4716144485715209e-05, + "loss": 0.3236, + "step": 2365 + }, + { + "epoch": 1.1304581566214682, + "grad_norm": 0.5311350713847056, + "learning_rate": 1.4711499621160855e-05, + "loss": 0.3174, + "step": 2366 + }, + { + "epoch": 1.1309360253270415, + "grad_norm": 0.5009730559856843, + "learning_rate": 1.4706853449719614e-05, + "loss": 0.3376, + "step": 2367 + }, + { + "epoch": 1.1314138940326146, + "grad_norm": 0.5018126239017102, + "learning_rate": 1.4702205972680254e-05, + "loss": 0.3328, + "step": 2368 + }, + { + "epoch": 1.1318917627381877, + "grad_norm": 0.5242488655503064, + "learning_rate": 1.4697557191331902e-05, + "loss": 0.3193, + "step": 2369 + }, + { + "epoch": 1.1323696314437608, + "grad_norm": 0.5945549331790201, + "learning_rate": 1.4692907106964051e-05, + "loss": 0.3345, + "step": 2370 + }, + { + "epoch": 1.1328475001493339, + "grad_norm": 0.48153137632362025, + "learning_rate": 1.468825572086655e-05, + "loss": 0.3378, + "step": 2371 + }, + { + "epoch": 1.1333253688549072, + "grad_norm": 0.5151533016446908, + "learning_rate": 1.4683603034329608e-05, + "loss": 0.3168, + "step": 2372 + }, + { + "epoch": 1.1338032375604803, + "grad_norm": 0.49149298419228526, + "learning_rate": 1.4678949048643806e-05, + "loss": 0.3354, + "step": 2373 + }, + { + "epoch": 1.1342811062660534, + "grad_norm": 0.48897908673118196, + "learning_rate": 1.4674293765100069e-05, + "loss": 0.3339, + "step": 2374 + }, + { + "epoch": 1.1347589749716265, + "grad_norm": 0.4949702713316802, + "learning_rate": 1.4669637184989696e-05, + "loss": 0.3257, + "step": 2375 + }, + { + "epoch": 1.1352368436771996, + "grad_norm": 0.5058801752873772, + "learning_rate": 1.466497930960434e-05, + "loss": 0.3519, + "step": 2376 + }, + { + "epoch": 1.1357147123827729, + "grad_norm": 0.5237225993986185, + "learning_rate": 1.4660320140236014e-05, + "loss": 0.3322, + "step": 2377 + }, + { + "epoch": 1.136192581088346, + "grad_norm": 0.5177111438994864, + "learning_rate": 1.4655659678177084e-05, + "loss": 0.336, + "step": 2378 + }, + { + "epoch": 1.136670449793919, + "grad_norm": 0.5012469860916707, + "learning_rate": 1.4650997924720288e-05, + "loss": 0.32, + "step": 2379 + }, + { + "epoch": 1.1371483184994924, + "grad_norm": 0.5648557105154629, + "learning_rate": 1.4646334881158704e-05, + "loss": 0.3446, + "step": 2380 + }, + { + "epoch": 1.1376261872050655, + "grad_norm": 0.5492302394701447, + "learning_rate": 1.4641670548785788e-05, + "loss": 0.3237, + "step": 2381 + }, + { + "epoch": 1.1381040559106386, + "grad_norm": 0.5032033816755968, + "learning_rate": 1.4637004928895337e-05, + "loss": 0.336, + "step": 2382 + }, + { + "epoch": 1.1385819246162117, + "grad_norm": 0.5123273009616124, + "learning_rate": 1.4632338022781516e-05, + "loss": 0.3177, + "step": 2383 + }, + { + "epoch": 1.1390597933217848, + "grad_norm": 0.5450557175277998, + "learning_rate": 1.462766983173884e-05, + "loss": 0.3262, + "step": 2384 + }, + { + "epoch": 1.139537662027358, + "grad_norm": 0.5720917143183127, + "learning_rate": 1.4623000357062184e-05, + "loss": 0.3367, + "step": 2385 + }, + { + "epoch": 1.1400155307329312, + "grad_norm": 0.5807405509549664, + "learning_rate": 1.4618329600046774e-05, + "loss": 0.347, + "step": 2386 + }, + { + "epoch": 1.1404933994385043, + "grad_norm": 0.5038498790994034, + "learning_rate": 1.4613657561988203e-05, + "loss": 0.3368, + "step": 2387 + }, + { + "epoch": 1.1409712681440773, + "grad_norm": 0.4737365464414105, + "learning_rate": 1.4608984244182407e-05, + "loss": 0.3223, + "step": 2388 + }, + { + "epoch": 1.1414491368496504, + "grad_norm": 0.5476800314149111, + "learning_rate": 1.4604309647925683e-05, + "loss": 0.335, + "step": 2389 + }, + { + "epoch": 1.1419270055552238, + "grad_norm": 0.522067736414712, + "learning_rate": 1.459963377451468e-05, + "loss": 0.3407, + "step": 2390 + }, + { + "epoch": 1.1424048742607968, + "grad_norm": 0.5362139688122035, + "learning_rate": 1.4594956625246406e-05, + "loss": 0.3258, + "step": 2391 + }, + { + "epoch": 1.14288274296637, + "grad_norm": 0.5547169113461832, + "learning_rate": 1.4590278201418216e-05, + "loss": 0.3184, + "step": 2392 + }, + { + "epoch": 1.1433606116719432, + "grad_norm": 0.5163474508287502, + "learning_rate": 1.4585598504327824e-05, + "loss": 0.3507, + "step": 2393 + }, + { + "epoch": 1.1438384803775163, + "grad_norm": 0.5023941568737598, + "learning_rate": 1.4580917535273295e-05, + "loss": 0.3541, + "step": 2394 + }, + { + "epoch": 1.1443163490830894, + "grad_norm": 0.5253487591862723, + "learning_rate": 1.457623529555305e-05, + "loss": 0.3262, + "step": 2395 + }, + { + "epoch": 1.1447942177886625, + "grad_norm": 0.534817117306971, + "learning_rate": 1.4571551786465857e-05, + "loss": 0.3324, + "step": 2396 + }, + { + "epoch": 1.1452720864942356, + "grad_norm": 0.48543838148344, + "learning_rate": 1.4566867009310834e-05, + "loss": 0.3483, + "step": 2397 + }, + { + "epoch": 1.145749955199809, + "grad_norm": 0.5361413462770422, + "learning_rate": 1.4562180965387466e-05, + "loss": 0.3236, + "step": 2398 + }, + { + "epoch": 1.146227823905382, + "grad_norm": 0.5551669716622398, + "learning_rate": 1.4557493655995574e-05, + "loss": 0.3553, + "step": 2399 + }, + { + "epoch": 1.1467056926109551, + "grad_norm": 0.5044244521639527, + "learning_rate": 1.4552805082435333e-05, + "loss": 0.3353, + "step": 2400 + }, + { + "epoch": 1.1471835613165282, + "grad_norm": 0.5363754449521199, + "learning_rate": 1.4548115246007274e-05, + "loss": 0.3359, + "step": 2401 + }, + { + "epoch": 1.1476614300221013, + "grad_norm": 0.513128864486388, + "learning_rate": 1.4543424148012271e-05, + "loss": 0.3385, + "step": 2402 + }, + { + "epoch": 1.1481392987276746, + "grad_norm": 0.46287451620253967, + "learning_rate": 1.4538731789751553e-05, + "loss": 0.3286, + "step": 2403 + }, + { + "epoch": 1.1486171674332477, + "grad_norm": 0.5195791493585902, + "learning_rate": 1.45340381725267e-05, + "loss": 0.34, + "step": 2404 + }, + { + "epoch": 1.1490950361388208, + "grad_norm": 0.48538977574421593, + "learning_rate": 1.4529343297639638e-05, + "loss": 0.3307, + "step": 2405 + }, + { + "epoch": 1.1495729048443941, + "grad_norm": 0.5095807086855236, + "learning_rate": 1.452464716639264e-05, + "loss": 0.3502, + "step": 2406 + }, + { + "epoch": 1.1500507735499672, + "grad_norm": 0.5122639691694912, + "learning_rate": 1.4519949780088334e-05, + "loss": 0.3292, + "step": 2407 + }, + { + "epoch": 1.1505286422555403, + "grad_norm": 0.5380110437402467, + "learning_rate": 1.4515251140029687e-05, + "loss": 0.3302, + "step": 2408 + }, + { + "epoch": 1.1510065109611134, + "grad_norm": 0.5213594922561937, + "learning_rate": 1.451055124752002e-05, + "loss": 0.3154, + "step": 2409 + }, + { + "epoch": 1.1514843796666865, + "grad_norm": 0.5161145005085588, + "learning_rate": 1.4505850103863007e-05, + "loss": 0.3373, + "step": 2410 + }, + { + "epoch": 1.1519622483722598, + "grad_norm": 0.5206093606448915, + "learning_rate": 1.4501147710362653e-05, + "loss": 0.3371, + "step": 2411 + }, + { + "epoch": 1.152440117077833, + "grad_norm": 0.5091973446696731, + "learning_rate": 1.4496444068323322e-05, + "loss": 0.3327, + "step": 2412 + }, + { + "epoch": 1.152917985783406, + "grad_norm": 0.5085178563346524, + "learning_rate": 1.4491739179049725e-05, + "loss": 0.3239, + "step": 2413 + }, + { + "epoch": 1.153395854488979, + "grad_norm": 0.5268216105789499, + "learning_rate": 1.448703304384691e-05, + "loss": 0.3334, + "step": 2414 + }, + { + "epoch": 1.1538737231945524, + "grad_norm": 0.5068318386596699, + "learning_rate": 1.448232566402028e-05, + "loss": 0.3527, + "step": 2415 + }, + { + "epoch": 1.1543515919001255, + "grad_norm": 0.5258368149602686, + "learning_rate": 1.4477617040875577e-05, + "loss": 0.3258, + "step": 2416 + }, + { + "epoch": 1.1548294606056986, + "grad_norm": 0.47858102510009803, + "learning_rate": 1.4472907175718893e-05, + "loss": 0.3402, + "step": 2417 + }, + { + "epoch": 1.1553073293112717, + "grad_norm": 0.5821829343093783, + "learning_rate": 1.4468196069856658e-05, + "loss": 0.3197, + "step": 2418 + }, + { + "epoch": 1.155785198016845, + "grad_norm": 0.473803724054973, + "learning_rate": 1.4463483724595651e-05, + "loss": 0.3324, + "step": 2419 + }, + { + "epoch": 1.156263066722418, + "grad_norm": 0.512993585884921, + "learning_rate": 1.4458770141242992e-05, + "loss": 0.3341, + "step": 2420 + }, + { + "epoch": 1.1567409354279912, + "grad_norm": 0.49790696160613324, + "learning_rate": 1.4454055321106148e-05, + "loss": 0.3195, + "step": 2421 + }, + { + "epoch": 1.1572188041335643, + "grad_norm": 0.4852347254650604, + "learning_rate": 1.4449339265492927e-05, + "loss": 0.3296, + "step": 2422 + }, + { + "epoch": 1.1576966728391374, + "grad_norm": 0.5307482887330455, + "learning_rate": 1.4444621975711477e-05, + "loss": 0.3257, + "step": 2423 + }, + { + "epoch": 1.1581745415447107, + "grad_norm": 0.5158352775082153, + "learning_rate": 1.4439903453070294e-05, + "loss": 0.3439, + "step": 2424 + }, + { + "epoch": 1.1586524102502838, + "grad_norm": 0.5214779865598925, + "learning_rate": 1.4435183698878212e-05, + "loss": 0.3232, + "step": 2425 + }, + { + "epoch": 1.1591302789558569, + "grad_norm": 0.5149532717689103, + "learning_rate": 1.4430462714444406e-05, + "loss": 0.3374, + "step": 2426 + }, + { + "epoch": 1.15960814766143, + "grad_norm": 0.5313361793801107, + "learning_rate": 1.4425740501078396e-05, + "loss": 0.3196, + "step": 2427 + }, + { + "epoch": 1.1600860163670033, + "grad_norm": 0.49763038284494304, + "learning_rate": 1.4421017060090041e-05, + "loss": 0.3389, + "step": 2428 + }, + { + "epoch": 1.1605638850725764, + "grad_norm": 0.5899500716656553, + "learning_rate": 1.4416292392789538e-05, + "loss": 0.3331, + "step": 2429 + }, + { + "epoch": 1.1610417537781494, + "grad_norm": 0.4872995509813868, + "learning_rate": 1.4411566500487425e-05, + "loss": 0.3466, + "step": 2430 + }, + { + "epoch": 1.1615196224837225, + "grad_norm": 0.4954045549900169, + "learning_rate": 1.4406839384494585e-05, + "loss": 0.3374, + "step": 2431 + }, + { + "epoch": 1.1619974911892959, + "grad_norm": 0.4851398103962068, + "learning_rate": 1.4402111046122234e-05, + "loss": 0.3233, + "step": 2432 + }, + { + "epoch": 1.162475359894869, + "grad_norm": 0.49511298286491195, + "learning_rate": 1.4397381486681931e-05, + "loss": 0.3317, + "step": 2433 + }, + { + "epoch": 1.162953228600442, + "grad_norm": 0.4953595652477772, + "learning_rate": 1.4392650707485574e-05, + "loss": 0.32, + "step": 2434 + }, + { + "epoch": 1.1634310973060151, + "grad_norm": 0.5090056906997068, + "learning_rate": 1.4387918709845395e-05, + "loss": 0.3406, + "step": 2435 + }, + { + "epoch": 1.1639089660115882, + "grad_norm": 0.47175952406629523, + "learning_rate": 1.4383185495073968e-05, + "loss": 0.351, + "step": 2436 + }, + { + "epoch": 1.1643868347171615, + "grad_norm": 0.5195708641390865, + "learning_rate": 1.43784510644842e-05, + "loss": 0.327, + "step": 2437 + }, + { + "epoch": 1.1648647034227346, + "grad_norm": 0.49425130257319605, + "learning_rate": 1.4373715419389345e-05, + "loss": 0.3203, + "step": 2438 + }, + { + "epoch": 1.1653425721283077, + "grad_norm": 0.49907031699241045, + "learning_rate": 1.4368978561102982e-05, + "loss": 0.3254, + "step": 2439 + }, + { + "epoch": 1.1658204408338808, + "grad_norm": 0.5690059412126197, + "learning_rate": 1.4364240490939032e-05, + "loss": 0.339, + "step": 2440 + }, + { + "epoch": 1.1662983095394541, + "grad_norm": 0.48782220485318945, + "learning_rate": 1.4359501210211754e-05, + "loss": 0.3175, + "step": 2441 + }, + { + "epoch": 1.1667761782450272, + "grad_norm": 0.48282271375944663, + "learning_rate": 1.4354760720235743e-05, + "loss": 0.3276, + "step": 2442 + }, + { + "epoch": 1.1672540469506003, + "grad_norm": 0.4907377583967225, + "learning_rate": 1.4350019022325925e-05, + "loss": 0.332, + "step": 2443 + }, + { + "epoch": 1.1677319156561734, + "grad_norm": 0.5201773113156611, + "learning_rate": 1.434527611779756e-05, + "loss": 0.3322, + "step": 2444 + }, + { + "epoch": 1.1682097843617467, + "grad_norm": 0.5073601879220666, + "learning_rate": 1.4340532007966252e-05, + "loss": 0.344, + "step": 2445 + }, + { + "epoch": 1.1686876530673198, + "grad_norm": 0.4966658318824344, + "learning_rate": 1.4335786694147931e-05, + "loss": 0.3286, + "step": 2446 + }, + { + "epoch": 1.169165521772893, + "grad_norm": 0.47795836779748363, + "learning_rate": 1.4331040177658859e-05, + "loss": 0.3203, + "step": 2447 + }, + { + "epoch": 1.169643390478466, + "grad_norm": 0.5167223624760666, + "learning_rate": 1.4326292459815642e-05, + "loss": 0.3289, + "step": 2448 + }, + { + "epoch": 1.170121259184039, + "grad_norm": 0.4943415537304743, + "learning_rate": 1.4321543541935213e-05, + "loss": 0.3412, + "step": 2449 + }, + { + "epoch": 1.1705991278896124, + "grad_norm": 0.5447813509352554, + "learning_rate": 1.4316793425334836e-05, + "loss": 0.3317, + "step": 2450 + }, + { + "epoch": 1.1710769965951855, + "grad_norm": 0.6040250815392916, + "learning_rate": 1.4312042111332108e-05, + "loss": 0.319, + "step": 2451 + }, + { + "epoch": 1.1715548653007586, + "grad_norm": 0.4897011714819888, + "learning_rate": 1.4307289601244963e-05, + "loss": 0.3253, + "step": 2452 + }, + { + "epoch": 1.1720327340063317, + "grad_norm": 0.519171989422915, + "learning_rate": 1.4302535896391661e-05, + "loss": 0.3387, + "step": 2453 + }, + { + "epoch": 1.172510602711905, + "grad_norm": 0.5310376803286037, + "learning_rate": 1.4297780998090795e-05, + "loss": 0.3306, + "step": 2454 + }, + { + "epoch": 1.172988471417478, + "grad_norm": 0.5546851814204874, + "learning_rate": 1.4293024907661295e-05, + "loss": 0.3302, + "step": 2455 + }, + { + "epoch": 1.1734663401230512, + "grad_norm": 0.5438187932879726, + "learning_rate": 1.4288267626422411e-05, + "loss": 0.3306, + "step": 2456 + }, + { + "epoch": 1.1739442088286243, + "grad_norm": 0.5999235748445584, + "learning_rate": 1.4283509155693734e-05, + "loss": 0.3299, + "step": 2457 + }, + { + "epoch": 1.1744220775341976, + "grad_norm": 0.5267230789014691, + "learning_rate": 1.4278749496795174e-05, + "loss": 0.3385, + "step": 2458 + }, + { + "epoch": 1.1748999462397707, + "grad_norm": 0.49465625934833485, + "learning_rate": 1.4273988651046982e-05, + "loss": 0.3446, + "step": 2459 + }, + { + "epoch": 1.1753778149453438, + "grad_norm": 0.520908355310508, + "learning_rate": 1.4269226619769727e-05, + "loss": 0.3317, + "step": 2460 + }, + { + "epoch": 1.1758556836509169, + "grad_norm": 0.49990608969106165, + "learning_rate": 1.4264463404284317e-05, + "loss": 0.3267, + "step": 2461 + }, + { + "epoch": 1.17633355235649, + "grad_norm": 0.49260831125565, + "learning_rate": 1.4259699005911984e-05, + "loss": 0.334, + "step": 2462 + }, + { + "epoch": 1.1768114210620633, + "grad_norm": 0.5685639467668431, + "learning_rate": 1.4254933425974284e-05, + "loss": 0.3169, + "step": 2463 + }, + { + "epoch": 1.1772892897676364, + "grad_norm": 0.5053854894890485, + "learning_rate": 1.4250166665793106e-05, + "loss": 0.3172, + "step": 2464 + }, + { + "epoch": 1.1777671584732095, + "grad_norm": 0.5094103880275894, + "learning_rate": 1.424539872669067e-05, + "loss": 0.3371, + "step": 2465 + }, + { + "epoch": 1.1782450271787825, + "grad_norm": 0.4946974997604784, + "learning_rate": 1.4240629609989513e-05, + "loss": 0.3334, + "step": 2466 + }, + { + "epoch": 1.1787228958843559, + "grad_norm": 0.5018625213670579, + "learning_rate": 1.4235859317012506e-05, + "loss": 0.321, + "step": 2467 + }, + { + "epoch": 1.179200764589929, + "grad_norm": 0.4709704159413237, + "learning_rate": 1.4231087849082848e-05, + "loss": 0.3285, + "step": 2468 + }, + { + "epoch": 1.179678633295502, + "grad_norm": 0.49926378629859847, + "learning_rate": 1.4226315207524049e-05, + "loss": 0.3313, + "step": 2469 + }, + { + "epoch": 1.1801565020010751, + "grad_norm": 0.512829518968713, + "learning_rate": 1.4221541393659966e-05, + "loss": 0.3374, + "step": 2470 + }, + { + "epoch": 1.1806343707066485, + "grad_norm": 0.49421509200107594, + "learning_rate": 1.4216766408814766e-05, + "loss": 0.3326, + "step": 2471 + }, + { + "epoch": 1.1811122394122215, + "grad_norm": 0.5693138964760737, + "learning_rate": 1.4211990254312948e-05, + "loss": 0.3413, + "step": 2472 + }, + { + "epoch": 1.1815901081177946, + "grad_norm": 0.4758930284278045, + "learning_rate": 1.4207212931479331e-05, + "loss": 0.3227, + "step": 2473 + }, + { + "epoch": 1.1820679768233677, + "grad_norm": 0.5414563707616173, + "learning_rate": 1.4202434441639061e-05, + "loss": 0.3565, + "step": 2474 + }, + { + "epoch": 1.1825458455289408, + "grad_norm": 0.47934944618315223, + "learning_rate": 1.4197654786117604e-05, + "loss": 0.3228, + "step": 2475 + }, + { + "epoch": 1.1830237142345141, + "grad_norm": 0.6167228683042635, + "learning_rate": 1.419287396624076e-05, + "loss": 0.3194, + "step": 2476 + }, + { + "epoch": 1.1835015829400872, + "grad_norm": 0.6097773368996701, + "learning_rate": 1.4188091983334636e-05, + "loss": 0.3344, + "step": 2477 + }, + { + "epoch": 1.1839794516456603, + "grad_norm": 0.5410843300407218, + "learning_rate": 1.4183308838725669e-05, + "loss": 0.3372, + "step": 2478 + }, + { + "epoch": 1.1844573203512334, + "grad_norm": 0.48327603901696636, + "learning_rate": 1.4178524533740628e-05, + "loss": 0.3308, + "step": 2479 + }, + { + "epoch": 1.1849351890568067, + "grad_norm": 0.5268010648750129, + "learning_rate": 1.4173739069706586e-05, + "loss": 0.3396, + "step": 2480 + }, + { + "epoch": 1.1854130577623798, + "grad_norm": 0.5058799850737847, + "learning_rate": 1.4168952447950948e-05, + "loss": 0.3488, + "step": 2481 + }, + { + "epoch": 1.185890926467953, + "grad_norm": 0.4793089155470465, + "learning_rate": 1.4164164669801444e-05, + "loss": 0.3202, + "step": 2482 + }, + { + "epoch": 1.186368795173526, + "grad_norm": 0.537347786961874, + "learning_rate": 1.4159375736586114e-05, + "loss": 0.323, + "step": 2483 + }, + { + "epoch": 1.1868466638790993, + "grad_norm": 0.5043366860217638, + "learning_rate": 1.4154585649633324e-05, + "loss": 0.3332, + "step": 2484 + }, + { + "epoch": 1.1873245325846724, + "grad_norm": 0.6175384664642016, + "learning_rate": 1.414979441027176e-05, + "loss": 0.3298, + "step": 2485 + }, + { + "epoch": 1.1878024012902455, + "grad_norm": 0.5172614792354202, + "learning_rate": 1.414500201983043e-05, + "loss": 0.3299, + "step": 2486 + }, + { + "epoch": 1.1882802699958186, + "grad_norm": 0.5074099118761909, + "learning_rate": 1.4140208479638653e-05, + "loss": 0.3314, + "step": 2487 + }, + { + "epoch": 1.1887581387013917, + "grad_norm": 0.49819077409033413, + "learning_rate": 1.4135413791026081e-05, + "loss": 0.3451, + "step": 2488 + }, + { + "epoch": 1.189236007406965, + "grad_norm": 0.5176243373050009, + "learning_rate": 1.4130617955322665e-05, + "loss": 0.3221, + "step": 2489 + }, + { + "epoch": 1.189713876112538, + "grad_norm": 0.473185476669963, + "learning_rate": 1.4125820973858693e-05, + "loss": 0.3231, + "step": 2490 + }, + { + "epoch": 1.1901917448181112, + "grad_norm": 0.48907220673787455, + "learning_rate": 1.4121022847964762e-05, + "loss": 0.3408, + "step": 2491 + }, + { + "epoch": 1.1906696135236843, + "grad_norm": 1.177648905452567, + "learning_rate": 1.4116223578971787e-05, + "loss": 0.332, + "step": 2492 + }, + { + "epoch": 1.1911474822292576, + "grad_norm": 0.4828061304183026, + "learning_rate": 1.4111423168210999e-05, + "loss": 0.319, + "step": 2493 + }, + { + "epoch": 1.1916253509348307, + "grad_norm": 0.5135930311328842, + "learning_rate": 1.410662161701395e-05, + "loss": 0.3381, + "step": 2494 + }, + { + "epoch": 1.1921032196404038, + "grad_norm": 0.526643592475081, + "learning_rate": 1.41018189267125e-05, + "loss": 0.3262, + "step": 2495 + }, + { + "epoch": 1.1925810883459769, + "grad_norm": 0.5090734041849178, + "learning_rate": 1.4097015098638838e-05, + "loss": 0.3276, + "step": 2496 + }, + { + "epoch": 1.1930589570515502, + "grad_norm": 0.5174675269837884, + "learning_rate": 1.4092210134125458e-05, + "loss": 0.3263, + "step": 2497 + }, + { + "epoch": 1.1935368257571233, + "grad_norm": 0.5214389237769186, + "learning_rate": 1.4087404034505167e-05, + "loss": 0.3375, + "step": 2498 + }, + { + "epoch": 1.1940146944626964, + "grad_norm": 0.4846029928861073, + "learning_rate": 1.4082596801111104e-05, + "loss": 0.3388, + "step": 2499 + }, + { + "epoch": 1.1944925631682695, + "grad_norm": 0.5209998258997901, + "learning_rate": 1.4077788435276701e-05, + "loss": 0.3373, + "step": 2500 + }, + { + "epoch": 1.1949704318738426, + "grad_norm": 0.5407837704720175, + "learning_rate": 1.4072978938335717e-05, + "loss": 0.3242, + "step": 2501 + }, + { + "epoch": 1.1954483005794159, + "grad_norm": 0.4792112154512138, + "learning_rate": 1.4068168311622223e-05, + "loss": 0.3262, + "step": 2502 + }, + { + "epoch": 1.195926169284989, + "grad_norm": 0.5012097821642677, + "learning_rate": 1.40633565564706e-05, + "loss": 0.3299, + "step": 2503 + }, + { + "epoch": 1.196404037990562, + "grad_norm": 0.5011100101721819, + "learning_rate": 1.4058543674215543e-05, + "loss": 0.3303, + "step": 2504 + }, + { + "epoch": 1.1968819066961351, + "grad_norm": 0.5072036552663972, + "learning_rate": 1.4053729666192067e-05, + "loss": 0.3034, + "step": 2505 + }, + { + "epoch": 1.1973597754017085, + "grad_norm": 0.49966051888354573, + "learning_rate": 1.4048914533735482e-05, + "loss": 0.3185, + "step": 2506 + }, + { + "epoch": 1.1978376441072816, + "grad_norm": 0.6201559448981543, + "learning_rate": 1.404409827818143e-05, + "loss": 0.3081, + "step": 2507 + }, + { + "epoch": 1.1983155128128546, + "grad_norm": 0.4892341169267986, + "learning_rate": 1.4039280900865855e-05, + "loss": 0.3347, + "step": 2508 + }, + { + "epoch": 1.1987933815184277, + "grad_norm": 0.4894821104819882, + "learning_rate": 1.4034462403125004e-05, + "loss": 0.319, + "step": 2509 + }, + { + "epoch": 1.199271250224001, + "grad_norm": 0.529046836458218, + "learning_rate": 1.4029642786295452e-05, + "loss": 0.3228, + "step": 2510 + }, + { + "epoch": 1.1997491189295741, + "grad_norm": 0.5119754528953527, + "learning_rate": 1.4024822051714075e-05, + "loss": 0.3259, + "step": 2511 + }, + { + "epoch": 1.2002269876351472, + "grad_norm": 0.5166507305196261, + "learning_rate": 1.4020000200718053e-05, + "loss": 0.347, + "step": 2512 + }, + { + "epoch": 1.2007048563407203, + "grad_norm": 0.485424117273842, + "learning_rate": 1.401517723464489e-05, + "loss": 0.3433, + "step": 2513 + }, + { + "epoch": 1.2011827250462934, + "grad_norm": 0.49766272512833354, + "learning_rate": 1.4010353154832388e-05, + "loss": 0.336, + "step": 2514 + }, + { + "epoch": 1.2016605937518667, + "grad_norm": 0.6389939051707664, + "learning_rate": 1.400552796261866e-05, + "loss": 0.3151, + "step": 2515 + }, + { + "epoch": 1.2021384624574398, + "grad_norm": 0.4842050986179893, + "learning_rate": 1.4000701659342136e-05, + "loss": 0.3315, + "step": 2516 + }, + { + "epoch": 1.202616331163013, + "grad_norm": 0.5199065736138204, + "learning_rate": 1.3995874246341542e-05, + "loss": 0.329, + "step": 2517 + }, + { + "epoch": 1.203094199868586, + "grad_norm": 0.46844026904175, + "learning_rate": 1.3991045724955915e-05, + "loss": 0.3358, + "step": 2518 + }, + { + "epoch": 1.2035720685741593, + "grad_norm": 0.5097819606674651, + "learning_rate": 1.3986216096524606e-05, + "loss": 0.3282, + "step": 2519 + }, + { + "epoch": 1.2040499372797324, + "grad_norm": 0.49611533463068996, + "learning_rate": 1.3981385362387268e-05, + "loss": 0.3237, + "step": 2520 + }, + { + "epoch": 1.2045278059853055, + "grad_norm": 0.5427021170944505, + "learning_rate": 1.397655352388386e-05, + "loss": 0.3174, + "step": 2521 + }, + { + "epoch": 1.2050056746908786, + "grad_norm": 0.5177994962109214, + "learning_rate": 1.397172058235465e-05, + "loss": 0.3266, + "step": 2522 + }, + { + "epoch": 1.205483543396452, + "grad_norm": 0.5151693584485367, + "learning_rate": 1.3966886539140212e-05, + "loss": 0.343, + "step": 2523 + }, + { + "epoch": 1.205961412102025, + "grad_norm": 0.4877067051638373, + "learning_rate": 1.396205139558142e-05, + "loss": 0.3289, + "step": 2524 + }, + { + "epoch": 1.206439280807598, + "grad_norm": 0.4986624439516977, + "learning_rate": 1.3957215153019463e-05, + "loss": 0.3187, + "step": 2525 + }, + { + "epoch": 1.2069171495131712, + "grad_norm": 0.5098275657839579, + "learning_rate": 1.3952377812795826e-05, + "loss": 0.3306, + "step": 2526 + }, + { + "epoch": 1.2073950182187443, + "grad_norm": 0.4851080171025919, + "learning_rate": 1.3947539376252301e-05, + "loss": 0.3326, + "step": 2527 + }, + { + "epoch": 1.2078728869243176, + "grad_norm": 0.5181654078258601, + "learning_rate": 1.3942699844730986e-05, + "loss": 0.3275, + "step": 2528 + }, + { + "epoch": 1.2083507556298907, + "grad_norm": 0.517517079824198, + "learning_rate": 1.3937859219574286e-05, + "loss": 0.308, + "step": 2529 + }, + { + "epoch": 1.2088286243354638, + "grad_norm": 0.5359336855688807, + "learning_rate": 1.3933017502124897e-05, + "loss": 0.3184, + "step": 2530 + }, + { + "epoch": 1.2093064930410369, + "grad_norm": 0.5177636108490216, + "learning_rate": 1.392817469372583e-05, + "loss": 0.3318, + "step": 2531 + }, + { + "epoch": 1.2097843617466102, + "grad_norm": 0.5049686036498979, + "learning_rate": 1.3923330795720396e-05, + "loss": 0.3379, + "step": 2532 + }, + { + "epoch": 1.2102622304521833, + "grad_norm": 0.5034062129917729, + "learning_rate": 1.3918485809452204e-05, + "loss": 0.3297, + "step": 2533 + }, + { + "epoch": 1.2107400991577564, + "grad_norm": 0.5046311056670907, + "learning_rate": 1.3913639736265175e-05, + "loss": 0.3385, + "step": 2534 + }, + { + "epoch": 1.2112179678633295, + "grad_norm": 0.7815693194085443, + "learning_rate": 1.3908792577503514e-05, + "loss": 0.3385, + "step": 2535 + }, + { + "epoch": 1.2116958365689028, + "grad_norm": 0.47743692428948703, + "learning_rate": 1.3903944334511744e-05, + "loss": 0.3177, + "step": 2536 + }, + { + "epoch": 1.2121737052744759, + "grad_norm": 0.49090736922505357, + "learning_rate": 1.3899095008634678e-05, + "loss": 0.3218, + "step": 2537 + }, + { + "epoch": 1.212651573980049, + "grad_norm": 0.49841642881825793, + "learning_rate": 1.3894244601217435e-05, + "loss": 0.3346, + "step": 2538 + }, + { + "epoch": 1.213129442685622, + "grad_norm": 0.510969551833143, + "learning_rate": 1.3889393113605433e-05, + "loss": 0.3552, + "step": 2539 + }, + { + "epoch": 1.2136073113911952, + "grad_norm": 0.5523292379752209, + "learning_rate": 1.3884540547144393e-05, + "loss": 0.3211, + "step": 2540 + }, + { + "epoch": 1.2140851800967685, + "grad_norm": 0.5051371714386153, + "learning_rate": 1.3879686903180326e-05, + "loss": 0.3393, + "step": 2541 + }, + { + "epoch": 1.2145630488023416, + "grad_norm": 0.5044231031807489, + "learning_rate": 1.3874832183059545e-05, + "loss": 0.3157, + "step": 2542 + }, + { + "epoch": 1.2150409175079147, + "grad_norm": 0.4730004046440912, + "learning_rate": 1.3869976388128672e-05, + "loss": 0.3501, + "step": 2543 + }, + { + "epoch": 1.215518786213488, + "grad_norm": 0.5105649664830538, + "learning_rate": 1.3865119519734611e-05, + "loss": 0.3328, + "step": 2544 + }, + { + "epoch": 1.215996654919061, + "grad_norm": 0.5704906525223905, + "learning_rate": 1.3860261579224574e-05, + "loss": 0.3322, + "step": 2545 + }, + { + "epoch": 1.2164745236246342, + "grad_norm": 0.525856706643172, + "learning_rate": 1.3855402567946072e-05, + "loss": 0.3202, + "step": 2546 + }, + { + "epoch": 1.2169523923302072, + "grad_norm": 0.47546063015364237, + "learning_rate": 1.3850542487246903e-05, + "loss": 0.3377, + "step": 2547 + }, + { + "epoch": 1.2174302610357803, + "grad_norm": 0.5451603336435644, + "learning_rate": 1.384568133847517e-05, + "loss": 0.3452, + "step": 2548 + }, + { + "epoch": 1.2179081297413537, + "grad_norm": 0.5361634977822047, + "learning_rate": 1.3840819122979272e-05, + "loss": 0.3131, + "step": 2549 + }, + { + "epoch": 1.2183859984469267, + "grad_norm": 0.4922706268092114, + "learning_rate": 1.3835955842107897e-05, + "loss": 0.3306, + "step": 2550 + }, + { + "epoch": 1.2188638671524998, + "grad_norm": 0.4894912959757622, + "learning_rate": 1.3831091497210043e-05, + "loss": 0.322, + "step": 2551 + }, + { + "epoch": 1.219341735858073, + "grad_norm": 0.5551732852060824, + "learning_rate": 1.3826226089634982e-05, + "loss": 0.3223, + "step": 2552 + }, + { + "epoch": 1.219819604563646, + "grad_norm": 0.47984089277387154, + "learning_rate": 1.3821359620732297e-05, + "loss": 0.3399, + "step": 2553 + }, + { + "epoch": 1.2202974732692193, + "grad_norm": 0.469484790257942, + "learning_rate": 1.3816492091851865e-05, + "loss": 0.3439, + "step": 2554 + }, + { + "epoch": 1.2207753419747924, + "grad_norm": 0.49706698283821926, + "learning_rate": 1.3811623504343845e-05, + "loss": 0.3425, + "step": 2555 + }, + { + "epoch": 1.2212532106803655, + "grad_norm": 0.4864721219524991, + "learning_rate": 1.3806753859558702e-05, + "loss": 0.3333, + "step": 2556 + }, + { + "epoch": 1.2217310793859388, + "grad_norm": 0.531371918260641, + "learning_rate": 1.380188315884719e-05, + "loss": 0.3262, + "step": 2557 + }, + { + "epoch": 1.222208948091512, + "grad_norm": 0.508703916265606, + "learning_rate": 1.3797011403560349e-05, + "loss": 0.3339, + "step": 2558 + }, + { + "epoch": 1.222686816797085, + "grad_norm": 0.5519217880427334, + "learning_rate": 1.3792138595049526e-05, + "loss": 0.3313, + "step": 2559 + }, + { + "epoch": 1.2231646855026581, + "grad_norm": 0.49067395021756355, + "learning_rate": 1.378726473466635e-05, + "loss": 0.3301, + "step": 2560 + }, + { + "epoch": 1.2236425542082312, + "grad_norm": 0.527772112286325, + "learning_rate": 1.378238982376274e-05, + "loss": 0.3121, + "step": 2561 + }, + { + "epoch": 1.2241204229138045, + "grad_norm": 0.5044285212896721, + "learning_rate": 1.3777513863690914e-05, + "loss": 0.3276, + "step": 2562 + }, + { + "epoch": 1.2245982916193776, + "grad_norm": 0.5706596519616292, + "learning_rate": 1.3772636855803378e-05, + "loss": 0.3331, + "step": 2563 + }, + { + "epoch": 1.2250761603249507, + "grad_norm": 0.4736938143718017, + "learning_rate": 1.3767758801452926e-05, + "loss": 0.3358, + "step": 2564 + }, + { + "epoch": 1.2255540290305238, + "grad_norm": 0.5097229175872106, + "learning_rate": 1.3762879701992642e-05, + "loss": 0.3303, + "step": 2565 + }, + { + "epoch": 1.2260318977360969, + "grad_norm": 0.5296217110279442, + "learning_rate": 1.3757999558775907e-05, + "loss": 0.3316, + "step": 2566 + }, + { + "epoch": 1.2265097664416702, + "grad_norm": 0.4965806245245254, + "learning_rate": 1.3753118373156382e-05, + "loss": 0.311, + "step": 2567 + }, + { + "epoch": 1.2269876351472433, + "grad_norm": 0.49384330392321485, + "learning_rate": 1.3748236146488028e-05, + "loss": 0.3394, + "step": 2568 + }, + { + "epoch": 1.2274655038528164, + "grad_norm": 0.5159965075461034, + "learning_rate": 1.3743352880125083e-05, + "loss": 0.315, + "step": 2569 + }, + { + "epoch": 1.2279433725583897, + "grad_norm": 0.5382950937633958, + "learning_rate": 1.373846857542208e-05, + "loss": 0.3495, + "step": 2570 + }, + { + "epoch": 1.2284212412639628, + "grad_norm": 0.48821357626356915, + "learning_rate": 1.3733583233733843e-05, + "loss": 0.3366, + "step": 2571 + }, + { + "epoch": 1.2288991099695359, + "grad_norm": 0.4912616599041392, + "learning_rate": 1.372869685641547e-05, + "loss": 0.3233, + "step": 2572 + }, + { + "epoch": 1.229376978675109, + "grad_norm": 0.49906610048849775, + "learning_rate": 1.3723809444822366e-05, + "loss": 0.3223, + "step": 2573 + }, + { + "epoch": 1.229854847380682, + "grad_norm": 0.4900856446449379, + "learning_rate": 1.371892100031021e-05, + "loss": 0.3358, + "step": 2574 + }, + { + "epoch": 1.2303327160862554, + "grad_norm": 0.5218820319587112, + "learning_rate": 1.3714031524234965e-05, + "loss": 0.3134, + "step": 2575 + }, + { + "epoch": 1.2308105847918285, + "grad_norm": 0.4782986698985612, + "learning_rate": 1.3709141017952893e-05, + "loss": 0.3125, + "step": 2576 + }, + { + "epoch": 1.2312884534974016, + "grad_norm": 0.5293380049327159, + "learning_rate": 1.370424948282053e-05, + "loss": 0.3269, + "step": 2577 + }, + { + "epoch": 1.2317663222029747, + "grad_norm": 0.48617546343080253, + "learning_rate": 1.3699356920194702e-05, + "loss": 0.3262, + "step": 2578 + }, + { + "epoch": 1.2322441909085478, + "grad_norm": 0.489962199133281, + "learning_rate": 1.3694463331432521e-05, + "loss": 0.3145, + "step": 2579 + }, + { + "epoch": 1.232722059614121, + "grad_norm": 0.5271714061634383, + "learning_rate": 1.3689568717891381e-05, + "loss": 0.305, + "step": 2580 + }, + { + "epoch": 1.2331999283196942, + "grad_norm": 0.476057076836738, + "learning_rate": 1.368467308092896e-05, + "loss": 0.3246, + "step": 2581 + }, + { + "epoch": 1.2336777970252673, + "grad_norm": 0.5037616861616993, + "learning_rate": 1.3679776421903222e-05, + "loss": 0.3157, + "step": 2582 + }, + { + "epoch": 1.2341556657308406, + "grad_norm": 0.5029866004003386, + "learning_rate": 1.3674878742172415e-05, + "loss": 0.3424, + "step": 2583 + }, + { + "epoch": 1.2346335344364137, + "grad_norm": 0.49776455157060684, + "learning_rate": 1.366998004309507e-05, + "loss": 0.3221, + "step": 2584 + }, + { + "epoch": 1.2351114031419868, + "grad_norm": 0.5159003230211062, + "learning_rate": 1.3665080326029997e-05, + "loss": 0.3369, + "step": 2585 + }, + { + "epoch": 1.2355892718475598, + "grad_norm": 0.4945390309105578, + "learning_rate": 1.3660179592336296e-05, + "loss": 0.324, + "step": 2586 + }, + { + "epoch": 1.236067140553133, + "grad_norm": 0.5028457586094409, + "learning_rate": 1.3655277843373338e-05, + "loss": 0.3276, + "step": 2587 + }, + { + "epoch": 1.2365450092587063, + "grad_norm": 0.4924865463215211, + "learning_rate": 1.3650375080500784e-05, + "loss": 0.3102, + "step": 2588 + }, + { + "epoch": 1.2370228779642793, + "grad_norm": 0.4786677818333961, + "learning_rate": 1.3645471305078575e-05, + "loss": 0.3336, + "step": 2589 + }, + { + "epoch": 1.2375007466698524, + "grad_norm": 0.5645830343314627, + "learning_rate": 1.364056651846693e-05, + "loss": 0.3441, + "step": 2590 + }, + { + "epoch": 1.2379786153754255, + "grad_norm": 0.4772358981883002, + "learning_rate": 1.3635660722026352e-05, + "loss": 0.3442, + "step": 2591 + }, + { + "epoch": 1.2384564840809986, + "grad_norm": 0.5656570549920833, + "learning_rate": 1.3630753917117624e-05, + "loss": 0.3081, + "step": 2592 + }, + { + "epoch": 1.238934352786572, + "grad_norm": 0.4751884801558594, + "learning_rate": 1.3625846105101801e-05, + "loss": 0.3142, + "step": 2593 + }, + { + "epoch": 1.239412221492145, + "grad_norm": 2.4050639525172075, + "learning_rate": 1.3620937287340228e-05, + "loss": 0.3273, + "step": 2594 + }, + { + "epoch": 1.2398900901977181, + "grad_norm": 0.7444744255219318, + "learning_rate": 1.3616027465194525e-05, + "loss": 0.3273, + "step": 2595 + }, + { + "epoch": 1.2403679589032914, + "grad_norm": 0.4944733158259898, + "learning_rate": 1.3611116640026589e-05, + "loss": 0.3399, + "step": 2596 + }, + { + "epoch": 1.2408458276088645, + "grad_norm": 0.4818780698812894, + "learning_rate": 1.3606204813198593e-05, + "loss": 0.3213, + "step": 2597 + }, + { + "epoch": 1.2413236963144376, + "grad_norm": 0.48772958226512186, + "learning_rate": 1.3601291986072999e-05, + "loss": 0.3328, + "step": 2598 + }, + { + "epoch": 1.2418015650200107, + "grad_norm": 0.49554621709972824, + "learning_rate": 1.3596378160012529e-05, + "loss": 0.3458, + "step": 2599 + }, + { + "epoch": 1.2422794337255838, + "grad_norm": 0.473500813505667, + "learning_rate": 1.35914633363802e-05, + "loss": 0.3203, + "step": 2600 + }, + { + "epoch": 1.2427573024311571, + "grad_norm": 0.48856018275808843, + "learning_rate": 1.358654751653929e-05, + "loss": 0.3171, + "step": 2601 + }, + { + "epoch": 1.2432351711367302, + "grad_norm": 0.5132783506070369, + "learning_rate": 1.3581630701853368e-05, + "loss": 0.3246, + "step": 2602 + }, + { + "epoch": 1.2437130398423033, + "grad_norm": 0.4879512811328692, + "learning_rate": 1.3576712893686268e-05, + "loss": 0.2965, + "step": 2603 + }, + { + "epoch": 1.2441909085478764, + "grad_norm": 0.5692497626729707, + "learning_rate": 1.3571794093402103e-05, + "loss": 0.3554, + "step": 2604 + }, + { + "epoch": 1.2446687772534497, + "grad_norm": 0.5262501553177124, + "learning_rate": 1.356687430236526e-05, + "loss": 0.3426, + "step": 2605 + }, + { + "epoch": 1.2451466459590228, + "grad_norm": 0.5101017072346472, + "learning_rate": 1.3561953521940408e-05, + "loss": 0.3384, + "step": 2606 + }, + { + "epoch": 1.245624514664596, + "grad_norm": 0.4855194753569897, + "learning_rate": 1.3557031753492477e-05, + "loss": 0.3314, + "step": 2607 + }, + { + "epoch": 1.246102383370169, + "grad_norm": 0.5165611849608196, + "learning_rate": 1.3552108998386683e-05, + "loss": 0.3408, + "step": 2608 + }, + { + "epoch": 1.2465802520757423, + "grad_norm": 0.4974036775097272, + "learning_rate": 1.3547185257988513e-05, + "loss": 0.3221, + "step": 2609 + }, + { + "epoch": 1.2470581207813154, + "grad_norm": 0.4947570822821488, + "learning_rate": 1.3542260533663723e-05, + "loss": 0.3427, + "step": 2610 + }, + { + "epoch": 1.2475359894868885, + "grad_norm": 0.48247105943498364, + "learning_rate": 1.3537334826778343e-05, + "loss": 0.33, + "step": 2611 + }, + { + "epoch": 1.2480138581924616, + "grad_norm": 0.5029602456053662, + "learning_rate": 1.3532408138698685e-05, + "loss": 0.337, + "step": 2612 + }, + { + "epoch": 1.2484917268980347, + "grad_norm": 0.5072947162416663, + "learning_rate": 1.3527480470791314e-05, + "loss": 0.3383, + "step": 2613 + }, + { + "epoch": 1.248969595603608, + "grad_norm": 0.49966515091084346, + "learning_rate": 1.3522551824423088e-05, + "loss": 0.3276, + "step": 2614 + }, + { + "epoch": 1.249447464309181, + "grad_norm": 0.4845192026566661, + "learning_rate": 1.351762220096112e-05, + "loss": 0.3403, + "step": 2615 + }, + { + "epoch": 1.2499253330147542, + "grad_norm": 0.48326585792046045, + "learning_rate": 1.3512691601772803e-05, + "loss": 0.3138, + "step": 2616 + }, + { + "epoch": 1.2504032017203273, + "grad_norm": 0.500541614774048, + "learning_rate": 1.3507760028225801e-05, + "loss": 0.3183, + "step": 2617 + }, + { + "epoch": 1.2508810704259004, + "grad_norm": 0.46031252587482197, + "learning_rate": 1.3502827481688041e-05, + "loss": 0.3235, + "step": 2618 + }, + { + "epoch": 1.2513589391314737, + "grad_norm": 0.4621265173598907, + "learning_rate": 1.3497893963527729e-05, + "loss": 0.3192, + "step": 2619 + }, + { + "epoch": 1.2518368078370468, + "grad_norm": 0.509128957862185, + "learning_rate": 1.3492959475113332e-05, + "loss": 0.343, + "step": 2620 + }, + { + "epoch": 1.2523146765426199, + "grad_norm": 0.5017135977140528, + "learning_rate": 1.348802401781359e-05, + "loss": 0.3434, + "step": 2621 + }, + { + "epoch": 1.2527925452481932, + "grad_norm": 0.48983060466434775, + "learning_rate": 1.3483087592997513e-05, + "loss": 0.3393, + "step": 2622 + }, + { + "epoch": 1.2532704139537663, + "grad_norm": 0.4917900123369827, + "learning_rate": 1.347815020203438e-05, + "loss": 0.334, + "step": 2623 + }, + { + "epoch": 1.2537482826593394, + "grad_norm": 0.48250587974454096, + "learning_rate": 1.3473211846293735e-05, + "loss": 0.331, + "step": 2624 + }, + { + "epoch": 1.2542261513649124, + "grad_norm": 0.5248984409083738, + "learning_rate": 1.3468272527145388e-05, + "loss": 0.3289, + "step": 2625 + }, + { + "epoch": 1.2547040200704855, + "grad_norm": 0.7447735031783237, + "learning_rate": 1.3463332245959424e-05, + "loss": 0.3237, + "step": 2626 + }, + { + "epoch": 1.2551818887760589, + "grad_norm": 0.4813371324594707, + "learning_rate": 1.3458391004106184e-05, + "loss": 0.341, + "step": 2627 + }, + { + "epoch": 1.255659757481632, + "grad_norm": 0.5512321025879464, + "learning_rate": 1.3453448802956285e-05, + "loss": 0.3284, + "step": 2628 + }, + { + "epoch": 1.256137626187205, + "grad_norm": 0.5150834661725827, + "learning_rate": 1.344850564388061e-05, + "loss": 0.3277, + "step": 2629 + }, + { + "epoch": 1.2566154948927781, + "grad_norm": 0.5120299966158544, + "learning_rate": 1.3443561528250295e-05, + "loss": 0.3403, + "step": 2630 + }, + { + "epoch": 1.2570933635983512, + "grad_norm": 0.4814484918217983, + "learning_rate": 1.3438616457436758e-05, + "loss": 0.3525, + "step": 2631 + }, + { + "epoch": 1.2575712323039245, + "grad_norm": 0.5120493174867615, + "learning_rate": 1.343367043281167e-05, + "loss": 0.3409, + "step": 2632 + }, + { + "epoch": 1.2580491010094976, + "grad_norm": 0.47147647454359, + "learning_rate": 1.3428723455746972e-05, + "loss": 0.3457, + "step": 2633 + }, + { + "epoch": 1.2585269697150707, + "grad_norm": 0.4898796593893291, + "learning_rate": 1.3423775527614871e-05, + "loss": 0.3284, + "step": 2634 + }, + { + "epoch": 1.259004838420644, + "grad_norm": 0.48315639065037624, + "learning_rate": 1.3418826649787834e-05, + "loss": 0.319, + "step": 2635 + }, + { + "epoch": 1.2594827071262171, + "grad_norm": 0.4518418312294045, + "learning_rate": 1.341387682363859e-05, + "loss": 0.3303, + "step": 2636 + }, + { + "epoch": 1.2599605758317902, + "grad_norm": 0.4850719475140083, + "learning_rate": 1.3408926050540134e-05, + "loss": 0.3323, + "step": 2637 + }, + { + "epoch": 1.2604384445373633, + "grad_norm": 0.4608901894307272, + "learning_rate": 1.3403974331865728e-05, + "loss": 0.3441, + "step": 2638 + }, + { + "epoch": 1.2609163132429364, + "grad_norm": 0.5002804502797722, + "learning_rate": 1.3399021668988882e-05, + "loss": 0.3091, + "step": 2639 + }, + { + "epoch": 1.2613941819485097, + "grad_norm": 0.49469049914409363, + "learning_rate": 1.3394068063283387e-05, + "loss": 0.3302, + "step": 2640 + }, + { + "epoch": 1.2618720506540828, + "grad_norm": 0.6390540371820533, + "learning_rate": 1.3389113516123283e-05, + "loss": 0.3219, + "step": 2641 + }, + { + "epoch": 1.262349919359656, + "grad_norm": 0.5039329426448652, + "learning_rate": 1.3384158028882866e-05, + "loss": 0.3264, + "step": 2642 + }, + { + "epoch": 1.262827788065229, + "grad_norm": 0.4908786517804285, + "learning_rate": 1.337920160293671e-05, + "loss": 0.3383, + "step": 2643 + }, + { + "epoch": 1.263305656770802, + "grad_norm": 0.475559094015669, + "learning_rate": 1.3374244239659641e-05, + "loss": 0.335, + "step": 2644 + }, + { + "epoch": 1.2637835254763754, + "grad_norm": 0.5008300872967887, + "learning_rate": 1.3369285940426737e-05, + "loss": 0.3163, + "step": 2645 + }, + { + "epoch": 1.2642613941819485, + "grad_norm": 0.5167558213932772, + "learning_rate": 1.3364326706613346e-05, + "loss": 0.3169, + "step": 2646 + }, + { + "epoch": 1.2647392628875216, + "grad_norm": 0.5070324119135403, + "learning_rate": 1.3359366539595075e-05, + "loss": 0.3486, + "step": 2647 + }, + { + "epoch": 1.265217131593095, + "grad_norm": 0.4926151078159255, + "learning_rate": 1.3354405440747783e-05, + "loss": 0.322, + "step": 2648 + }, + { + "epoch": 1.265695000298668, + "grad_norm": 0.531687534308957, + "learning_rate": 1.3349443411447591e-05, + "loss": 0.3295, + "step": 2649 + }, + { + "epoch": 1.266172869004241, + "grad_norm": 0.4865326711170691, + "learning_rate": 1.334448045307088e-05, + "loss": 0.321, + "step": 2650 + }, + { + "epoch": 1.2666507377098142, + "grad_norm": 0.505824257323991, + "learning_rate": 1.3339516566994285e-05, + "loss": 0.3275, + "step": 2651 + }, + { + "epoch": 1.2671286064153873, + "grad_norm": 0.4743225313426183, + "learning_rate": 1.3334551754594709e-05, + "loss": 0.3338, + "step": 2652 + }, + { + "epoch": 1.2676064751209606, + "grad_norm": 0.51248355708598, + "learning_rate": 1.3329586017249293e-05, + "loss": 0.3227, + "step": 2653 + }, + { + "epoch": 1.2680843438265337, + "grad_norm": 0.48662715453553024, + "learning_rate": 1.3324619356335446e-05, + "loss": 0.3229, + "step": 2654 + }, + { + "epoch": 1.2685622125321068, + "grad_norm": 0.5130913956035601, + "learning_rate": 1.331965177323084e-05, + "loss": 0.3297, + "step": 2655 + }, + { + "epoch": 1.26904008123768, + "grad_norm": 0.507745869575156, + "learning_rate": 1.3314683269313387e-05, + "loss": 0.3303, + "step": 2656 + }, + { + "epoch": 1.269517949943253, + "grad_norm": 0.5295327741107102, + "learning_rate": 1.3309713845961265e-05, + "loss": 0.3353, + "step": 2657 + }, + { + "epoch": 1.2699958186488263, + "grad_norm": 0.5362619437276863, + "learning_rate": 1.3304743504552906e-05, + "loss": 0.3198, + "step": 2658 + }, + { + "epoch": 1.2704736873543994, + "grad_norm": 0.7456002921673187, + "learning_rate": 1.3299772246466992e-05, + "loss": 0.3247, + "step": 2659 + }, + { + "epoch": 1.2709515560599725, + "grad_norm": 0.5057040297711566, + "learning_rate": 1.3294800073082464e-05, + "loss": 0.3366, + "step": 2660 + }, + { + "epoch": 1.2714294247655458, + "grad_norm": 0.5114205500351166, + "learning_rate": 1.3289826985778515e-05, + "loss": 0.3081, + "step": 2661 + }, + { + "epoch": 1.2719072934711189, + "grad_norm": 0.5107340207654342, + "learning_rate": 1.3284852985934591e-05, + "loss": 0.3003, + "step": 2662 + }, + { + "epoch": 1.272385162176692, + "grad_norm": 0.5064044347392543, + "learning_rate": 1.3279878074930394e-05, + "loss": 0.3368, + "step": 2663 + }, + { + "epoch": 1.272863030882265, + "grad_norm": 0.5498353953286683, + "learning_rate": 1.3274902254145876e-05, + "loss": 0.3265, + "step": 2664 + }, + { + "epoch": 1.2733408995878381, + "grad_norm": 0.527749650382651, + "learning_rate": 1.3269925524961237e-05, + "loss": 0.3413, + "step": 2665 + }, + { + "epoch": 1.2738187682934115, + "grad_norm": 0.4800823655478267, + "learning_rate": 1.326494788875694e-05, + "loss": 0.3383, + "step": 2666 + }, + { + "epoch": 1.2742966369989845, + "grad_norm": 0.5476129798251085, + "learning_rate": 1.3259969346913692e-05, + "loss": 0.3114, + "step": 2667 + }, + { + "epoch": 1.2747745057045576, + "grad_norm": 0.6895619251669788, + "learning_rate": 1.3254989900812452e-05, + "loss": 0.3184, + "step": 2668 + }, + { + "epoch": 1.275252374410131, + "grad_norm": 0.5299012176977083, + "learning_rate": 1.3250009551834431e-05, + "loss": 0.3135, + "step": 2669 + }, + { + "epoch": 1.275730243115704, + "grad_norm": 0.5236139281767733, + "learning_rate": 1.3245028301361086e-05, + "loss": 0.3399, + "step": 2670 + }, + { + "epoch": 1.2762081118212771, + "grad_norm": 0.5049494339980716, + "learning_rate": 1.3240046150774136e-05, + "loss": 0.3293, + "step": 2671 + }, + { + "epoch": 1.2766859805268502, + "grad_norm": 0.48846700131030796, + "learning_rate": 1.3235063101455536e-05, + "loss": 0.3272, + "step": 2672 + }, + { + "epoch": 1.2771638492324233, + "grad_norm": 0.534758434518732, + "learning_rate": 1.3230079154787497e-05, + "loss": 0.2966, + "step": 2673 + }, + { + "epoch": 1.2776417179379966, + "grad_norm": 0.5290754783911104, + "learning_rate": 1.3225094312152478e-05, + "loss": 0.336, + "step": 2674 + }, + { + "epoch": 1.2781195866435697, + "grad_norm": 0.5164880573517012, + "learning_rate": 1.3220108574933185e-05, + "loss": 0.3283, + "step": 2675 + }, + { + "epoch": 1.2785974553491428, + "grad_norm": 0.5334112157901388, + "learning_rate": 1.3215121944512576e-05, + "loss": 0.3334, + "step": 2676 + }, + { + "epoch": 1.279075324054716, + "grad_norm": 1.0889045439453695, + "learning_rate": 1.3210134422273855e-05, + "loss": 0.3356, + "step": 2677 + }, + { + "epoch": 1.279553192760289, + "grad_norm": 0.5519985462389302, + "learning_rate": 1.3205146009600472e-05, + "loss": 0.3169, + "step": 2678 + }, + { + "epoch": 1.2800310614658623, + "grad_norm": 0.5034434905243013, + "learning_rate": 1.320015670787612e-05, + "loss": 0.3264, + "step": 2679 + }, + { + "epoch": 1.2805089301714354, + "grad_norm": 0.4936078005967817, + "learning_rate": 1.3195166518484748e-05, + "loss": 0.3266, + "step": 2680 + }, + { + "epoch": 1.2809867988770085, + "grad_norm": 0.48996148603955003, + "learning_rate": 1.3190175442810547e-05, + "loss": 0.3149, + "step": 2681 + }, + { + "epoch": 1.2814646675825818, + "grad_norm": 0.5937307442317561, + "learning_rate": 1.3185183482237948e-05, + "loss": 0.3398, + "step": 2682 + }, + { + "epoch": 1.281942536288155, + "grad_norm": 0.5041569401113013, + "learning_rate": 1.318019063815164e-05, + "loss": 0.3533, + "step": 2683 + }, + { + "epoch": 1.282420404993728, + "grad_norm": 0.49226884478826943, + "learning_rate": 1.3175196911936548e-05, + "loss": 0.3384, + "step": 2684 + }, + { + "epoch": 1.282898273699301, + "grad_norm": 0.5095296957203082, + "learning_rate": 1.317020230497784e-05, + "loss": 0.3261, + "step": 2685 + }, + { + "epoch": 1.2833761424048742, + "grad_norm": 0.5756732067034811, + "learning_rate": 1.3165206818660932e-05, + "loss": 0.333, + "step": 2686 + }, + { + "epoch": 1.2838540111104475, + "grad_norm": 0.5299900729104405, + "learning_rate": 1.3160210454371489e-05, + "loss": 0.3262, + "step": 2687 + }, + { + "epoch": 1.2843318798160206, + "grad_norm": 0.5971638970062076, + "learning_rate": 1.315521321349541e-05, + "loss": 0.3198, + "step": 2688 + }, + { + "epoch": 1.2848097485215937, + "grad_norm": 0.4998008800910582, + "learning_rate": 1.3150215097418844e-05, + "loss": 0.3317, + "step": 2689 + }, + { + "epoch": 1.2852876172271668, + "grad_norm": 0.4930588171021608, + "learning_rate": 1.3145216107528178e-05, + "loss": 0.3055, + "step": 2690 + }, + { + "epoch": 1.2857654859327399, + "grad_norm": 0.5053649686384691, + "learning_rate": 1.3140216245210042e-05, + "loss": 0.3419, + "step": 2691 + }, + { + "epoch": 1.2862433546383132, + "grad_norm": 0.4874790099765457, + "learning_rate": 1.3135215511851316e-05, + "loss": 0.336, + "step": 2692 + }, + { + "epoch": 1.2867212233438863, + "grad_norm": 0.5212769061425828, + "learning_rate": 1.313021390883911e-05, + "loss": 0.3201, + "step": 2693 + }, + { + "epoch": 1.2871990920494594, + "grad_norm": 0.5057814940111431, + "learning_rate": 1.312521143756078e-05, + "loss": 0.3087, + "step": 2694 + }, + { + "epoch": 1.2876769607550327, + "grad_norm": 0.5430285465859317, + "learning_rate": 1.3120208099403926e-05, + "loss": 0.3223, + "step": 2695 + }, + { + "epoch": 1.2881548294606058, + "grad_norm": 0.4973193693517145, + "learning_rate": 1.3115203895756387e-05, + "loss": 0.3249, + "step": 2696 + }, + { + "epoch": 1.2886326981661789, + "grad_norm": 0.48876830415257927, + "learning_rate": 1.3110198828006236e-05, + "loss": 0.3256, + "step": 2697 + }, + { + "epoch": 1.289110566871752, + "grad_norm": 0.5073517633637624, + "learning_rate": 1.3105192897541792e-05, + "loss": 0.3369, + "step": 2698 + }, + { + "epoch": 1.289588435577325, + "grad_norm": 0.5115023299359397, + "learning_rate": 1.3100186105751615e-05, + "loss": 0.3112, + "step": 2699 + }, + { + "epoch": 1.2900663042828984, + "grad_norm": 0.5050875752293276, + "learning_rate": 1.3095178454024496e-05, + "loss": 0.3355, + "step": 2700 + }, + { + "epoch": 1.2905441729884715, + "grad_norm": 0.48487507417501574, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.3494, + "step": 2701 + }, + { + "epoch": 1.2910220416940446, + "grad_norm": 0.6454288527132577, + "learning_rate": 1.308516057631582e-05, + "loss": 0.3378, + "step": 2702 + }, + { + "epoch": 1.2914999103996176, + "grad_norm": 0.5860456115191699, + "learning_rate": 1.3080150353113044e-05, + "loss": 0.3425, + "step": 2703 + }, + { + "epoch": 1.2919777791051907, + "grad_norm": 0.45560558853026417, + "learning_rate": 1.3075139275530893e-05, + "loss": 0.3333, + "step": 2704 + }, + { + "epoch": 1.292455647810764, + "grad_norm": 0.5735696204556753, + "learning_rate": 1.3070127344959348e-05, + "loss": 0.3034, + "step": 2705 + }, + { + "epoch": 1.2929335165163371, + "grad_norm": 0.5440263491737388, + "learning_rate": 1.3065114562788634e-05, + "loss": 0.3148, + "step": 2706 + }, + { + "epoch": 1.2934113852219102, + "grad_norm": 0.4894882914596722, + "learning_rate": 1.3060100930409211e-05, + "loss": 0.3359, + "step": 2707 + }, + { + "epoch": 1.2938892539274836, + "grad_norm": 0.5571970234505838, + "learning_rate": 1.3055086449211768e-05, + "loss": 0.3288, + "step": 2708 + }, + { + "epoch": 1.2943671226330566, + "grad_norm": 0.4614963731916626, + "learning_rate": 1.3050071120587235e-05, + "loss": 0.3191, + "step": 2709 + }, + { + "epoch": 1.2948449913386297, + "grad_norm": 0.5415712777984418, + "learning_rate": 1.3045054945926775e-05, + "loss": 0.3393, + "step": 2710 + }, + { + "epoch": 1.2953228600442028, + "grad_norm": 0.5125831058498063, + "learning_rate": 1.3040037926621788e-05, + "loss": 0.3201, + "step": 2711 + }, + { + "epoch": 1.295800728749776, + "grad_norm": 0.5121610378114612, + "learning_rate": 1.3035020064063903e-05, + "loss": 0.3415, + "step": 2712 + }, + { + "epoch": 1.2962785974553492, + "grad_norm": 0.5247492831665274, + "learning_rate": 1.3030001359644992e-05, + "loss": 0.3252, + "step": 2713 + }, + { + "epoch": 1.2967564661609223, + "grad_norm": 0.9578688622282728, + "learning_rate": 1.302498181475715e-05, + "loss": 0.3308, + "step": 2714 + }, + { + "epoch": 1.2972343348664954, + "grad_norm": 0.5081119642289837, + "learning_rate": 1.3019961430792711e-05, + "loss": 0.3129, + "step": 2715 + }, + { + "epoch": 1.2977122035720685, + "grad_norm": 0.486093493880196, + "learning_rate": 1.3014940209144246e-05, + "loss": 0.3437, + "step": 2716 + }, + { + "epoch": 1.2981900722776416, + "grad_norm": 0.5533735834037948, + "learning_rate": 1.3009918151204546e-05, + "loss": 0.3192, + "step": 2717 + }, + { + "epoch": 1.298667940983215, + "grad_norm": 0.5027180369228621, + "learning_rate": 1.3004895258366648e-05, + "loss": 0.3539, + "step": 2718 + }, + { + "epoch": 1.299145809688788, + "grad_norm": 0.48700562710108203, + "learning_rate": 1.299987153202381e-05, + "loss": 0.3283, + "step": 2719 + }, + { + "epoch": 1.299623678394361, + "grad_norm": 0.4947556284065124, + "learning_rate": 1.2994846973569524e-05, + "loss": 0.3273, + "step": 2720 + }, + { + "epoch": 1.3001015470999344, + "grad_norm": 0.5278204218639244, + "learning_rate": 1.298982158439752e-05, + "loss": 0.3283, + "step": 2721 + }, + { + "epoch": 1.3005794158055075, + "grad_norm": 0.5311707177146678, + "learning_rate": 1.2984795365901743e-05, + "loss": 0.3358, + "step": 2722 + }, + { + "epoch": 1.3010572845110806, + "grad_norm": 0.5090844743344694, + "learning_rate": 1.2979768319476384e-05, + "loss": 0.3265, + "step": 2723 + }, + { + "epoch": 1.3015351532166537, + "grad_norm": 0.5069541011918202, + "learning_rate": 1.2974740446515858e-05, + "loss": 0.3422, + "step": 2724 + }, + { + "epoch": 1.3020130219222268, + "grad_norm": 0.48331918054796685, + "learning_rate": 1.2969711748414804e-05, + "loss": 0.3206, + "step": 2725 + }, + { + "epoch": 1.3024908906278, + "grad_norm": 0.5096220010199282, + "learning_rate": 1.2964682226568095e-05, + "loss": 0.3428, + "step": 2726 + }, + { + "epoch": 1.3029687593333732, + "grad_norm": 0.5373918271686221, + "learning_rate": 1.2959651882370835e-05, + "loss": 0.3182, + "step": 2727 + }, + { + "epoch": 1.3034466280389463, + "grad_norm": 0.48152585122740144, + "learning_rate": 1.2954620717218344e-05, + "loss": 0.3366, + "step": 2728 + }, + { + "epoch": 1.3039244967445194, + "grad_norm": 0.5135718903681161, + "learning_rate": 1.2949588732506191e-05, + "loss": 0.3185, + "step": 2729 + }, + { + "epoch": 1.3044023654500925, + "grad_norm": 0.5112482878597828, + "learning_rate": 1.2944555929630152e-05, + "loss": 0.3175, + "step": 2730 + }, + { + "epoch": 1.3048802341556658, + "grad_norm": 0.5064609836837177, + "learning_rate": 1.293952230998624e-05, + "loss": 0.3276, + "step": 2731 + }, + { + "epoch": 1.3053581028612389, + "grad_norm": 0.5400667483073638, + "learning_rate": 1.2934487874970686e-05, + "loss": 0.3254, + "step": 2732 + }, + { + "epoch": 1.305835971566812, + "grad_norm": 0.8384923478396311, + "learning_rate": 1.2929452625979966e-05, + "loss": 0.3323, + "step": 2733 + }, + { + "epoch": 1.3063138402723853, + "grad_norm": 0.5025088436289159, + "learning_rate": 1.2924416564410755e-05, + "loss": 0.3337, + "step": 2734 + }, + { + "epoch": 1.3067917089779584, + "grad_norm": 0.5521316915977825, + "learning_rate": 1.2919379691659979e-05, + "loss": 0.3338, + "step": 2735 + }, + { + "epoch": 1.3072695776835315, + "grad_norm": 0.5297751581493867, + "learning_rate": 1.2914342009124777e-05, + "loss": 0.3064, + "step": 2736 + }, + { + "epoch": 1.3077474463891046, + "grad_norm": 0.5163194639773859, + "learning_rate": 1.2909303518202502e-05, + "loss": 0.3227, + "step": 2737 + }, + { + "epoch": 1.3082253150946777, + "grad_norm": 0.48736656477750007, + "learning_rate": 1.2904264220290755e-05, + "loss": 0.3385, + "step": 2738 + }, + { + "epoch": 1.308703183800251, + "grad_norm": 0.5131870252317862, + "learning_rate": 1.2899224116787345e-05, + "loss": 0.3316, + "step": 2739 + }, + { + "epoch": 1.309181052505824, + "grad_norm": 0.5101460923469608, + "learning_rate": 1.2894183209090304e-05, + "loss": 0.3302, + "step": 2740 + }, + { + "epoch": 1.3096589212113972, + "grad_norm": 0.6483849369195238, + "learning_rate": 1.2889141498597893e-05, + "loss": 0.3144, + "step": 2741 + }, + { + "epoch": 1.3101367899169702, + "grad_norm": 0.4884644846621732, + "learning_rate": 1.2884098986708598e-05, + "loss": 0.3326, + "step": 2742 + }, + { + "epoch": 1.3106146586225433, + "grad_norm": 0.49766916739509637, + "learning_rate": 1.2879055674821112e-05, + "loss": 0.346, + "step": 2743 + }, + { + "epoch": 1.3110925273281167, + "grad_norm": 0.505704966484551, + "learning_rate": 1.2874011564334372e-05, + "loss": 0.3242, + "step": 2744 + }, + { + "epoch": 1.3115703960336897, + "grad_norm": 0.49985444305963106, + "learning_rate": 1.2868966656647522e-05, + "loss": 0.3169, + "step": 2745 + }, + { + "epoch": 1.3120482647392628, + "grad_norm": 0.4722113638853128, + "learning_rate": 1.2863920953159925e-05, + "loss": 0.3359, + "step": 2746 + }, + { + "epoch": 1.3125261334448362, + "grad_norm": 0.5155161827129077, + "learning_rate": 1.2858874455271175e-05, + "loss": 0.3321, + "step": 2747 + }, + { + "epoch": 1.3130040021504092, + "grad_norm": 0.5850743401779861, + "learning_rate": 1.2853827164381083e-05, + "loss": 0.3404, + "step": 2748 + }, + { + "epoch": 1.3134818708559823, + "grad_norm": 0.47900054565538824, + "learning_rate": 1.2848779081889675e-05, + "loss": 0.3299, + "step": 2749 + }, + { + "epoch": 1.3139597395615554, + "grad_norm": 0.4846014694563064, + "learning_rate": 1.2843730209197203e-05, + "loss": 0.317, + "step": 2750 + }, + { + "epoch": 1.3144376082671285, + "grad_norm": 0.46319015268026265, + "learning_rate": 1.283868054770413e-05, + "loss": 0.3649, + "step": 2751 + }, + { + "epoch": 1.3149154769727018, + "grad_norm": 0.492209610537854, + "learning_rate": 1.2833630098811148e-05, + "loss": 0.3354, + "step": 2752 + }, + { + "epoch": 1.315393345678275, + "grad_norm": 0.5469966096817243, + "learning_rate": 1.2828578863919163e-05, + "loss": 0.3318, + "step": 2753 + }, + { + "epoch": 1.315871214383848, + "grad_norm": 0.7017701345315566, + "learning_rate": 1.2823526844429295e-05, + "loss": 0.343, + "step": 2754 + }, + { + "epoch": 1.3163490830894211, + "grad_norm": 0.4870050694221161, + "learning_rate": 1.2818474041742885e-05, + "loss": 0.3287, + "step": 2755 + }, + { + "epoch": 1.3168269517949942, + "grad_norm": 0.48028268862938356, + "learning_rate": 1.2813420457261497e-05, + "loss": 0.325, + "step": 2756 + }, + { + "epoch": 1.3173048205005675, + "grad_norm": 0.47555681810939554, + "learning_rate": 1.2808366092386896e-05, + "loss": 0.3383, + "step": 2757 + }, + { + "epoch": 1.3177826892061406, + "grad_norm": 0.47978915398997984, + "learning_rate": 1.2803310948521083e-05, + "loss": 0.3344, + "step": 2758 + }, + { + "epoch": 1.3182605579117137, + "grad_norm": 0.5676291606710551, + "learning_rate": 1.2798255027066265e-05, + "loss": 0.3367, + "step": 2759 + }, + { + "epoch": 1.318738426617287, + "grad_norm": 0.47898495928629903, + "learning_rate": 1.2793198329424858e-05, + "loss": 0.344, + "step": 2760 + }, + { + "epoch": 1.31921629532286, + "grad_norm": 0.6291553763138783, + "learning_rate": 1.278814085699951e-05, + "loss": 0.3288, + "step": 2761 + }, + { + "epoch": 1.3196941640284332, + "grad_norm": 0.5128423783465226, + "learning_rate": 1.2783082611193068e-05, + "loss": 0.3326, + "step": 2762 + }, + { + "epoch": 1.3201720327340063, + "grad_norm": 0.4966607745937759, + "learning_rate": 1.2778023593408601e-05, + "loss": 0.335, + "step": 2763 + }, + { + "epoch": 1.3206499014395794, + "grad_norm": 0.4621870832599053, + "learning_rate": 1.2772963805049395e-05, + "loss": 0.3169, + "step": 2764 + }, + { + "epoch": 1.3211277701451527, + "grad_norm": 0.5336568579759654, + "learning_rate": 1.2767903247518945e-05, + "loss": 0.3197, + "step": 2765 + }, + { + "epoch": 1.3216056388507258, + "grad_norm": 0.4998758991079805, + "learning_rate": 1.2762841922220956e-05, + "loss": 0.3101, + "step": 2766 + }, + { + "epoch": 1.3220835075562989, + "grad_norm": 0.5061171116591409, + "learning_rate": 1.2757779830559353e-05, + "loss": 0.3191, + "step": 2767 + }, + { + "epoch": 1.322561376261872, + "grad_norm": 0.5779265310597284, + "learning_rate": 1.2752716973938272e-05, + "loss": 0.3102, + "step": 2768 + }, + { + "epoch": 1.323039244967445, + "grad_norm": 0.4806329769254405, + "learning_rate": 1.274765335376206e-05, + "loss": 0.3441, + "step": 2769 + }, + { + "epoch": 1.3235171136730184, + "grad_norm": 0.5262517952171601, + "learning_rate": 1.2742588971435276e-05, + "loss": 0.3289, + "step": 2770 + }, + { + "epoch": 1.3239949823785915, + "grad_norm": 0.5469131458946188, + "learning_rate": 1.273752382836269e-05, + "loss": 0.3281, + "step": 2771 + }, + { + "epoch": 1.3244728510841646, + "grad_norm": 0.4981852013260321, + "learning_rate": 1.2732457925949282e-05, + "loss": 0.3339, + "step": 2772 + }, + { + "epoch": 1.3249507197897379, + "grad_norm": 0.4995218099055462, + "learning_rate": 1.2727391265600248e-05, + "loss": 0.3217, + "step": 2773 + }, + { + "epoch": 1.325428588495311, + "grad_norm": 0.48442166594011293, + "learning_rate": 1.2722323848720985e-05, + "loss": 0.3198, + "step": 2774 + }, + { + "epoch": 1.325906457200884, + "grad_norm": 0.5288357240858039, + "learning_rate": 1.2717255676717106e-05, + "loss": 0.3265, + "step": 2775 + }, + { + "epoch": 1.3263843259064572, + "grad_norm": 0.47363656909947727, + "learning_rate": 1.2712186750994437e-05, + "loss": 0.3144, + "step": 2776 + }, + { + "epoch": 1.3268621946120303, + "grad_norm": 0.5621473769140592, + "learning_rate": 1.2707117072959004e-05, + "loss": 0.3201, + "step": 2777 + }, + { + "epoch": 1.3273400633176036, + "grad_norm": 0.516657131750651, + "learning_rate": 1.2702046644017045e-05, + "loss": 0.3407, + "step": 2778 + }, + { + "epoch": 1.3278179320231767, + "grad_norm": 0.529619985203502, + "learning_rate": 1.2696975465575016e-05, + "loss": 0.3256, + "step": 2779 + }, + { + "epoch": 1.3282958007287498, + "grad_norm": 0.5434444774186435, + "learning_rate": 1.2691903539039563e-05, + "loss": 0.3228, + "step": 2780 + }, + { + "epoch": 1.3287736694343228, + "grad_norm": 0.565815798825105, + "learning_rate": 1.2686830865817552e-05, + "loss": 0.3327, + "step": 2781 + }, + { + "epoch": 1.329251538139896, + "grad_norm": 0.5113159211723257, + "learning_rate": 1.2681757447316057e-05, + "loss": 0.336, + "step": 2782 + }, + { + "epoch": 1.3297294068454693, + "grad_norm": 0.48646729323156335, + "learning_rate": 1.2676683284942348e-05, + "loss": 0.3377, + "step": 2783 + }, + { + "epoch": 1.3302072755510423, + "grad_norm": 0.4730669634348989, + "learning_rate": 1.267160838010391e-05, + "loss": 0.3511, + "step": 2784 + }, + { + "epoch": 1.3306851442566154, + "grad_norm": 0.6446175620584933, + "learning_rate": 1.2666532734208437e-05, + "loss": 0.3382, + "step": 2785 + }, + { + "epoch": 1.3311630129621888, + "grad_norm": 0.5040172790962374, + "learning_rate": 1.2661456348663822e-05, + "loss": 0.3291, + "step": 2786 + }, + { + "epoch": 1.3316408816677618, + "grad_norm": 0.6517602921578377, + "learning_rate": 1.265637922487816e-05, + "loss": 0.3216, + "step": 2787 + }, + { + "epoch": 1.332118750373335, + "grad_norm": 0.4648517246826999, + "learning_rate": 1.2651301364259761e-05, + "loss": 0.3355, + "step": 2788 + }, + { + "epoch": 1.332596619078908, + "grad_norm": 0.48877805021759047, + "learning_rate": 1.2646222768217129e-05, + "loss": 0.3331, + "step": 2789 + }, + { + "epoch": 1.3330744877844811, + "grad_norm": 0.4820080738115575, + "learning_rate": 1.264114343815898e-05, + "loss": 0.3148, + "step": 2790 + }, + { + "epoch": 1.3335523564900544, + "grad_norm": 0.4842200928813744, + "learning_rate": 1.2636063375494233e-05, + "loss": 0.3271, + "step": 2791 + }, + { + "epoch": 1.3340302251956275, + "grad_norm": 0.48858773566706376, + "learning_rate": 1.2630982581632003e-05, + "loss": 0.3306, + "step": 2792 + }, + { + "epoch": 1.3345080939012006, + "grad_norm": 0.49277764185033907, + "learning_rate": 1.2625901057981613e-05, + "loss": 0.3288, + "step": 2793 + }, + { + "epoch": 1.3349859626067737, + "grad_norm": 0.5015471955508418, + "learning_rate": 1.2620818805952595e-05, + "loss": 0.315, + "step": 2794 + }, + { + "epoch": 1.3354638313123468, + "grad_norm": 0.4703720797913821, + "learning_rate": 1.2615735826954664e-05, + "loss": 0.3377, + "step": 2795 + }, + { + "epoch": 1.3359417000179201, + "grad_norm": 0.6095651211109612, + "learning_rate": 1.2610652122397762e-05, + "loss": 0.3285, + "step": 2796 + }, + { + "epoch": 1.3364195687234932, + "grad_norm": 0.4528886888495506, + "learning_rate": 1.260556769369201e-05, + "loss": 0.3222, + "step": 2797 + }, + { + "epoch": 1.3368974374290663, + "grad_norm": 0.46151024325444917, + "learning_rate": 1.2600482542247738e-05, + "loss": 0.3311, + "step": 2798 + }, + { + "epoch": 1.3373753061346396, + "grad_norm": 0.5144167233653287, + "learning_rate": 1.2595396669475486e-05, + "loss": 0.3292, + "step": 2799 + }, + { + "epoch": 1.3378531748402127, + "grad_norm": 0.5408339927194712, + "learning_rate": 1.2590310076785974e-05, + "loss": 0.3179, + "step": 2800 + }, + { + "epoch": 1.3383310435457858, + "grad_norm": 0.47650637067616053, + "learning_rate": 1.258522276559014e-05, + "loss": 0.3355, + "step": 2801 + }, + { + "epoch": 1.338808912251359, + "grad_norm": 0.525246628210698, + "learning_rate": 1.2580134737299117e-05, + "loss": 0.3481, + "step": 2802 + }, + { + "epoch": 1.339286780956932, + "grad_norm": 0.47691152981429813, + "learning_rate": 1.2575045993324227e-05, + "loss": 0.3122, + "step": 2803 + }, + { + "epoch": 1.3397646496625053, + "grad_norm": 0.5092137433283072, + "learning_rate": 1.2569956535077004e-05, + "loss": 0.3324, + "step": 2804 + }, + { + "epoch": 1.3402425183680784, + "grad_norm": 0.4952198893971536, + "learning_rate": 1.256486636396917e-05, + "loss": 0.338, + "step": 2805 + }, + { + "epoch": 1.3407203870736515, + "grad_norm": 0.49148265729272733, + "learning_rate": 1.255977548141265e-05, + "loss": 0.3133, + "step": 2806 + }, + { + "epoch": 1.3411982557792246, + "grad_norm": 0.5396008187377243, + "learning_rate": 1.2554683888819565e-05, + "loss": 0.3187, + "step": 2807 + }, + { + "epoch": 1.3416761244847977, + "grad_norm": 0.4824817484621732, + "learning_rate": 1.2549591587602237e-05, + "loss": 0.3304, + "step": 2808 + }, + { + "epoch": 1.342153993190371, + "grad_norm": 0.5081977637354367, + "learning_rate": 1.2544498579173172e-05, + "loss": 0.3142, + "step": 2809 + }, + { + "epoch": 1.342631861895944, + "grad_norm": 0.5058206945057618, + "learning_rate": 1.2539404864945087e-05, + "loss": 0.317, + "step": 2810 + }, + { + "epoch": 1.3431097306015172, + "grad_norm": 0.5153353255064529, + "learning_rate": 1.2534310446330888e-05, + "loss": 0.3176, + "step": 2811 + }, + { + "epoch": 1.3435875993070905, + "grad_norm": 0.5318910708535983, + "learning_rate": 1.2529215324743673e-05, + "loss": 0.308, + "step": 2812 + }, + { + "epoch": 1.3440654680126636, + "grad_norm": 0.496994923710053, + "learning_rate": 1.2524119501596743e-05, + "loss": 0.3226, + "step": 2813 + }, + { + "epoch": 1.3445433367182367, + "grad_norm": 0.5165301727279924, + "learning_rate": 1.251902297830359e-05, + "loss": 0.336, + "step": 2814 + }, + { + "epoch": 1.3450212054238098, + "grad_norm": 0.5340709701111669, + "learning_rate": 1.2513925756277894e-05, + "loss": 0.3234, + "step": 2815 + }, + { + "epoch": 1.3454990741293829, + "grad_norm": 0.4828618104957075, + "learning_rate": 1.250882783693354e-05, + "loss": 0.3006, + "step": 2816 + }, + { + "epoch": 1.3459769428349562, + "grad_norm": 0.4785810307178594, + "learning_rate": 1.25037292216846e-05, + "loss": 0.3301, + "step": 2817 + }, + { + "epoch": 1.3464548115405293, + "grad_norm": 0.5032661037822173, + "learning_rate": 1.2498629911945333e-05, + "loss": 0.3262, + "step": 2818 + }, + { + "epoch": 1.3469326802461024, + "grad_norm": 0.48799683567616237, + "learning_rate": 1.249352990913021e-05, + "loss": 0.3166, + "step": 2819 + }, + { + "epoch": 1.3474105489516754, + "grad_norm": 0.5225514388182845, + "learning_rate": 1.2488429214653871e-05, + "loss": 0.3338, + "step": 2820 + }, + { + "epoch": 1.3478884176572485, + "grad_norm": 0.4688356836854529, + "learning_rate": 1.2483327829931167e-05, + "loss": 0.3117, + "step": 2821 + }, + { + "epoch": 1.3483662863628219, + "grad_norm": 0.5582134330183454, + "learning_rate": 1.2478225756377127e-05, + "loss": 0.3239, + "step": 2822 + }, + { + "epoch": 1.348844155068395, + "grad_norm": 0.4909856374763454, + "learning_rate": 1.2473122995406976e-05, + "loss": 0.3374, + "step": 2823 + }, + { + "epoch": 1.349322023773968, + "grad_norm": 0.5121950978077848, + "learning_rate": 1.2468019548436132e-05, + "loss": 0.3232, + "step": 2824 + }, + { + "epoch": 1.3497998924795414, + "grad_norm": 0.5347242883406231, + "learning_rate": 1.24629154168802e-05, + "loss": 0.3201, + "step": 2825 + }, + { + "epoch": 1.3502777611851144, + "grad_norm": 0.7886377488771786, + "learning_rate": 1.2457810602154975e-05, + "loss": 0.3208, + "step": 2826 + }, + { + "epoch": 1.3507556298906875, + "grad_norm": 0.48634722610155756, + "learning_rate": 1.2452705105676448e-05, + "loss": 0.3164, + "step": 2827 + }, + { + "epoch": 1.3512334985962606, + "grad_norm": 0.49071957483260065, + "learning_rate": 1.244759892886079e-05, + "loss": 0.3365, + "step": 2828 + }, + { + "epoch": 1.3517113673018337, + "grad_norm": 0.4953873498833253, + "learning_rate": 1.2442492073124359e-05, + "loss": 0.321, + "step": 2829 + }, + { + "epoch": 1.352189236007407, + "grad_norm": 0.5251226282353867, + "learning_rate": 1.2437384539883715e-05, + "loss": 0.3327, + "step": 2830 + }, + { + "epoch": 1.3526671047129801, + "grad_norm": 0.5013267329927095, + "learning_rate": 1.2432276330555592e-05, + "loss": 0.3445, + "step": 2831 + }, + { + "epoch": 1.3531449734185532, + "grad_norm": 0.8028493838704872, + "learning_rate": 1.2427167446556922e-05, + "loss": 0.3166, + "step": 2832 + }, + { + "epoch": 1.3536228421241263, + "grad_norm": 0.6378917954070072, + "learning_rate": 1.2422057889304814e-05, + "loss": 0.3396, + "step": 2833 + }, + { + "epoch": 1.3541007108296994, + "grad_norm": 0.4859271357599526, + "learning_rate": 1.2416947660216576e-05, + "loss": 0.3371, + "step": 2834 + }, + { + "epoch": 1.3545785795352727, + "grad_norm": 0.5227282184131874, + "learning_rate": 1.2411836760709686e-05, + "loss": 0.327, + "step": 2835 + }, + { + "epoch": 1.3550564482408458, + "grad_norm": 0.5208799708860445, + "learning_rate": 1.2406725192201828e-05, + "loss": 0.3254, + "step": 2836 + }, + { + "epoch": 1.355534316946419, + "grad_norm": 0.4795223980334988, + "learning_rate": 1.2401612956110853e-05, + "loss": 0.3111, + "step": 2837 + }, + { + "epoch": 1.3560121856519922, + "grad_norm": 0.4963240280920323, + "learning_rate": 1.2396500053854808e-05, + "loss": 0.3259, + "step": 2838 + }, + { + "epoch": 1.3564900543575653, + "grad_norm": 0.5039449229996626, + "learning_rate": 1.2391386486851922e-05, + "loss": 0.3197, + "step": 2839 + }, + { + "epoch": 1.3569679230631384, + "grad_norm": 0.5317251552116927, + "learning_rate": 1.2386272256520606e-05, + "loss": 0.3197, + "step": 2840 + }, + { + "epoch": 1.3574457917687115, + "grad_norm": 0.48744894671494793, + "learning_rate": 1.2381157364279462e-05, + "loss": 0.3035, + "step": 2841 + }, + { + "epoch": 1.3579236604742846, + "grad_norm": 0.5025271430359258, + "learning_rate": 1.2376041811547268e-05, + "loss": 0.3314, + "step": 2842 + }, + { + "epoch": 1.358401529179858, + "grad_norm": 0.5167243594092309, + "learning_rate": 1.2370925599742987e-05, + "loss": 0.3226, + "step": 2843 + }, + { + "epoch": 1.358879397885431, + "grad_norm": 0.525633231146885, + "learning_rate": 1.2365808730285764e-05, + "loss": 0.3359, + "step": 2844 + }, + { + "epoch": 1.359357266591004, + "grad_norm": 0.4991777969329675, + "learning_rate": 1.2360691204594937e-05, + "loss": 0.3209, + "step": 2845 + }, + { + "epoch": 1.3598351352965774, + "grad_norm": 0.5286964966712626, + "learning_rate": 1.2355573024090009e-05, + "loss": 0.3243, + "step": 2846 + }, + { + "epoch": 1.3603130040021503, + "grad_norm": 0.48540137645110326, + "learning_rate": 1.2350454190190675e-05, + "loss": 0.3146, + "step": 2847 + }, + { + "epoch": 1.3607908727077236, + "grad_norm": 0.526573346210916, + "learning_rate": 1.2345334704316811e-05, + "loss": 0.3272, + "step": 2848 + }, + { + "epoch": 1.3612687414132967, + "grad_norm": 0.5248630946951779, + "learning_rate": 1.2340214567888472e-05, + "loss": 0.3318, + "step": 2849 + }, + { + "epoch": 1.3617466101188698, + "grad_norm": 0.49262488475200295, + "learning_rate": 1.2335093782325889e-05, + "loss": 0.3335, + "step": 2850 + }, + { + "epoch": 1.362224478824443, + "grad_norm": 0.47664312818269816, + "learning_rate": 1.2329972349049481e-05, + "loss": 0.3357, + "step": 2851 + }, + { + "epoch": 1.3627023475300162, + "grad_norm": 0.49333473606642414, + "learning_rate": 1.2324850269479847e-05, + "loss": 0.3193, + "step": 2852 + }, + { + "epoch": 1.3631802162355893, + "grad_norm": 0.5352064541201343, + "learning_rate": 1.2319727545037753e-05, + "loss": 0.3372, + "step": 2853 + }, + { + "epoch": 1.3636580849411624, + "grad_norm": 0.5155080301298953, + "learning_rate": 1.2314604177144164e-05, + "loss": 0.3165, + "step": 2854 + }, + { + "epoch": 1.3641359536467355, + "grad_norm": 0.5556617290050194, + "learning_rate": 1.2309480167220203e-05, + "loss": 0.3062, + "step": 2855 + }, + { + "epoch": 1.3646138223523088, + "grad_norm": 0.5075291890940449, + "learning_rate": 1.230435551668718e-05, + "loss": 0.3345, + "step": 2856 + }, + { + "epoch": 1.3650916910578819, + "grad_norm": 0.5139412287417211, + "learning_rate": 1.2299230226966587e-05, + "loss": 0.3351, + "step": 2857 + }, + { + "epoch": 1.365569559763455, + "grad_norm": 0.5335170451249204, + "learning_rate": 1.2294104299480085e-05, + "loss": 0.3316, + "step": 2858 + }, + { + "epoch": 1.3660474284690283, + "grad_norm": 0.5623627174173891, + "learning_rate": 1.2288977735649518e-05, + "loss": 0.3182, + "step": 2859 + }, + { + "epoch": 1.3665252971746014, + "grad_norm": 0.48937860284820955, + "learning_rate": 1.2283850536896907e-05, + "loss": 0.3301, + "step": 2860 + }, + { + "epoch": 1.3670031658801745, + "grad_norm": 0.4993453513384397, + "learning_rate": 1.2278722704644439e-05, + "loss": 0.3349, + "step": 2861 + }, + { + "epoch": 1.3674810345857475, + "grad_norm": 0.5215921358568462, + "learning_rate": 1.227359424031449e-05, + "loss": 0.3166, + "step": 2862 + }, + { + "epoch": 1.3679589032913206, + "grad_norm": 0.473824073344398, + "learning_rate": 1.2268465145329607e-05, + "loss": 0.329, + "step": 2863 + }, + { + "epoch": 1.368436771996894, + "grad_norm": 0.5100010802249643, + "learning_rate": 1.2263335421112505e-05, + "loss": 0.3178, + "step": 2864 + }, + { + "epoch": 1.368914640702467, + "grad_norm": 0.49849508667435044, + "learning_rate": 1.2258205069086082e-05, + "loss": 0.3331, + "step": 2865 + }, + { + "epoch": 1.3693925094080401, + "grad_norm": 0.5169260084417553, + "learning_rate": 1.2253074090673408e-05, + "loss": 0.3419, + "step": 2866 + }, + { + "epoch": 1.3698703781136132, + "grad_norm": 0.4756763753549606, + "learning_rate": 1.2247942487297724e-05, + "loss": 0.3263, + "step": 2867 + }, + { + "epoch": 1.3703482468191863, + "grad_norm": 0.492096668882402, + "learning_rate": 1.2242810260382446e-05, + "loss": 0.3301, + "step": 2868 + }, + { + "epoch": 1.3708261155247596, + "grad_norm": 0.49548035638201127, + "learning_rate": 1.2237677411351165e-05, + "loss": 0.3235, + "step": 2869 + }, + { + "epoch": 1.3713039842303327, + "grad_norm": 0.5095279037269734, + "learning_rate": 1.2232543941627641e-05, + "loss": 0.3342, + "step": 2870 + }, + { + "epoch": 1.3717818529359058, + "grad_norm": 0.45819233635275786, + "learning_rate": 1.2227409852635811e-05, + "loss": 0.3265, + "step": 2871 + }, + { + "epoch": 1.3722597216414791, + "grad_norm": 0.4923000662072581, + "learning_rate": 1.2222275145799778e-05, + "loss": 0.3217, + "step": 2872 + }, + { + "epoch": 1.3727375903470522, + "grad_norm": 0.5112507834075313, + "learning_rate": 1.2217139822543819e-05, + "loss": 0.3066, + "step": 2873 + }, + { + "epoch": 1.3732154590526253, + "grad_norm": 0.4628910419474185, + "learning_rate": 1.2212003884292388e-05, + "loss": 0.316, + "step": 2874 + }, + { + "epoch": 1.3736933277581984, + "grad_norm": 0.45732279246199825, + "learning_rate": 1.2206867332470091e-05, + "loss": 0.3165, + "step": 2875 + }, + { + "epoch": 1.3741711964637715, + "grad_norm": 0.4936353320658925, + "learning_rate": 1.2201730168501729e-05, + "loss": 0.3251, + "step": 2876 + }, + { + "epoch": 1.3746490651693448, + "grad_norm": 0.48905812990241215, + "learning_rate": 1.2196592393812257e-05, + "loss": 0.3374, + "step": 2877 + }, + { + "epoch": 1.375126933874918, + "grad_norm": 0.45753365931472645, + "learning_rate": 1.2191454009826798e-05, + "loss": 0.3242, + "step": 2878 + }, + { + "epoch": 1.375604802580491, + "grad_norm": 0.5299641397547704, + "learning_rate": 1.2186315017970656e-05, + "loss": 0.3065, + "step": 2879 + }, + { + "epoch": 1.376082671286064, + "grad_norm": 0.4839322939919366, + "learning_rate": 1.2181175419669293e-05, + "loss": 0.317, + "step": 2880 + }, + { + "epoch": 1.3765605399916372, + "grad_norm": 0.48836787664932607, + "learning_rate": 1.2176035216348345e-05, + "loss": 0.3293, + "step": 2881 + }, + { + "epoch": 1.3770384086972105, + "grad_norm": 0.49187662653450037, + "learning_rate": 1.2170894409433612e-05, + "loss": 0.3438, + "step": 2882 + }, + { + "epoch": 1.3775162774027836, + "grad_norm": 0.4818132985229131, + "learning_rate": 1.2165753000351064e-05, + "loss": 0.3338, + "step": 2883 + }, + { + "epoch": 1.3779941461083567, + "grad_norm": 0.4742707254068882, + "learning_rate": 1.2160610990526836e-05, + "loss": 0.3123, + "step": 2884 + }, + { + "epoch": 1.37847201481393, + "grad_norm": 0.475050796561632, + "learning_rate": 1.215546838138723e-05, + "loss": 0.3223, + "step": 2885 + }, + { + "epoch": 1.378949883519503, + "grad_norm": 0.5011042293446867, + "learning_rate": 1.215032517435872e-05, + "loss": 0.3261, + "step": 2886 + }, + { + "epoch": 1.3794277522250762, + "grad_norm": 0.483985527438474, + "learning_rate": 1.2145181370867936e-05, + "loss": 0.3186, + "step": 2887 + }, + { + "epoch": 1.3799056209306493, + "grad_norm": 0.4541289680253298, + "learning_rate": 1.2140036972341683e-05, + "loss": 0.3343, + "step": 2888 + }, + { + "epoch": 1.3803834896362224, + "grad_norm": 0.5073229100491303, + "learning_rate": 1.213489198020692e-05, + "loss": 0.331, + "step": 2889 + }, + { + "epoch": 1.3808613583417957, + "grad_norm": 0.5173324784175866, + "learning_rate": 1.212974639589078e-05, + "loss": 0.3313, + "step": 2890 + }, + { + "epoch": 1.3813392270473688, + "grad_norm": 0.4874640856604241, + "learning_rate": 1.2124600220820562e-05, + "loss": 0.3372, + "step": 2891 + }, + { + "epoch": 1.3818170957529419, + "grad_norm": 0.4805367156153807, + "learning_rate": 1.2119453456423718e-05, + "loss": 0.3448, + "step": 2892 + }, + { + "epoch": 1.382294964458515, + "grad_norm": 0.48556515258960525, + "learning_rate": 1.211430610412787e-05, + "loss": 0.3233, + "step": 2893 + }, + { + "epoch": 1.382772833164088, + "grad_norm": 0.5315828725085151, + "learning_rate": 1.2109158165360805e-05, + "loss": 0.3142, + "step": 2894 + }, + { + "epoch": 1.3832507018696614, + "grad_norm": 0.48141420283737696, + "learning_rate": 1.2104009641550472e-05, + "loss": 0.3218, + "step": 2895 + }, + { + "epoch": 1.3837285705752345, + "grad_norm": 0.4990446188800693, + "learning_rate": 1.2098860534124976e-05, + "loss": 0.3286, + "step": 2896 + }, + { + "epoch": 1.3842064392808076, + "grad_norm": 0.49905032499820146, + "learning_rate": 1.2093710844512594e-05, + "loss": 0.3273, + "step": 2897 + }, + { + "epoch": 1.3846843079863809, + "grad_norm": 0.5206266774252051, + "learning_rate": 1.2088560574141754e-05, + "loss": 0.3233, + "step": 2898 + }, + { + "epoch": 1.385162176691954, + "grad_norm": 0.47976992992949313, + "learning_rate": 1.2083409724441054e-05, + "loss": 0.3318, + "step": 2899 + }, + { + "epoch": 1.385640045397527, + "grad_norm": 0.4858512164454944, + "learning_rate": 1.2078258296839245e-05, + "loss": 0.3497, + "step": 2900 + }, + { + "epoch": 1.3861179141031001, + "grad_norm": 0.5034404806016024, + "learning_rate": 1.2073106292765247e-05, + "loss": 0.3279, + "step": 2901 + }, + { + "epoch": 1.3865957828086732, + "grad_norm": 0.5079287528998049, + "learning_rate": 1.2067953713648126e-05, + "loss": 0.3344, + "step": 2902 + }, + { + "epoch": 1.3870736515142466, + "grad_norm": 0.47886319969478675, + "learning_rate": 1.206280056091713e-05, + "loss": 0.3384, + "step": 2903 + }, + { + "epoch": 1.3875515202198196, + "grad_norm": 0.5187096003859187, + "learning_rate": 1.2057646836001641e-05, + "loss": 0.3281, + "step": 2904 + }, + { + "epoch": 1.3880293889253927, + "grad_norm": 0.4615577488871791, + "learning_rate": 1.2052492540331218e-05, + "loss": 0.3235, + "step": 2905 + }, + { + "epoch": 1.3885072576309658, + "grad_norm": 0.48313478967101564, + "learning_rate": 1.2047337675335571e-05, + "loss": 0.3167, + "step": 2906 + }, + { + "epoch": 1.388985126336539, + "grad_norm": 0.48859241370609713, + "learning_rate": 1.2042182242444567e-05, + "loss": 0.3285, + "step": 2907 + }, + { + "epoch": 1.3894629950421122, + "grad_norm": 0.494594083600188, + "learning_rate": 1.203702624308823e-05, + "loss": 0.3207, + "step": 2908 + }, + { + "epoch": 1.3899408637476853, + "grad_norm": 0.46974833583616593, + "learning_rate": 1.2031869678696748e-05, + "loss": 0.3247, + "step": 2909 + }, + { + "epoch": 1.3904187324532584, + "grad_norm": 0.5040204897201126, + "learning_rate": 1.2026712550700457e-05, + "loss": 0.3303, + "step": 2910 + }, + { + "epoch": 1.3908966011588317, + "grad_norm": 0.47910736507397716, + "learning_rate": 1.2021554860529856e-05, + "loss": 0.314, + "step": 2911 + }, + { + "epoch": 1.3913744698644048, + "grad_norm": 0.4908900810546827, + "learning_rate": 1.2016396609615597e-05, + "loss": 0.3191, + "step": 2912 + }, + { + "epoch": 1.391852338569978, + "grad_norm": 0.4902328178847632, + "learning_rate": 1.2011237799388486e-05, + "loss": 0.3063, + "step": 2913 + }, + { + "epoch": 1.392330207275551, + "grad_norm": 0.48448162975602765, + "learning_rate": 1.2006078431279486e-05, + "loss": 0.334, + "step": 2914 + }, + { + "epoch": 1.392808075981124, + "grad_norm": 0.4975553113261312, + "learning_rate": 1.200091850671972e-05, + "loss": 0.3086, + "step": 2915 + }, + { + "epoch": 1.3932859446866974, + "grad_norm": 0.47682406789705983, + "learning_rate": 1.1995758027140451e-05, + "loss": 0.3267, + "step": 2916 + }, + { + "epoch": 1.3937638133922705, + "grad_norm": 0.4874223745730435, + "learning_rate": 1.1990596993973112e-05, + "loss": 0.3119, + "step": 2917 + }, + { + "epoch": 1.3942416820978436, + "grad_norm": 0.4853795113192791, + "learning_rate": 1.1985435408649281e-05, + "loss": 0.3346, + "step": 2918 + }, + { + "epoch": 1.3947195508034167, + "grad_norm": 0.46269214125213426, + "learning_rate": 1.1980273272600687e-05, + "loss": 0.3341, + "step": 2919 + }, + { + "epoch": 1.3951974195089898, + "grad_norm": 0.5053092201197434, + "learning_rate": 1.1975110587259222e-05, + "loss": 0.3229, + "step": 2920 + }, + { + "epoch": 1.395675288214563, + "grad_norm": 0.4944091898145227, + "learning_rate": 1.1969947354056918e-05, + "loss": 0.3285, + "step": 2921 + }, + { + "epoch": 1.3961531569201362, + "grad_norm": 0.49386779198974684, + "learning_rate": 1.1964783574425969e-05, + "loss": 0.3212, + "step": 2922 + }, + { + "epoch": 1.3966310256257093, + "grad_norm": 0.5269873184728784, + "learning_rate": 1.1959619249798717e-05, + "loss": 0.3294, + "step": 2923 + }, + { + "epoch": 1.3971088943312826, + "grad_norm": 0.46298505676389556, + "learning_rate": 1.1954454381607648e-05, + "loss": 0.3138, + "step": 2924 + }, + { + "epoch": 1.3975867630368557, + "grad_norm": 0.5451763604619123, + "learning_rate": 1.1949288971285411e-05, + "loss": 0.3109, + "step": 2925 + }, + { + "epoch": 1.3980646317424288, + "grad_norm": 0.47048163898437384, + "learning_rate": 1.19441230202648e-05, + "loss": 0.3332, + "step": 2926 + }, + { + "epoch": 1.3985425004480019, + "grad_norm": 0.5134522375456477, + "learning_rate": 1.1938956529978754e-05, + "loss": 0.3209, + "step": 2927 + }, + { + "epoch": 1.399020369153575, + "grad_norm": 0.5437669027850316, + "learning_rate": 1.1933789501860371e-05, + "loss": 0.3169, + "step": 2928 + }, + { + "epoch": 1.3994982378591483, + "grad_norm": 0.4756199727019128, + "learning_rate": 1.192862193734289e-05, + "loss": 0.3175, + "step": 2929 + }, + { + "epoch": 1.3999761065647214, + "grad_norm": 0.5356791744713957, + "learning_rate": 1.1923453837859706e-05, + "loss": 0.3342, + "step": 2930 + }, + { + "epoch": 1.4004539752702945, + "grad_norm": 0.5132793736491843, + "learning_rate": 1.1918285204844355e-05, + "loss": 0.3226, + "step": 2931 + }, + { + "epoch": 1.4009318439758676, + "grad_norm": 0.4654901065559778, + "learning_rate": 1.1913116039730528e-05, + "loss": 0.3326, + "step": 2932 + }, + { + "epoch": 1.4014097126814407, + "grad_norm": 0.5200095543826778, + "learning_rate": 1.1907946343952057e-05, + "loss": 0.321, + "step": 2933 + }, + { + "epoch": 1.401887581387014, + "grad_norm": 0.4772540680937219, + "learning_rate": 1.1902776118942924e-05, + "loss": 0.3287, + "step": 2934 + }, + { + "epoch": 1.402365450092587, + "grad_norm": 0.7737307275643527, + "learning_rate": 1.1897605366137264e-05, + "loss": 0.3153, + "step": 2935 + }, + { + "epoch": 1.4028433187981602, + "grad_norm": 0.5148598861162687, + "learning_rate": 1.1892434086969343e-05, + "loss": 0.3237, + "step": 2936 + }, + { + "epoch": 1.4033211875037335, + "grad_norm": 0.5176901401278625, + "learning_rate": 1.1887262282873593e-05, + "loss": 0.3143, + "step": 2937 + }, + { + "epoch": 1.4037990562093066, + "grad_norm": 0.5251202606650733, + "learning_rate": 1.1882089955284575e-05, + "loss": 0.3325, + "step": 2938 + }, + { + "epoch": 1.4042769249148797, + "grad_norm": 0.5736546704799379, + "learning_rate": 1.1876917105637e-05, + "loss": 0.317, + "step": 2939 + }, + { + "epoch": 1.4047547936204527, + "grad_norm": 0.5121472986243492, + "learning_rate": 1.1871743735365735e-05, + "loss": 0.33, + "step": 2940 + }, + { + "epoch": 1.4052326623260258, + "grad_norm": 0.51293687234706, + "learning_rate": 1.186656984590577e-05, + "loss": 0.3366, + "step": 2941 + }, + { + "epoch": 1.4057105310315992, + "grad_norm": 0.5289581413398837, + "learning_rate": 1.1861395438692256e-05, + "loss": 0.338, + "step": 2942 + }, + { + "epoch": 1.4061883997371722, + "grad_norm": 0.7060472699769851, + "learning_rate": 1.1856220515160483e-05, + "loss": 0.328, + "step": 2943 + }, + { + "epoch": 1.4066662684427453, + "grad_norm": 0.49082212031808403, + "learning_rate": 1.185104507674588e-05, + "loss": 0.3218, + "step": 2944 + }, + { + "epoch": 1.4071441371483184, + "grad_norm": 0.5266564439767433, + "learning_rate": 1.1845869124884027e-05, + "loss": 0.3254, + "step": 2945 + }, + { + "epoch": 1.4076220058538915, + "grad_norm": 0.4967432730788497, + "learning_rate": 1.1840692661010639e-05, + "loss": 0.3141, + "step": 2946 + }, + { + "epoch": 1.4080998745594648, + "grad_norm": 0.49284978510750543, + "learning_rate": 1.1835515686561574e-05, + "loss": 0.3151, + "step": 2947 + }, + { + "epoch": 1.408577743265038, + "grad_norm": 0.4584745987957341, + "learning_rate": 1.1830338202972838e-05, + "loss": 0.3367, + "step": 2948 + }, + { + "epoch": 1.409055611970611, + "grad_norm": 0.5227828009244597, + "learning_rate": 1.1825160211680571e-05, + "loss": 0.3316, + "step": 2949 + }, + { + "epoch": 1.4095334806761843, + "grad_norm": 1.0116286136942643, + "learning_rate": 1.1819981714121054e-05, + "loss": 0.3457, + "step": 2950 + }, + { + "epoch": 1.4100113493817574, + "grad_norm": 0.508515991330586, + "learning_rate": 1.1814802711730714e-05, + "loss": 0.3234, + "step": 2951 + }, + { + "epoch": 1.4104892180873305, + "grad_norm": 0.4806606845844688, + "learning_rate": 1.1809623205946116e-05, + "loss": 0.3171, + "step": 2952 + }, + { + "epoch": 1.4109670867929036, + "grad_norm": 0.5469751195822524, + "learning_rate": 1.180444319820396e-05, + "loss": 0.3236, + "step": 2953 + }, + { + "epoch": 1.4114449554984767, + "grad_norm": 0.5181364422493109, + "learning_rate": 1.179926268994109e-05, + "loss": 0.33, + "step": 2954 + }, + { + "epoch": 1.41192282420405, + "grad_norm": 0.4780046219803632, + "learning_rate": 1.1794081682594491e-05, + "loss": 0.3224, + "step": 2955 + }, + { + "epoch": 1.4124006929096231, + "grad_norm": 0.4912369040242893, + "learning_rate": 1.178890017760128e-05, + "loss": 0.3153, + "step": 2956 + }, + { + "epoch": 1.4128785616151962, + "grad_norm": 0.4775890106833153, + "learning_rate": 1.1783718176398716e-05, + "loss": 0.3332, + "step": 2957 + }, + { + "epoch": 1.4133564303207693, + "grad_norm": 0.525737532682014, + "learning_rate": 1.1778535680424192e-05, + "loss": 0.314, + "step": 2958 + }, + { + "epoch": 1.4138342990263424, + "grad_norm": 0.47808599069608393, + "learning_rate": 1.1773352691115246e-05, + "loss": 0.3091, + "step": 2959 + }, + { + "epoch": 1.4143121677319157, + "grad_norm": 0.5514083626189912, + "learning_rate": 1.176816920990954e-05, + "loss": 0.3335, + "step": 2960 + }, + { + "epoch": 1.4147900364374888, + "grad_norm": 0.5288775984235308, + "learning_rate": 1.176298523824489e-05, + "loss": 0.3496, + "step": 2961 + }, + { + "epoch": 1.4152679051430619, + "grad_norm": 0.5325892186828176, + "learning_rate": 1.1757800777559232e-05, + "loss": 0.303, + "step": 2962 + }, + { + "epoch": 1.4157457738486352, + "grad_norm": 0.48239236208741576, + "learning_rate": 1.1752615829290644e-05, + "loss": 0.3328, + "step": 2963 + }, + { + "epoch": 1.4162236425542083, + "grad_norm": 0.5050757806977875, + "learning_rate": 1.1747430394877342e-05, + "loss": 0.3391, + "step": 2964 + }, + { + "epoch": 1.4167015112597814, + "grad_norm": 0.5076547010445559, + "learning_rate": 1.174224447575767e-05, + "loss": 0.319, + "step": 2965 + }, + { + "epoch": 1.4171793799653545, + "grad_norm": 0.5138942551079375, + "learning_rate": 1.1737058073370116e-05, + "loss": 0.329, + "step": 2966 + }, + { + "epoch": 1.4176572486709276, + "grad_norm": 0.7215604590019389, + "learning_rate": 1.1731871189153295e-05, + "loss": 0.3423, + "step": 2967 + }, + { + "epoch": 1.4181351173765009, + "grad_norm": 0.4880111433749174, + "learning_rate": 1.1726683824545953e-05, + "loss": 0.3194, + "step": 2968 + }, + { + "epoch": 1.418612986082074, + "grad_norm": 0.49438163754649106, + "learning_rate": 1.1721495980986975e-05, + "loss": 0.3308, + "step": 2969 + }, + { + "epoch": 1.419090854787647, + "grad_norm": 0.5137865619807299, + "learning_rate": 1.171630765991538e-05, + "loss": 0.3181, + "step": 2970 + }, + { + "epoch": 1.4195687234932202, + "grad_norm": 0.5058423220393458, + "learning_rate": 1.1711118862770314e-05, + "loss": 0.3299, + "step": 2971 + }, + { + "epoch": 1.4200465921987933, + "grad_norm": 0.5134808993321874, + "learning_rate": 1.1705929590991062e-05, + "loss": 0.3328, + "step": 2972 + }, + { + "epoch": 1.4205244609043666, + "grad_norm": 0.5311157334978709, + "learning_rate": 1.1700739846017033e-05, + "loss": 0.3254, + "step": 2973 + }, + { + "epoch": 1.4210023296099397, + "grad_norm": 0.47075019738009893, + "learning_rate": 1.169554962928777e-05, + "loss": 0.3283, + "step": 2974 + }, + { + "epoch": 1.4214801983155128, + "grad_norm": 0.5260956963173294, + "learning_rate": 1.169035894224295e-05, + "loss": 0.3304, + "step": 2975 + }, + { + "epoch": 1.421958067021086, + "grad_norm": 0.5540899904553104, + "learning_rate": 1.1685167786322375e-05, + "loss": 0.3338, + "step": 2976 + }, + { + "epoch": 1.4224359357266592, + "grad_norm": 0.5415508885164177, + "learning_rate": 1.1679976162965984e-05, + "loss": 0.3244, + "step": 2977 + }, + { + "epoch": 1.4229138044322323, + "grad_norm": 0.47480665610205475, + "learning_rate": 1.1674784073613841e-05, + "loss": 0.3329, + "step": 2978 + }, + { + "epoch": 1.4233916731378053, + "grad_norm": 0.5732469730028841, + "learning_rate": 1.1669591519706134e-05, + "loss": 0.3413, + "step": 2979 + }, + { + "epoch": 1.4238695418433784, + "grad_norm": 0.49708436697419595, + "learning_rate": 1.1664398502683194e-05, + "loss": 0.3129, + "step": 2980 + }, + { + "epoch": 1.4243474105489518, + "grad_norm": 0.46717071543365596, + "learning_rate": 1.165920502398547e-05, + "loss": 0.3372, + "step": 2981 + }, + { + "epoch": 1.4248252792545248, + "grad_norm": 0.4770094789534314, + "learning_rate": 1.1654011085053537e-05, + "loss": 0.3341, + "step": 2982 + }, + { + "epoch": 1.425303147960098, + "grad_norm": 0.4957722154844253, + "learning_rate": 1.1648816687328104e-05, + "loss": 0.3112, + "step": 2983 + }, + { + "epoch": 1.425781016665671, + "grad_norm": 0.4954214821075703, + "learning_rate": 1.1643621832250012e-05, + "loss": 0.3202, + "step": 2984 + }, + { + "epoch": 1.4262588853712441, + "grad_norm": 0.560090241257771, + "learning_rate": 1.1638426521260211e-05, + "loss": 0.3387, + "step": 2985 + }, + { + "epoch": 1.4267367540768174, + "grad_norm": 0.5088929436420992, + "learning_rate": 1.1633230755799799e-05, + "loss": 0.3337, + "step": 2986 + }, + { + "epoch": 1.4272146227823905, + "grad_norm": 0.5423381244933303, + "learning_rate": 1.162803453730998e-05, + "loss": 0.3151, + "step": 2987 + }, + { + "epoch": 1.4276924914879636, + "grad_norm": 0.49395978930167594, + "learning_rate": 1.1622837867232102e-05, + "loss": 0.3265, + "step": 2988 + }, + { + "epoch": 1.428170360193537, + "grad_norm": 0.4718130842189427, + "learning_rate": 1.1617640747007626e-05, + "loss": 0.3246, + "step": 2989 + }, + { + "epoch": 1.42864822889911, + "grad_norm": 0.4980105057115814, + "learning_rate": 1.1612443178078138e-05, + "loss": 0.3472, + "step": 2990 + }, + { + "epoch": 1.4291260976046831, + "grad_norm": 0.5938682319676443, + "learning_rate": 1.1607245161885358e-05, + "loss": 0.3105, + "step": 2991 + }, + { + "epoch": 1.4296039663102562, + "grad_norm": 0.48289231319482784, + "learning_rate": 1.1602046699871126e-05, + "loss": 0.3224, + "step": 2992 + }, + { + "epoch": 1.4300818350158293, + "grad_norm": 0.5006882224163078, + "learning_rate": 1.1596847793477393e-05, + "loss": 0.333, + "step": 2993 + }, + { + "epoch": 1.4305597037214026, + "grad_norm": 0.5004173467454556, + "learning_rate": 1.1591648444146251e-05, + "loss": 0.3128, + "step": 2994 + }, + { + "epoch": 1.4310375724269757, + "grad_norm": 0.49393923359090247, + "learning_rate": 1.1586448653319908e-05, + "loss": 0.3196, + "step": 2995 + }, + { + "epoch": 1.4315154411325488, + "grad_norm": 0.4696580955679868, + "learning_rate": 1.1581248422440692e-05, + "loss": 0.3249, + "step": 2996 + }, + { + "epoch": 1.431993309838122, + "grad_norm": 0.5122849607619827, + "learning_rate": 1.1576047752951056e-05, + "loss": 0.3295, + "step": 2997 + }, + { + "epoch": 1.432471178543695, + "grad_norm": 0.5080265014360067, + "learning_rate": 1.157084664629358e-05, + "loss": 0.318, + "step": 2998 + }, + { + "epoch": 1.4329490472492683, + "grad_norm": 0.5283144934239468, + "learning_rate": 1.1565645103910945e-05, + "loss": 0.3259, + "step": 2999 + }, + { + "epoch": 1.4334269159548414, + "grad_norm": 0.5254577539682926, + "learning_rate": 1.156044312724598e-05, + "loss": 0.3244, + "step": 3000 + }, + { + "epoch": 1.4339047846604145, + "grad_norm": 0.4926424316230558, + "learning_rate": 1.1555240717741618e-05, + "loss": 0.3417, + "step": 3001 + }, + { + "epoch": 1.4343826533659878, + "grad_norm": 0.4796896772668644, + "learning_rate": 1.1550037876840913e-05, + "loss": 0.3166, + "step": 3002 + }, + { + "epoch": 1.434860522071561, + "grad_norm": 0.4773964347570398, + "learning_rate": 1.1544834605987042e-05, + "loss": 0.3325, + "step": 3003 + }, + { + "epoch": 1.435338390777134, + "grad_norm": 0.5159550135561818, + "learning_rate": 1.1539630906623305e-05, + "loss": 0.3169, + "step": 3004 + }, + { + "epoch": 1.435816259482707, + "grad_norm": 0.4972490804066091, + "learning_rate": 1.153442678019311e-05, + "loss": 0.3306, + "step": 3005 + }, + { + "epoch": 1.4362941281882802, + "grad_norm": 0.48572512446743665, + "learning_rate": 1.1529222228139993e-05, + "loss": 0.3196, + "step": 3006 + }, + { + "epoch": 1.4367719968938535, + "grad_norm": 0.4956598068058492, + "learning_rate": 1.1524017251907609e-05, + "loss": 0.3233, + "step": 3007 + }, + { + "epoch": 1.4372498655994266, + "grad_norm": 0.47225254602078437, + "learning_rate": 1.151881185293972e-05, + "loss": 0.322, + "step": 3008 + }, + { + "epoch": 1.4377277343049997, + "grad_norm": 0.4994314156150159, + "learning_rate": 1.1513606032680214e-05, + "loss": 0.33, + "step": 3009 + }, + { + "epoch": 1.4382056030105728, + "grad_norm": 0.4932853873998178, + "learning_rate": 1.1508399792573095e-05, + "loss": 0.3316, + "step": 3010 + }, + { + "epoch": 1.4386834717161459, + "grad_norm": 0.4692505918945855, + "learning_rate": 1.150319313406248e-05, + "loss": 0.3068, + "step": 3011 + }, + { + "epoch": 1.4391613404217192, + "grad_norm": 0.5056071919354226, + "learning_rate": 1.1497986058592607e-05, + "loss": 0.3485, + "step": 3012 + }, + { + "epoch": 1.4396392091272923, + "grad_norm": 0.5537829323649248, + "learning_rate": 1.1492778567607826e-05, + "loss": 0.2966, + "step": 3013 + }, + { + "epoch": 1.4401170778328654, + "grad_norm": 0.49754562492925625, + "learning_rate": 1.1487570662552601e-05, + "loss": 0.3409, + "step": 3014 + }, + { + "epoch": 1.4405949465384387, + "grad_norm": 0.5261937593332422, + "learning_rate": 1.1482362344871514e-05, + "loss": 0.3353, + "step": 3015 + }, + { + "epoch": 1.4410728152440118, + "grad_norm": 0.44720642661167864, + "learning_rate": 1.1477153616009262e-05, + "loss": 0.3287, + "step": 3016 + }, + { + "epoch": 1.4415506839495849, + "grad_norm": 0.46950785439169124, + "learning_rate": 1.1471944477410652e-05, + "loss": 0.3198, + "step": 3017 + }, + { + "epoch": 1.442028552655158, + "grad_norm": 0.5086685855274399, + "learning_rate": 1.1466734930520609e-05, + "loss": 0.309, + "step": 3018 + }, + { + "epoch": 1.442506421360731, + "grad_norm": 0.47131182541150224, + "learning_rate": 1.1461524976784172e-05, + "loss": 0.3426, + "step": 3019 + }, + { + "epoch": 1.4429842900663044, + "grad_norm": 0.4858076737245921, + "learning_rate": 1.1456314617646482e-05, + "loss": 0.3335, + "step": 3020 + }, + { + "epoch": 1.4434621587718774, + "grad_norm": 0.47177866647330885, + "learning_rate": 1.145110385455281e-05, + "loss": 0.3117, + "step": 3021 + }, + { + "epoch": 1.4439400274774505, + "grad_norm": 0.49146463401072915, + "learning_rate": 1.1445892688948525e-05, + "loss": 0.3288, + "step": 3022 + }, + { + "epoch": 1.4444178961830239, + "grad_norm": 0.5021205966554858, + "learning_rate": 1.1440681122279113e-05, + "loss": 0.3244, + "step": 3023 + }, + { + "epoch": 1.4448957648885967, + "grad_norm": 0.4819803103747588, + "learning_rate": 1.1435469155990171e-05, + "loss": 0.3103, + "step": 3024 + }, + { + "epoch": 1.44537363359417, + "grad_norm": 1.383385618058039, + "learning_rate": 1.1430256791527406e-05, + "loss": 0.3312, + "step": 3025 + }, + { + "epoch": 1.4458515022997431, + "grad_norm": 0.5170297557146272, + "learning_rate": 1.1425044030336636e-05, + "loss": 0.3127, + "step": 3026 + }, + { + "epoch": 1.4463293710053162, + "grad_norm": 0.4832092172218538, + "learning_rate": 1.1419830873863792e-05, + "loss": 0.3267, + "step": 3027 + }, + { + "epoch": 1.4468072397108895, + "grad_norm": 0.5132105382295371, + "learning_rate": 1.1414617323554906e-05, + "loss": 0.3224, + "step": 3028 + }, + { + "epoch": 1.4472851084164626, + "grad_norm": 0.5023592255793703, + "learning_rate": 1.1409403380856128e-05, + "loss": 0.3197, + "step": 3029 + }, + { + "epoch": 1.4477629771220357, + "grad_norm": 0.49076288043984206, + "learning_rate": 1.1404189047213716e-05, + "loss": 0.3323, + "step": 3030 + }, + { + "epoch": 1.4482408458276088, + "grad_norm": 0.49562922226214057, + "learning_rate": 1.139897432407403e-05, + "loss": 0.333, + "step": 3031 + }, + { + "epoch": 1.448718714533182, + "grad_norm": 0.4849984150072255, + "learning_rate": 1.1393759212883544e-05, + "loss": 0.3169, + "step": 3032 + }, + { + "epoch": 1.4491965832387552, + "grad_norm": 0.4706486064594749, + "learning_rate": 1.1388543715088838e-05, + "loss": 0.2996, + "step": 3033 + }, + { + "epoch": 1.4496744519443283, + "grad_norm": 0.5374385570329433, + "learning_rate": 1.13833278321366e-05, + "loss": 0.3099, + "step": 3034 + }, + { + "epoch": 1.4501523206499014, + "grad_norm": 0.5207013029763824, + "learning_rate": 1.137811156547362e-05, + "loss": 0.3135, + "step": 3035 + }, + { + "epoch": 1.4506301893554747, + "grad_norm": 0.5204531377813438, + "learning_rate": 1.1372894916546804e-05, + "loss": 0.3254, + "step": 3036 + }, + { + "epoch": 1.4511080580610476, + "grad_norm": 0.46838194787481924, + "learning_rate": 1.1367677886803152e-05, + "loss": 0.3311, + "step": 3037 + }, + { + "epoch": 1.451585926766621, + "grad_norm": 0.5507044912148897, + "learning_rate": 1.1362460477689784e-05, + "loss": 0.3367, + "step": 3038 + }, + { + "epoch": 1.452063795472194, + "grad_norm": 0.5069990443386853, + "learning_rate": 1.1357242690653911e-05, + "loss": 0.342, + "step": 3039 + }, + { + "epoch": 1.452541664177767, + "grad_norm": 0.48662576527328144, + "learning_rate": 1.1352024527142855e-05, + "loss": 0.3279, + "step": 3040 + }, + { + "epoch": 1.4530195328833404, + "grad_norm": 0.5009454804753579, + "learning_rate": 1.1346805988604048e-05, + "loss": 0.3205, + "step": 3041 + }, + { + "epoch": 1.4534974015889135, + "grad_norm": 0.5033573059665943, + "learning_rate": 1.1341587076485015e-05, + "loss": 0.3258, + "step": 3042 + }, + { + "epoch": 1.4539752702944866, + "grad_norm": 0.48696663644276184, + "learning_rate": 1.1336367792233394e-05, + "loss": 0.306, + "step": 3043 + }, + { + "epoch": 1.4544531390000597, + "grad_norm": 0.4703928054737579, + "learning_rate": 1.133114813729692e-05, + "loss": 0.3342, + "step": 3044 + }, + { + "epoch": 1.4549310077056328, + "grad_norm": 0.5033596475668163, + "learning_rate": 1.1325928113123431e-05, + "loss": 0.3275, + "step": 3045 + }, + { + "epoch": 1.455408876411206, + "grad_norm": 0.4994474514417599, + "learning_rate": 1.1320707721160876e-05, + "loss": 0.3334, + "step": 3046 + }, + { + "epoch": 1.4558867451167792, + "grad_norm": 0.5430329386590091, + "learning_rate": 1.1315486962857293e-05, + "loss": 0.3116, + "step": 3047 + }, + { + "epoch": 1.4563646138223523, + "grad_norm": 0.4838126377080352, + "learning_rate": 1.1310265839660835e-05, + "loss": 0.3131, + "step": 3048 + }, + { + "epoch": 1.4568424825279256, + "grad_norm": 0.6439581398779899, + "learning_rate": 1.130504435301974e-05, + "loss": 0.3308, + "step": 3049 + }, + { + "epoch": 1.4573203512334987, + "grad_norm": 0.4840420029195805, + "learning_rate": 1.129982250438237e-05, + "loss": 0.3257, + "step": 3050 + }, + { + "epoch": 1.4577982199390718, + "grad_norm": 0.47837719127146977, + "learning_rate": 1.129460029519716e-05, + "loss": 0.3358, + "step": 3051 + }, + { + "epoch": 1.4582760886446449, + "grad_norm": 0.4893765918667975, + "learning_rate": 1.1289377726912665e-05, + "loss": 0.3352, + "step": 3052 + }, + { + "epoch": 1.458753957350218, + "grad_norm": 0.4923586108450949, + "learning_rate": 1.1284154800977533e-05, + "loss": 0.3281, + "step": 3053 + }, + { + "epoch": 1.4592318260557913, + "grad_norm": 0.4621191750554461, + "learning_rate": 1.127893151884051e-05, + "loss": 0.3401, + "step": 3054 + }, + { + "epoch": 1.4597096947613644, + "grad_norm": 0.492839774425873, + "learning_rate": 1.1273707881950445e-05, + "loss": 0.3327, + "step": 3055 + }, + { + "epoch": 1.4601875634669375, + "grad_norm": 0.4643479712090378, + "learning_rate": 1.1268483891756283e-05, + "loss": 0.3292, + "step": 3056 + }, + { + "epoch": 1.4606654321725105, + "grad_norm": 0.4671866712639955, + "learning_rate": 1.1263259549707063e-05, + "loss": 0.3255, + "step": 3057 + }, + { + "epoch": 1.4611433008780836, + "grad_norm": 0.493972595273388, + "learning_rate": 1.125803485725193e-05, + "loss": 0.3326, + "step": 3058 + }, + { + "epoch": 1.461621169583657, + "grad_norm": 0.4660787343724186, + "learning_rate": 1.1252809815840118e-05, + "loss": 0.3128, + "step": 3059 + }, + { + "epoch": 1.46209903828923, + "grad_norm": 0.4975144467520985, + "learning_rate": 1.1247584426920962e-05, + "loss": 0.3134, + "step": 3060 + }, + { + "epoch": 1.4625769069948031, + "grad_norm": 0.45843098521936504, + "learning_rate": 1.124235869194389e-05, + "loss": 0.3154, + "step": 3061 + }, + { + "epoch": 1.4630547757003765, + "grad_norm": 0.4818123666317099, + "learning_rate": 1.1237132612358436e-05, + "loss": 0.3275, + "step": 3062 + }, + { + "epoch": 1.4635326444059495, + "grad_norm": 0.5292710710183121, + "learning_rate": 1.1231906189614217e-05, + "loss": 0.3257, + "step": 3063 + }, + { + "epoch": 1.4640105131115226, + "grad_norm": 0.48700887730356873, + "learning_rate": 1.1226679425160949e-05, + "loss": 0.3329, + "step": 3064 + }, + { + "epoch": 1.4644883818170957, + "grad_norm": 0.48240701019079835, + "learning_rate": 1.1221452320448449e-05, + "loss": 0.3425, + "step": 3065 + }, + { + "epoch": 1.4649662505226688, + "grad_norm": 0.48147101653808244, + "learning_rate": 1.1216224876926622e-05, + "loss": 0.318, + "step": 3066 + }, + { + "epoch": 1.4654441192282421, + "grad_norm": 0.467596203651121, + "learning_rate": 1.1210997096045466e-05, + "loss": 0.3293, + "step": 3067 + }, + { + "epoch": 1.4659219879338152, + "grad_norm": 0.4550529019067923, + "learning_rate": 1.1205768979255078e-05, + "loss": 0.3158, + "step": 3068 + }, + { + "epoch": 1.4663998566393883, + "grad_norm": 0.48042440869684455, + "learning_rate": 1.1200540528005645e-05, + "loss": 0.3298, + "step": 3069 + }, + { + "epoch": 1.4668777253449614, + "grad_norm": 0.5277816140348442, + "learning_rate": 1.1195311743747445e-05, + "loss": 0.3363, + "step": 3070 + }, + { + "epoch": 1.4673555940505345, + "grad_norm": 0.4888123296530078, + "learning_rate": 1.1190082627930854e-05, + "loss": 0.3226, + "step": 3071 + }, + { + "epoch": 1.4678334627561078, + "grad_norm": 0.5015585568421738, + "learning_rate": 1.1184853182006332e-05, + "loss": 0.3172, + "step": 3072 + }, + { + "epoch": 1.468311331461681, + "grad_norm": 0.47745495211196265, + "learning_rate": 1.1179623407424442e-05, + "loss": 0.3181, + "step": 3073 + }, + { + "epoch": 1.468789200167254, + "grad_norm": 0.5013475163036912, + "learning_rate": 1.1174393305635825e-05, + "loss": 0.3083, + "step": 3074 + }, + { + "epoch": 1.4692670688728273, + "grad_norm": 0.521653074954796, + "learning_rate": 1.116916287809122e-05, + "loss": 0.3178, + "step": 3075 + }, + { + "epoch": 1.4697449375784004, + "grad_norm": 0.4953449613595978, + "learning_rate": 1.116393212624146e-05, + "loss": 0.3375, + "step": 3076 + }, + { + "epoch": 1.4702228062839735, + "grad_norm": 0.4704912345390417, + "learning_rate": 1.1158701051537455e-05, + "loss": 0.3199, + "step": 3077 + }, + { + "epoch": 1.4707006749895466, + "grad_norm": 0.5066215917086822, + "learning_rate": 1.1153469655430218e-05, + "loss": 0.3028, + "step": 3078 + }, + { + "epoch": 1.4711785436951197, + "grad_norm": 0.49299038538236867, + "learning_rate": 1.1148237939370847e-05, + "loss": 0.3013, + "step": 3079 + }, + { + "epoch": 1.471656412400693, + "grad_norm": 0.503131186431184, + "learning_rate": 1.1143005904810527e-05, + "loss": 0.3228, + "step": 3080 + }, + { + "epoch": 1.472134281106266, + "grad_norm": 0.48541708299393943, + "learning_rate": 1.1137773553200528e-05, + "loss": 0.3247, + "step": 3081 + }, + { + "epoch": 1.4726121498118392, + "grad_norm": 0.4667618110206343, + "learning_rate": 1.1132540885992221e-05, + "loss": 0.3302, + "step": 3082 + }, + { + "epoch": 1.4730900185174123, + "grad_norm": 0.4966223048730619, + "learning_rate": 1.1127307904637044e-05, + "loss": 0.3098, + "step": 3083 + }, + { + "epoch": 1.4735678872229854, + "grad_norm": 0.49913335938844805, + "learning_rate": 1.1122074610586541e-05, + "loss": 0.3338, + "step": 3084 + }, + { + "epoch": 1.4740457559285587, + "grad_norm": 0.5787027610611666, + "learning_rate": 1.1116841005292339e-05, + "loss": 0.3233, + "step": 3085 + }, + { + "epoch": 1.4745236246341318, + "grad_norm": 0.4988518285658534, + "learning_rate": 1.1111607090206135e-05, + "loss": 0.3451, + "step": 3086 + }, + { + "epoch": 1.4750014933397049, + "grad_norm": 0.48890728691122937, + "learning_rate": 1.1106372866779738e-05, + "loss": 0.3226, + "step": 3087 + }, + { + "epoch": 1.4754793620452782, + "grad_norm": 0.4743490756030402, + "learning_rate": 1.110113833646502e-05, + "loss": 0.3174, + "step": 3088 + }, + { + "epoch": 1.4759572307508513, + "grad_norm": 0.496094343763048, + "learning_rate": 1.1095903500713953e-05, + "loss": 0.3114, + "step": 3089 + }, + { + "epoch": 1.4764350994564244, + "grad_norm": 0.5359507037011488, + "learning_rate": 1.1090668360978589e-05, + "loss": 0.3144, + "step": 3090 + }, + { + "epoch": 1.4769129681619975, + "grad_norm": 0.5615293353766108, + "learning_rate": 1.1085432918711059e-05, + "loss": 0.3209, + "step": 3091 + }, + { + "epoch": 1.4773908368675706, + "grad_norm": 0.512584855813972, + "learning_rate": 1.1080197175363584e-05, + "loss": 0.3242, + "step": 3092 + }, + { + "epoch": 1.4778687055731439, + "grad_norm": 0.5265949611457437, + "learning_rate": 1.1074961132388466e-05, + "loss": 0.3275, + "step": 3093 + }, + { + "epoch": 1.478346574278717, + "grad_norm": 0.49105786313448796, + "learning_rate": 1.1069724791238092e-05, + "loss": 0.3118, + "step": 3094 + }, + { + "epoch": 1.47882444298429, + "grad_norm": 0.47704097499158915, + "learning_rate": 1.106448815336493e-05, + "loss": 0.3289, + "step": 3095 + }, + { + "epoch": 1.4793023116898631, + "grad_norm": 0.5232471767406344, + "learning_rate": 1.1059251220221534e-05, + "loss": 0.3302, + "step": 3096 + }, + { + "epoch": 1.4797801803954362, + "grad_norm": 0.5228817184494445, + "learning_rate": 1.1054013993260533e-05, + "loss": 0.3201, + "step": 3097 + }, + { + "epoch": 1.4802580491010096, + "grad_norm": 0.4936116345958341, + "learning_rate": 1.1048776473934642e-05, + "loss": 0.32, + "step": 3098 + }, + { + "epoch": 1.4807359178065826, + "grad_norm": 0.4930123354861475, + "learning_rate": 1.1043538663696658e-05, + "loss": 0.319, + "step": 3099 + }, + { + "epoch": 1.4812137865121557, + "grad_norm": 0.5490307592266955, + "learning_rate": 1.1038300563999455e-05, + "loss": 0.3392, + "step": 3100 + }, + { + "epoch": 1.481691655217729, + "grad_norm": 0.5030549664101823, + "learning_rate": 1.1033062176295992e-05, + "loss": 0.3256, + "step": 3101 + }, + { + "epoch": 1.4821695239233021, + "grad_norm": 0.4693206453740588, + "learning_rate": 1.1027823502039307e-05, + "loss": 0.3357, + "step": 3102 + }, + { + "epoch": 1.4826473926288752, + "grad_norm": 0.49980642230471584, + "learning_rate": 1.1022584542682508e-05, + "loss": 0.3224, + "step": 3103 + }, + { + "epoch": 1.4831252613344483, + "grad_norm": 0.4872985953951889, + "learning_rate": 1.1017345299678797e-05, + "loss": 0.32, + "step": 3104 + }, + { + "epoch": 1.4836031300400214, + "grad_norm": 0.5155764862652535, + "learning_rate": 1.1012105774481446e-05, + "loss": 0.3255, + "step": 3105 + }, + { + "epoch": 1.4840809987455947, + "grad_norm": 0.49092199999070874, + "learning_rate": 1.1006865968543805e-05, + "loss": 0.3022, + "step": 3106 + }, + { + "epoch": 1.4845588674511678, + "grad_norm": 0.4638968403292164, + "learning_rate": 1.1001625883319307e-05, + "loss": 0.3284, + "step": 3107 + }, + { + "epoch": 1.485036736156741, + "grad_norm": 0.47749399048412333, + "learning_rate": 1.0996385520261457e-05, + "loss": 0.3171, + "step": 3108 + }, + { + "epoch": 1.485514604862314, + "grad_norm": 0.571045799710566, + "learning_rate": 1.0991144880823838e-05, + "loss": 0.3236, + "step": 3109 + }, + { + "epoch": 1.485992473567887, + "grad_norm": 0.49745529315284953, + "learning_rate": 1.0985903966460115e-05, + "loss": 0.3439, + "step": 3110 + }, + { + "epoch": 1.4864703422734604, + "grad_norm": 0.5308655028324103, + "learning_rate": 1.0980662778624023e-05, + "loss": 0.329, + "step": 3111 + }, + { + "epoch": 1.4869482109790335, + "grad_norm": 0.5029668660735324, + "learning_rate": 1.0975421318769373e-05, + "loss": 0.332, + "step": 3112 + }, + { + "epoch": 1.4874260796846066, + "grad_norm": 0.5166152056859417, + "learning_rate": 1.0970179588350054e-05, + "loss": 0.3311, + "step": 3113 + }, + { + "epoch": 1.48790394839018, + "grad_norm": 0.5380965879327918, + "learning_rate": 1.0964937588820036e-05, + "loss": 0.3368, + "step": 3114 + }, + { + "epoch": 1.488381817095753, + "grad_norm": 0.4866414171234857, + "learning_rate": 1.0959695321633346e-05, + "loss": 0.3182, + "step": 3115 + }, + { + "epoch": 1.488859685801326, + "grad_norm": 0.5061118269564608, + "learning_rate": 1.0954452788244106e-05, + "loss": 0.3256, + "step": 3116 + }, + { + "epoch": 1.4893375545068992, + "grad_norm": 0.4940241531258727, + "learning_rate": 1.0949209990106497e-05, + "loss": 0.3083, + "step": 3117 + }, + { + "epoch": 1.4898154232124723, + "grad_norm": 0.4543349967578536, + "learning_rate": 1.0943966928674783e-05, + "loss": 0.3339, + "step": 3118 + }, + { + "epoch": 1.4902932919180456, + "grad_norm": 0.5120304669937483, + "learning_rate": 1.093872360540329e-05, + "loss": 0.3219, + "step": 3119 + }, + { + "epoch": 1.4907711606236187, + "grad_norm": 0.4662796909068133, + "learning_rate": 1.0933480021746432e-05, + "loss": 0.327, + "step": 3120 + }, + { + "epoch": 1.4912490293291918, + "grad_norm": 0.4846749119742991, + "learning_rate": 1.0928236179158678e-05, + "loss": 0.3087, + "step": 3121 + }, + { + "epoch": 1.4917268980347649, + "grad_norm": 0.48452861211325327, + "learning_rate": 1.0922992079094588e-05, + "loss": 0.3141, + "step": 3122 + }, + { + "epoch": 1.492204766740338, + "grad_norm": 0.499123257716151, + "learning_rate": 1.0917747723008771e-05, + "loss": 0.3193, + "step": 3123 + }, + { + "epoch": 1.4926826354459113, + "grad_norm": 0.46393153800632286, + "learning_rate": 1.0912503112355926e-05, + "loss": 0.3166, + "step": 3124 + }, + { + "epoch": 1.4931605041514844, + "grad_norm": 0.5298869777156393, + "learning_rate": 1.0907258248590816e-05, + "loss": 0.3258, + "step": 3125 + }, + { + "epoch": 1.4936383728570575, + "grad_norm": 0.47615348856877815, + "learning_rate": 1.0902013133168267e-05, + "loss": 0.296, + "step": 3126 + }, + { + "epoch": 1.4941162415626308, + "grad_norm": 0.4861605658720827, + "learning_rate": 1.089676776754319e-05, + "loss": 0.3092, + "step": 3127 + }, + { + "epoch": 1.4945941102682039, + "grad_norm": 0.5061227916241133, + "learning_rate": 1.0891522153170553e-05, + "loss": 0.3258, + "step": 3128 + }, + { + "epoch": 1.495071978973777, + "grad_norm": 0.44470015995888657, + "learning_rate": 1.0886276291505395e-05, + "loss": 0.3264, + "step": 3129 + }, + { + "epoch": 1.49554984767935, + "grad_norm": 0.4956355952980061, + "learning_rate": 1.0881030184002827e-05, + "loss": 0.3124, + "step": 3130 + }, + { + "epoch": 1.4960277163849232, + "grad_norm": 0.47672673270267124, + "learning_rate": 1.0875783832118032e-05, + "loss": 0.3075, + "step": 3131 + }, + { + "epoch": 1.4965055850904965, + "grad_norm": 0.48537882960310214, + "learning_rate": 1.0870537237306245e-05, + "loss": 0.3264, + "step": 3132 + }, + { + "epoch": 1.4969834537960696, + "grad_norm": 0.5110345138782598, + "learning_rate": 1.0865290401022785e-05, + "loss": 0.325, + "step": 3133 + }, + { + "epoch": 1.4974613225016427, + "grad_norm": 0.4626387908464134, + "learning_rate": 1.0860043324723035e-05, + "loss": 0.315, + "step": 3134 + }, + { + "epoch": 1.4979391912072157, + "grad_norm": 0.4425903321792646, + "learning_rate": 1.0854796009862434e-05, + "loss": 0.3349, + "step": 3135 + }, + { + "epoch": 1.4984170599127888, + "grad_norm": 0.5005573549699335, + "learning_rate": 1.0849548457896499e-05, + "loss": 0.3229, + "step": 3136 + }, + { + "epoch": 1.4988949286183622, + "grad_norm": 0.4629721762379501, + "learning_rate": 1.0844300670280809e-05, + "loss": 0.3098, + "step": 3137 + }, + { + "epoch": 1.4993727973239352, + "grad_norm": 0.45157919259053075, + "learning_rate": 1.0839052648471002e-05, + "loss": 0.3338, + "step": 3138 + }, + { + "epoch": 1.4998506660295083, + "grad_norm": 0.49218452155784925, + "learning_rate": 1.0833804393922796e-05, + "loss": 0.3279, + "step": 3139 + }, + { + "epoch": 1.5003285347350817, + "grad_norm": 0.46697736877733986, + "learning_rate": 1.0828555908091958e-05, + "loss": 0.3209, + "step": 3140 + }, + { + "epoch": 1.5008064034406545, + "grad_norm": 0.502846029719294, + "learning_rate": 1.0823307192434325e-05, + "loss": 0.3211, + "step": 3141 + }, + { + "epoch": 1.5012842721462278, + "grad_norm": 0.5308861043661305, + "learning_rate": 1.08180582484058e-05, + "loss": 0.317, + "step": 3142 + }, + { + "epoch": 1.501762140851801, + "grad_norm": 0.9950929077061546, + "learning_rate": 1.0812809077462348e-05, + "loss": 0.3308, + "step": 3143 + }, + { + "epoch": 1.502240009557374, + "grad_norm": 0.4748834877132264, + "learning_rate": 1.0807559681059993e-05, + "loss": 0.3225, + "step": 3144 + }, + { + "epoch": 1.5027178782629473, + "grad_norm": 0.48714653402335817, + "learning_rate": 1.0802310060654832e-05, + "loss": 0.3294, + "step": 3145 + }, + { + "epoch": 1.5031957469685204, + "grad_norm": 0.48089986557496106, + "learning_rate": 1.079706021770301e-05, + "loss": 0.3175, + "step": 3146 + }, + { + "epoch": 1.5036736156740935, + "grad_norm": 0.5517783451396178, + "learning_rate": 1.0791810153660745e-05, + "loss": 0.3294, + "step": 3147 + }, + { + "epoch": 1.5041514843796668, + "grad_norm": 0.5650570568015437, + "learning_rate": 1.078655986998431e-05, + "loss": 0.3312, + "step": 3148 + }, + { + "epoch": 1.5046293530852397, + "grad_norm": 0.45637472471019414, + "learning_rate": 1.0781309368130042e-05, + "loss": 0.3262, + "step": 3149 + }, + { + "epoch": 1.505107221790813, + "grad_norm": 0.4653008253186181, + "learning_rate": 1.0776058649554336e-05, + "loss": 0.3049, + "step": 3150 + }, + { + "epoch": 1.5055850904963861, + "grad_norm": 0.545056353096684, + "learning_rate": 1.0770807715713651e-05, + "loss": 0.3362, + "step": 3151 + }, + { + "epoch": 1.5060629592019592, + "grad_norm": 0.4902414971923998, + "learning_rate": 1.0765556568064503e-05, + "loss": 0.3204, + "step": 3152 + }, + { + "epoch": 1.5065408279075325, + "grad_norm": 0.4608071782816651, + "learning_rate": 1.0760305208063467e-05, + "loss": 0.3208, + "step": 3153 + }, + { + "epoch": 1.5070186966131054, + "grad_norm": 0.4808143098839568, + "learning_rate": 1.0755053637167178e-05, + "loss": 0.3328, + "step": 3154 + }, + { + "epoch": 1.5074965653186787, + "grad_norm": 0.48834994164881984, + "learning_rate": 1.0749801856832325e-05, + "loss": 0.3175, + "step": 3155 + }, + { + "epoch": 1.5079744340242518, + "grad_norm": 0.49638350606323217, + "learning_rate": 1.0744549868515667e-05, + "loss": 0.3324, + "step": 3156 + }, + { + "epoch": 1.508452302729825, + "grad_norm": 0.49222615340506876, + "learning_rate": 1.073929767367401e-05, + "loss": 0.322, + "step": 3157 + }, + { + "epoch": 1.5089301714353982, + "grad_norm": 0.4564826854769966, + "learning_rate": 1.0734045273764217e-05, + "loss": 0.3301, + "step": 3158 + }, + { + "epoch": 1.5094080401409713, + "grad_norm": 0.4941738943168986, + "learning_rate": 1.0728792670243215e-05, + "loss": 0.3282, + "step": 3159 + }, + { + "epoch": 1.5098859088465444, + "grad_norm": 0.4565157299448364, + "learning_rate": 1.0723539864567983e-05, + "loss": 0.3139, + "step": 3160 + }, + { + "epoch": 1.5103637775521177, + "grad_norm": 0.46731577882122116, + "learning_rate": 1.0718286858195553e-05, + "loss": 0.3094, + "step": 3161 + }, + { + "epoch": 1.5108416462576906, + "grad_norm": 0.5380454191031079, + "learning_rate": 1.071303365258302e-05, + "loss": 0.3274, + "step": 3162 + }, + { + "epoch": 1.5113195149632639, + "grad_norm": 0.4504131152594778, + "learning_rate": 1.070778024918753e-05, + "loss": 0.3404, + "step": 3163 + }, + { + "epoch": 1.511797383668837, + "grad_norm": 0.47414368841241944, + "learning_rate": 1.0702526649466282e-05, + "loss": 0.3326, + "step": 3164 + }, + { + "epoch": 1.51227525237441, + "grad_norm": 0.48484243615521744, + "learning_rate": 1.0697272854876537e-05, + "loss": 0.3149, + "step": 3165 + }, + { + "epoch": 1.5127531210799834, + "grad_norm": 0.4926823715355633, + "learning_rate": 1.0692018866875598e-05, + "loss": 0.3274, + "step": 3166 + }, + { + "epoch": 1.5132309897855563, + "grad_norm": 0.47882967616119354, + "learning_rate": 1.0686764686920834e-05, + "loss": 0.3292, + "step": 3167 + }, + { + "epoch": 1.5137088584911296, + "grad_norm": 0.4626903520160237, + "learning_rate": 1.0681510316469661e-05, + "loss": 0.3185, + "step": 3168 + }, + { + "epoch": 1.5141867271967027, + "grad_norm": 0.560531953606567, + "learning_rate": 1.0676255756979548e-05, + "loss": 0.3301, + "step": 3169 + }, + { + "epoch": 1.5146645959022758, + "grad_norm": 0.48792270052341974, + "learning_rate": 1.0671001009908015e-05, + "loss": 0.3126, + "step": 3170 + }, + { + "epoch": 1.515142464607849, + "grad_norm": 0.5129007440119009, + "learning_rate": 1.066574607671264e-05, + "loss": 0.3181, + "step": 3171 + }, + { + "epoch": 1.5156203333134222, + "grad_norm": 0.4711038320320487, + "learning_rate": 1.0660490958851044e-05, + "loss": 0.3178, + "step": 3172 + }, + { + "epoch": 1.5160982020189953, + "grad_norm": 0.5092552024671185, + "learning_rate": 1.0655235657780906e-05, + "loss": 0.3289, + "step": 3173 + }, + { + "epoch": 1.5165760707245686, + "grad_norm": 0.5953972215800041, + "learning_rate": 1.0649980174959961e-05, + "loss": 0.3102, + "step": 3174 + }, + { + "epoch": 1.5170539394301414, + "grad_norm": 0.4703123203138388, + "learning_rate": 1.0644724511845976e-05, + "loss": 0.3143, + "step": 3175 + }, + { + "epoch": 1.5175318081357148, + "grad_norm": 0.5165416055915609, + "learning_rate": 1.0639468669896787e-05, + "loss": 0.3115, + "step": 3176 + }, + { + "epoch": 1.5180096768412878, + "grad_norm": 0.582450596746319, + "learning_rate": 1.0634212650570269e-05, + "loss": 0.3316, + "step": 3177 + }, + { + "epoch": 1.518487545546861, + "grad_norm": 0.4535700489101072, + "learning_rate": 1.0628956455324347e-05, + "loss": 0.3307, + "step": 3178 + }, + { + "epoch": 1.5189654142524343, + "grad_norm": 0.48156284886715084, + "learning_rate": 1.0623700085616999e-05, + "loss": 0.327, + "step": 3179 + }, + { + "epoch": 1.5194432829580071, + "grad_norm": 0.4838090723343779, + "learning_rate": 1.0618443542906251e-05, + "loss": 0.3302, + "step": 3180 + }, + { + "epoch": 1.5199211516635804, + "grad_norm": 0.5079130746721264, + "learning_rate": 1.0613186828650171e-05, + "loss": 0.3202, + "step": 3181 + }, + { + "epoch": 1.5203990203691535, + "grad_norm": 0.4986317378339555, + "learning_rate": 1.0607929944306883e-05, + "loss": 0.308, + "step": 3182 + }, + { + "epoch": 1.5208768890747266, + "grad_norm": 0.5275576649032065, + "learning_rate": 1.0602672891334552e-05, + "loss": 0.3332, + "step": 3183 + }, + { + "epoch": 1.5213547577803, + "grad_norm": 0.4731647767150495, + "learning_rate": 1.0597415671191391e-05, + "loss": 0.3509, + "step": 3184 + }, + { + "epoch": 1.521832626485873, + "grad_norm": 0.4763797455280031, + "learning_rate": 1.059215828533566e-05, + "loss": 0.3286, + "step": 3185 + }, + { + "epoch": 1.5223104951914461, + "grad_norm": 0.47355536256007635, + "learning_rate": 1.0586900735225669e-05, + "loss": 0.3239, + "step": 3186 + }, + { + "epoch": 1.5227883638970194, + "grad_norm": 0.4674306668056484, + "learning_rate": 1.0581643022319765e-05, + "loss": 0.3065, + "step": 3187 + }, + { + "epoch": 1.5232662326025923, + "grad_norm": 0.5012159446084873, + "learning_rate": 1.0576385148076346e-05, + "loss": 0.3288, + "step": 3188 + }, + { + "epoch": 1.5237441013081656, + "grad_norm": 0.4926801554454291, + "learning_rate": 1.0571127113953855e-05, + "loss": 0.321, + "step": 3189 + }, + { + "epoch": 1.5242219700137387, + "grad_norm": 0.5352440355066403, + "learning_rate": 1.0565868921410776e-05, + "loss": 0.3326, + "step": 3190 + }, + { + "epoch": 1.5246998387193118, + "grad_norm": 0.5707201595977832, + "learning_rate": 1.0560610571905642e-05, + "loss": 0.3247, + "step": 3191 + }, + { + "epoch": 1.5251777074248851, + "grad_norm": 0.46143438621243094, + "learning_rate": 1.0555352066897025e-05, + "loss": 0.335, + "step": 3192 + }, + { + "epoch": 1.525655576130458, + "grad_norm": 0.46914561506550345, + "learning_rate": 1.0550093407843538e-05, + "loss": 0.3035, + "step": 3193 + }, + { + "epoch": 1.5261334448360313, + "grad_norm": 0.4745615418585816, + "learning_rate": 1.0544834596203846e-05, + "loss": 0.3192, + "step": 3194 + }, + { + "epoch": 1.5266113135416044, + "grad_norm": 0.48717953885939197, + "learning_rate": 1.0539575633436645e-05, + "loss": 0.3333, + "step": 3195 + }, + { + "epoch": 1.5270891822471775, + "grad_norm": 0.4735843171248546, + "learning_rate": 1.0534316521000683e-05, + "loss": 0.3256, + "step": 3196 + }, + { + "epoch": 1.5275670509527508, + "grad_norm": 0.4796146078480026, + "learning_rate": 1.0529057260354744e-05, + "loss": 0.311, + "step": 3197 + }, + { + "epoch": 1.528044919658324, + "grad_norm": 0.4964693345022835, + "learning_rate": 1.052379785295765e-05, + "loss": 0.3328, + "step": 3198 + }, + { + "epoch": 1.528522788363897, + "grad_norm": 0.4551957925753305, + "learning_rate": 1.0518538300268275e-05, + "loss": 0.3267, + "step": 3199 + }, + { + "epoch": 1.5290006570694703, + "grad_norm": 0.4833164973926133, + "learning_rate": 1.0513278603745523e-05, + "loss": 0.3346, + "step": 3200 + }, + { + "epoch": 1.5294785257750432, + "grad_norm": 0.4976783327391536, + "learning_rate": 1.0508018764848336e-05, + "loss": 0.3214, + "step": 3201 + }, + { + "epoch": 1.5299563944806165, + "grad_norm": 0.5163458481272856, + "learning_rate": 1.0502758785035708e-05, + "loss": 0.3276, + "step": 3202 + }, + { + "epoch": 1.5304342631861896, + "grad_norm": 0.5188646785189226, + "learning_rate": 1.0497498665766662e-05, + "loss": 0.3228, + "step": 3203 + }, + { + "epoch": 1.5309121318917627, + "grad_norm": 0.4865309249115541, + "learning_rate": 1.049223840850026e-05, + "loss": 0.3232, + "step": 3204 + }, + { + "epoch": 1.531390000597336, + "grad_norm": 0.508377716195505, + "learning_rate": 1.0486978014695606e-05, + "loss": 0.3224, + "step": 3205 + }, + { + "epoch": 1.531867869302909, + "grad_norm": 0.4890171119416868, + "learning_rate": 1.048171748581184e-05, + "loss": 0.321, + "step": 3206 + }, + { + "epoch": 1.5323457380084822, + "grad_norm": 0.49225304466391234, + "learning_rate": 1.0476456823308144e-05, + "loss": 0.295, + "step": 3207 + }, + { + "epoch": 1.5328236067140553, + "grad_norm": 0.533313040699636, + "learning_rate": 1.0471196028643728e-05, + "loss": 0.3171, + "step": 3208 + }, + { + "epoch": 1.5333014754196284, + "grad_norm": 0.5656534777964286, + "learning_rate": 1.0465935103277845e-05, + "loss": 0.324, + "step": 3209 + }, + { + "epoch": 1.5337793441252017, + "grad_norm": 0.49695878757528583, + "learning_rate": 1.0460674048669783e-05, + "loss": 0.3218, + "step": 3210 + }, + { + "epoch": 1.5342572128307748, + "grad_norm": 0.5362838974016912, + "learning_rate": 1.0455412866278868e-05, + "loss": 0.3242, + "step": 3211 + }, + { + "epoch": 1.5347350815363479, + "grad_norm": 0.5144518785758332, + "learning_rate": 1.0450151557564457e-05, + "loss": 0.3334, + "step": 3212 + }, + { + "epoch": 1.5352129502419212, + "grad_norm": 0.5068918594348978, + "learning_rate": 1.0444890123985942e-05, + "loss": 0.3299, + "step": 3213 + }, + { + "epoch": 1.535690818947494, + "grad_norm": 0.4780919059955396, + "learning_rate": 1.043962856700276e-05, + "loss": 0.3199, + "step": 3214 + }, + { + "epoch": 1.5361686876530674, + "grad_norm": 0.5127686326304569, + "learning_rate": 1.0434366888074363e-05, + "loss": 0.3193, + "step": 3215 + }, + { + "epoch": 1.5366465563586404, + "grad_norm": 0.49589298134321574, + "learning_rate": 1.0429105088660253e-05, + "loss": 0.3226, + "step": 3216 + }, + { + "epoch": 1.5371244250642135, + "grad_norm": 0.4860233798195414, + "learning_rate": 1.0423843170219966e-05, + "loss": 0.3365, + "step": 3217 + }, + { + "epoch": 1.5376022937697869, + "grad_norm": 0.4714518886662182, + "learning_rate": 1.0418581134213055e-05, + "loss": 0.3244, + "step": 3218 + }, + { + "epoch": 1.53808016247536, + "grad_norm": 0.4955762103566397, + "learning_rate": 1.0413318982099124e-05, + "loss": 0.3226, + "step": 3219 + }, + { + "epoch": 1.538558031180933, + "grad_norm": 0.4973793360007407, + "learning_rate": 1.0408056715337797e-05, + "loss": 0.3154, + "step": 3220 + }, + { + "epoch": 1.5390358998865061, + "grad_norm": 0.487987592402328, + "learning_rate": 1.0402794335388733e-05, + "loss": 0.3219, + "step": 3221 + }, + { + "epoch": 1.5395137685920792, + "grad_norm": 0.5013231822647967, + "learning_rate": 1.0397531843711626e-05, + "loss": 0.3144, + "step": 3222 + }, + { + "epoch": 1.5399916372976525, + "grad_norm": 0.5122546611335848, + "learning_rate": 1.0392269241766199e-05, + "loss": 0.3161, + "step": 3223 + }, + { + "epoch": 1.5404695060032256, + "grad_norm": 0.5023537156876484, + "learning_rate": 1.0387006531012204e-05, + "loss": 0.3307, + "step": 3224 + }, + { + "epoch": 1.5409473747087987, + "grad_norm": 0.46724605937313596, + "learning_rate": 1.0381743712909424e-05, + "loss": 0.3229, + "step": 3225 + }, + { + "epoch": 1.541425243414372, + "grad_norm": 0.47093375402216126, + "learning_rate": 1.0376480788917676e-05, + "loss": 0.3279, + "step": 3226 + }, + { + "epoch": 1.541903112119945, + "grad_norm": 0.46994586921571396, + "learning_rate": 1.0371217760496792e-05, + "loss": 0.3236, + "step": 3227 + }, + { + "epoch": 1.5423809808255182, + "grad_norm": 0.4843800423622691, + "learning_rate": 1.0365954629106652e-05, + "loss": 0.3085, + "step": 3228 + }, + { + "epoch": 1.5428588495310913, + "grad_norm": 0.4772582302201957, + "learning_rate": 1.0360691396207155e-05, + "loss": 0.3076, + "step": 3229 + }, + { + "epoch": 1.5433367182366644, + "grad_norm": 0.481322355383063, + "learning_rate": 1.0355428063258224e-05, + "loss": 0.3068, + "step": 3230 + }, + { + "epoch": 1.5438145869422377, + "grad_norm": 0.511101779977841, + "learning_rate": 1.0350164631719816e-05, + "loss": 0.3217, + "step": 3231 + }, + { + "epoch": 1.5442924556478108, + "grad_norm": 0.45963843948522115, + "learning_rate": 1.0344901103051923e-05, + "loss": 0.343, + "step": 3232 + }, + { + "epoch": 1.544770324353384, + "grad_norm": 0.48757882063659436, + "learning_rate": 1.033963747871454e-05, + "loss": 0.3232, + "step": 3233 + }, + { + "epoch": 1.545248193058957, + "grad_norm": 0.5055497495839371, + "learning_rate": 1.0334373760167718e-05, + "loss": 0.3234, + "step": 3234 + }, + { + "epoch": 1.54572606176453, + "grad_norm": 0.49927220968524727, + "learning_rate": 1.0329109948871512e-05, + "loss": 0.3197, + "step": 3235 + }, + { + "epoch": 1.5462039304701034, + "grad_norm": 0.48763244928090066, + "learning_rate": 1.032384604628601e-05, + "loss": 0.3222, + "step": 3236 + }, + { + "epoch": 1.5466817991756765, + "grad_norm": 0.464310330481023, + "learning_rate": 1.0318582053871326e-05, + "loss": 0.319, + "step": 3237 + }, + { + "epoch": 1.5471596678812496, + "grad_norm": 0.4970739536764959, + "learning_rate": 1.0313317973087603e-05, + "loss": 0.3121, + "step": 3238 + }, + { + "epoch": 1.547637536586823, + "grad_norm": 0.5697514900420939, + "learning_rate": 1.0308053805394998e-05, + "loss": 0.3153, + "step": 3239 + }, + { + "epoch": 1.5481154052923958, + "grad_norm": 0.48345598443017196, + "learning_rate": 1.0302789552253702e-05, + "loss": 0.3194, + "step": 3240 + }, + { + "epoch": 1.548593273997969, + "grad_norm": 0.4761641613916091, + "learning_rate": 1.0297525215123927e-05, + "loss": 0.3175, + "step": 3241 + }, + { + "epoch": 1.5490711427035422, + "grad_norm": 0.4899651453697106, + "learning_rate": 1.0292260795465905e-05, + "loss": 0.3289, + "step": 3242 + }, + { + "epoch": 1.5495490114091153, + "grad_norm": 0.5535534354531657, + "learning_rate": 1.0286996294739895e-05, + "loss": 0.3321, + "step": 3243 + }, + { + "epoch": 1.5500268801146886, + "grad_norm": 0.49382296783652835, + "learning_rate": 1.0281731714406172e-05, + "loss": 0.3274, + "step": 3244 + }, + { + "epoch": 1.5505047488202617, + "grad_norm": 0.48868429365584, + "learning_rate": 1.0276467055925044e-05, + "loss": 0.315, + "step": 3245 + }, + { + "epoch": 1.5509826175258348, + "grad_norm": 0.4900357280511654, + "learning_rate": 1.027120232075683e-05, + "loss": 0.3273, + "step": 3246 + }, + { + "epoch": 1.551460486231408, + "grad_norm": 0.44278332935205783, + "learning_rate": 1.0265937510361876e-05, + "loss": 0.3363, + "step": 3247 + }, + { + "epoch": 1.551938354936981, + "grad_norm": 0.5025588525393508, + "learning_rate": 1.0260672626200548e-05, + "loss": 0.3467, + "step": 3248 + }, + { + "epoch": 1.5524162236425543, + "grad_norm": 0.48485895773001003, + "learning_rate": 1.0255407669733235e-05, + "loss": 0.3179, + "step": 3249 + }, + { + "epoch": 1.5528940923481274, + "grad_norm": 0.48681474256092666, + "learning_rate": 1.0250142642420335e-05, + "loss": 0.3131, + "step": 3250 + }, + { + "epoch": 1.5533719610537005, + "grad_norm": 0.5072938805616622, + "learning_rate": 1.024487754572228e-05, + "loss": 0.3048, + "step": 3251 + }, + { + "epoch": 1.5538498297592738, + "grad_norm": 0.464960127721567, + "learning_rate": 1.0239612381099515e-05, + "loss": 0.3081, + "step": 3252 + }, + { + "epoch": 1.5543276984648466, + "grad_norm": 0.5029051775399344, + "learning_rate": 1.02343471500125e-05, + "loss": 0.326, + "step": 3253 + }, + { + "epoch": 1.55480556717042, + "grad_norm": 0.4760931121940165, + "learning_rate": 1.0229081853921719e-05, + "loss": 0.3373, + "step": 3254 + }, + { + "epoch": 1.555283435875993, + "grad_norm": 0.5211628472885691, + "learning_rate": 1.0223816494287675e-05, + "loss": 0.3194, + "step": 3255 + }, + { + "epoch": 1.5557613045815661, + "grad_norm": 4.00869479916537, + "learning_rate": 1.021855107257088e-05, + "loss": 0.3147, + "step": 3256 + }, + { + "epoch": 1.5562391732871395, + "grad_norm": 0.5278043616111543, + "learning_rate": 1.0213285590231877e-05, + "loss": 0.2986, + "step": 3257 + }, + { + "epoch": 1.5567170419927125, + "grad_norm": 0.48164426331213295, + "learning_rate": 1.020802004873121e-05, + "loss": 0.3253, + "step": 3258 + }, + { + "epoch": 1.5571949106982856, + "grad_norm": 0.5500585812101135, + "learning_rate": 1.0202754449529453e-05, + "loss": 0.3209, + "step": 3259 + }, + { + "epoch": 1.557672779403859, + "grad_norm": 0.49446254990048405, + "learning_rate": 1.0197488794087188e-05, + "loss": 0.3198, + "step": 3260 + }, + { + "epoch": 1.5581506481094318, + "grad_norm": 0.4657507878684054, + "learning_rate": 1.0192223083865013e-05, + "loss": 0.3256, + "step": 3261 + }, + { + "epoch": 1.5586285168150051, + "grad_norm": 0.4477764129200038, + "learning_rate": 1.0186957320323547e-05, + "loss": 0.3271, + "step": 3262 + }, + { + "epoch": 1.5591063855205782, + "grad_norm": 0.48278945630965814, + "learning_rate": 1.0181691504923421e-05, + "loss": 0.307, + "step": 3263 + }, + { + "epoch": 1.5595842542261513, + "grad_norm": 1.1355856675651534, + "learning_rate": 1.0176425639125273e-05, + "loss": 0.3271, + "step": 3264 + }, + { + "epoch": 1.5600621229317246, + "grad_norm": 0.4819769035656966, + "learning_rate": 1.0171159724389766e-05, + "loss": 0.3178, + "step": 3265 + }, + { + "epoch": 1.5605399916372975, + "grad_norm": 0.4786394343649937, + "learning_rate": 1.016589376217757e-05, + "loss": 0.3382, + "step": 3266 + }, + { + "epoch": 1.5610178603428708, + "grad_norm": 0.48736910260272115, + "learning_rate": 1.016062775394937e-05, + "loss": 0.3196, + "step": 3267 + }, + { + "epoch": 1.561495729048444, + "grad_norm": 0.46448638746562965, + "learning_rate": 1.0155361701165867e-05, + "loss": 0.3148, + "step": 3268 + }, + { + "epoch": 1.561973597754017, + "grad_norm": 0.49431236575933374, + "learning_rate": 1.0150095605287768e-05, + "loss": 0.3093, + "step": 3269 + }, + { + "epoch": 1.5624514664595903, + "grad_norm": 0.4686420840220977, + "learning_rate": 1.0144829467775794e-05, + "loss": 0.3168, + "step": 3270 + }, + { + "epoch": 1.5629293351651634, + "grad_norm": 0.44817236471431465, + "learning_rate": 1.0139563290090679e-05, + "loss": 0.3154, + "step": 3271 + }, + { + "epoch": 1.5634072038707365, + "grad_norm": 0.46921012393643297, + "learning_rate": 1.0134297073693173e-05, + "loss": 0.3257, + "step": 3272 + }, + { + "epoch": 1.5638850725763098, + "grad_norm": 0.46434620917562563, + "learning_rate": 1.0129030820044024e-05, + "loss": 0.3303, + "step": 3273 + }, + { + "epoch": 1.5643629412818827, + "grad_norm": 0.6236012194460561, + "learning_rate": 1.0123764530604003e-05, + "loss": 0.3127, + "step": 3274 + }, + { + "epoch": 1.564840809987456, + "grad_norm": 0.4944268701162515, + "learning_rate": 1.0118498206833886e-05, + "loss": 0.3269, + "step": 3275 + }, + { + "epoch": 1.565318678693029, + "grad_norm": 0.5192897092562184, + "learning_rate": 1.0113231850194455e-05, + "loss": 0.3332, + "step": 3276 + }, + { + "epoch": 1.5657965473986022, + "grad_norm": 0.4841696394389553, + "learning_rate": 1.0107965462146507e-05, + "loss": 0.3369, + "step": 3277 + }, + { + "epoch": 1.5662744161041755, + "grad_norm": 0.5059546085514659, + "learning_rate": 1.0102699044150845e-05, + "loss": 0.3233, + "step": 3278 + }, + { + "epoch": 1.5667522848097484, + "grad_norm": 0.4818467825525027, + "learning_rate": 1.0097432597668279e-05, + "loss": 0.3099, + "step": 3279 + }, + { + "epoch": 1.5672301535153217, + "grad_norm": 0.4889420234246756, + "learning_rate": 1.0092166124159628e-05, + "loss": 0.3182, + "step": 3280 + }, + { + "epoch": 1.5677080222208948, + "grad_norm": 0.4826052928580796, + "learning_rate": 1.0086899625085725e-05, + "loss": 0.3247, + "step": 3281 + }, + { + "epoch": 1.5681858909264679, + "grad_norm": 0.49036739531543294, + "learning_rate": 1.0081633101907393e-05, + "loss": 0.329, + "step": 3282 + }, + { + "epoch": 1.5686637596320412, + "grad_norm": 0.4574906703009941, + "learning_rate": 1.007636655608548e-05, + "loss": 0.3157, + "step": 3283 + }, + { + "epoch": 1.5691416283376143, + "grad_norm": 0.4738908087609072, + "learning_rate": 1.0071099989080833e-05, + "loss": 0.3433, + "step": 3284 + }, + { + "epoch": 1.5696194970431874, + "grad_norm": 0.46547191964568313, + "learning_rate": 1.0065833402354302e-05, + "loss": 0.3208, + "step": 3285 + }, + { + "epoch": 1.5700973657487607, + "grad_norm": 0.5839094375309312, + "learning_rate": 1.0060566797366744e-05, + "loss": 0.3255, + "step": 3286 + }, + { + "epoch": 1.5705752344543336, + "grad_norm": 0.4947521545103725, + "learning_rate": 1.005530017557903e-05, + "loss": 0.3208, + "step": 3287 + }, + { + "epoch": 1.5710531031599069, + "grad_norm": 0.5025064223155531, + "learning_rate": 1.005003353845202e-05, + "loss": 0.3076, + "step": 3288 + }, + { + "epoch": 1.57153097186548, + "grad_norm": 0.4669404424154083, + "learning_rate": 1.0044766887446586e-05, + "loss": 0.3276, + "step": 3289 + }, + { + "epoch": 1.572008840571053, + "grad_norm": 0.5009595930368698, + "learning_rate": 1.003950022402361e-05, + "loss": 0.3106, + "step": 3290 + }, + { + "epoch": 1.5724867092766264, + "grad_norm": 0.4948274057807226, + "learning_rate": 1.0034233549643969e-05, + "loss": 0.3424, + "step": 3291 + }, + { + "epoch": 1.5729645779821992, + "grad_norm": 0.45361579014587694, + "learning_rate": 1.0028966865768546e-05, + "loss": 0.3258, + "step": 3292 + }, + { + "epoch": 1.5734424466877726, + "grad_norm": 0.7899603895576185, + "learning_rate": 1.0023700173858224e-05, + "loss": 0.3189, + "step": 3293 + }, + { + "epoch": 1.5739203153933456, + "grad_norm": 0.45700787119484876, + "learning_rate": 1.0018433475373891e-05, + "loss": 0.3257, + "step": 3294 + }, + { + "epoch": 1.5743981840989187, + "grad_norm": 0.4721568980282006, + "learning_rate": 1.0013166771776441e-05, + "loss": 0.3356, + "step": 3295 + }, + { + "epoch": 1.574876052804492, + "grad_norm": 0.4854731194781878, + "learning_rate": 1.0007900064526756e-05, + "loss": 0.3142, + "step": 3296 + }, + { + "epoch": 1.5753539215100651, + "grad_norm": 0.4880060979457883, + "learning_rate": 1.0002633355085734e-05, + "loss": 0.3223, + "step": 3297 + }, + { + "epoch": 1.5758317902156382, + "grad_norm": 0.4561283926612506, + "learning_rate": 9.997366644914266e-06, + "loss": 0.3206, + "step": 3298 + }, + { + "epoch": 1.5763096589212116, + "grad_norm": 0.47632004228079955, + "learning_rate": 9.992099935473244e-06, + "loss": 0.3128, + "step": 3299 + }, + { + "epoch": 1.5767875276267844, + "grad_norm": 0.4847083170328396, + "learning_rate": 9.986833228223562e-06, + "loss": 0.2984, + "step": 3300 + }, + { + "epoch": 1.5772653963323577, + "grad_norm": 0.473252193657243, + "learning_rate": 9.98156652462611e-06, + "loss": 0.3223, + "step": 3301 + }, + { + "epoch": 1.5777432650379308, + "grad_norm": 0.5012398605644248, + "learning_rate": 9.976299826141776e-06, + "loss": 0.3186, + "step": 3302 + }, + { + "epoch": 1.578221133743504, + "grad_norm": 0.5394851324327755, + "learning_rate": 9.971033134231458e-06, + "loss": 0.3228, + "step": 3303 + }, + { + "epoch": 1.5786990024490772, + "grad_norm": 0.46951070560264985, + "learning_rate": 9.965766450356031e-06, + "loss": 0.3249, + "step": 3304 + }, + { + "epoch": 1.57917687115465, + "grad_norm": 0.47934334873980533, + "learning_rate": 9.96049977597639e-06, + "loss": 0.3197, + "step": 3305 + }, + { + "epoch": 1.5796547398602234, + "grad_norm": 0.514993715660872, + "learning_rate": 9.955233112553416e-06, + "loss": 0.3037, + "step": 3306 + }, + { + "epoch": 1.5801326085657965, + "grad_norm": 0.4812618916635242, + "learning_rate": 9.949966461547984e-06, + "loss": 0.3224, + "step": 3307 + }, + { + "epoch": 1.5806104772713696, + "grad_norm": 0.49269064197863277, + "learning_rate": 9.944699824420973e-06, + "loss": 0.3335, + "step": 3308 + }, + { + "epoch": 1.581088345976943, + "grad_norm": 0.4888765058606419, + "learning_rate": 9.939433202633258e-06, + "loss": 0.3188, + "step": 3309 + }, + { + "epoch": 1.581566214682516, + "grad_norm": 0.47130242909704306, + "learning_rate": 9.934166597645703e-06, + "loss": 0.327, + "step": 3310 + }, + { + "epoch": 1.582044083388089, + "grad_norm": 0.4980212169856602, + "learning_rate": 9.92890001091917e-06, + "loss": 0.3144, + "step": 3311 + }, + { + "epoch": 1.5825219520936624, + "grad_norm": 0.5047705095751753, + "learning_rate": 9.923633443914522e-06, + "loss": 0.3098, + "step": 3312 + }, + { + "epoch": 1.5829998207992353, + "grad_norm": 0.5055307123773655, + "learning_rate": 9.91836689809261e-06, + "loss": 0.3249, + "step": 3313 + }, + { + "epoch": 1.5834776895048086, + "grad_norm": 0.45564093127252564, + "learning_rate": 9.913100374914279e-06, + "loss": 0.3027, + "step": 3314 + }, + { + "epoch": 1.5839555582103817, + "grad_norm": 0.48312085794596293, + "learning_rate": 9.907833875840374e-06, + "loss": 0.3143, + "step": 3315 + }, + { + "epoch": 1.5844334269159548, + "grad_norm": 0.4737622360232326, + "learning_rate": 9.902567402331723e-06, + "loss": 0.3171, + "step": 3316 + }, + { + "epoch": 1.584911295621528, + "grad_norm": 0.46646546545393947, + "learning_rate": 9.897300955849157e-06, + "loss": 0.306, + "step": 3317 + }, + { + "epoch": 1.585389164327101, + "grad_norm": 0.5382467045040225, + "learning_rate": 9.892034537853495e-06, + "loss": 0.3118, + "step": 3318 + }, + { + "epoch": 1.5858670330326743, + "grad_norm": 0.46491097933901154, + "learning_rate": 9.886768149805546e-06, + "loss": 0.3236, + "step": 3319 + }, + { + "epoch": 1.5863449017382474, + "grad_norm": 0.48990510634701584, + "learning_rate": 9.881501793166117e-06, + "loss": 0.32, + "step": 3320 + }, + { + "epoch": 1.5868227704438205, + "grad_norm": 0.4872711839238837, + "learning_rate": 9.876235469395999e-06, + "loss": 0.3242, + "step": 3321 + }, + { + "epoch": 1.5873006391493938, + "grad_norm": 0.46588753435922464, + "learning_rate": 9.870969179955978e-06, + "loss": 0.3045, + "step": 3322 + }, + { + "epoch": 1.5877785078549669, + "grad_norm": 0.4365913398780867, + "learning_rate": 9.86570292630683e-06, + "loss": 0.31, + "step": 3323 + }, + { + "epoch": 1.58825637656054, + "grad_norm": 0.5319535297106422, + "learning_rate": 9.860436709909324e-06, + "loss": 0.3159, + "step": 3324 + }, + { + "epoch": 1.5887342452661133, + "grad_norm": 0.49154790591120767, + "learning_rate": 9.85517053222421e-06, + "loss": 0.3555, + "step": 3325 + }, + { + "epoch": 1.5892121139716862, + "grad_norm": 0.4843832531975595, + "learning_rate": 9.849904394712237e-06, + "loss": 0.312, + "step": 3326 + }, + { + "epoch": 1.5896899826772595, + "grad_norm": 0.466629508182958, + "learning_rate": 9.84463829883414e-06, + "loss": 0.3207, + "step": 3327 + }, + { + "epoch": 1.5901678513828326, + "grad_norm": 0.5132633803194183, + "learning_rate": 9.839372246050633e-06, + "loss": 0.3106, + "step": 3328 + }, + { + "epoch": 1.5906457200884057, + "grad_norm": 0.47341023027177437, + "learning_rate": 9.834106237822434e-06, + "loss": 0.3185, + "step": 3329 + }, + { + "epoch": 1.591123588793979, + "grad_norm": 0.49348692682013234, + "learning_rate": 9.82884027561024e-06, + "loss": 0.3286, + "step": 3330 + }, + { + "epoch": 1.5916014574995518, + "grad_norm": 0.4663364393993305, + "learning_rate": 9.823574360874732e-06, + "loss": 0.3146, + "step": 3331 + }, + { + "epoch": 1.5920793262051252, + "grad_norm": 0.5075761152310269, + "learning_rate": 9.818308495076582e-06, + "loss": 0.3231, + "step": 3332 + }, + { + "epoch": 1.5925571949106982, + "grad_norm": 0.4901694360126208, + "learning_rate": 9.813042679676453e-06, + "loss": 0.3078, + "step": 3333 + }, + { + "epoch": 1.5930350636162713, + "grad_norm": 0.47515746573089945, + "learning_rate": 9.807776916134985e-06, + "loss": 0.3177, + "step": 3334 + }, + { + "epoch": 1.5935129323218447, + "grad_norm": 0.45885158737432963, + "learning_rate": 9.802511205912815e-06, + "loss": 0.333, + "step": 3335 + }, + { + "epoch": 1.5939908010274177, + "grad_norm": 0.4932981943331556, + "learning_rate": 9.797245550470549e-06, + "loss": 0.2977, + "step": 3336 + }, + { + "epoch": 1.5944686697329908, + "grad_norm": 0.4566237024471887, + "learning_rate": 9.791979951268791e-06, + "loss": 0.3147, + "step": 3337 + }, + { + "epoch": 1.5949465384385642, + "grad_norm": 0.4767957332887537, + "learning_rate": 9.786714409768127e-06, + "loss": 0.3114, + "step": 3338 + }, + { + "epoch": 1.595424407144137, + "grad_norm": 0.583636525688097, + "learning_rate": 9.78144892742912e-06, + "loss": 0.3029, + "step": 3339 + }, + { + "epoch": 1.5959022758497103, + "grad_norm": 0.5062035780498737, + "learning_rate": 9.776183505712327e-06, + "loss": 0.3161, + "step": 3340 + }, + { + "epoch": 1.5963801445552834, + "grad_norm": 0.4838854040827197, + "learning_rate": 9.770918146078283e-06, + "loss": 0.3555, + "step": 3341 + }, + { + "epoch": 1.5968580132608565, + "grad_norm": 0.49830687795336376, + "learning_rate": 9.765652849987504e-06, + "loss": 0.3151, + "step": 3342 + }, + { + "epoch": 1.5973358819664298, + "grad_norm": 0.49241625719381726, + "learning_rate": 9.760387618900488e-06, + "loss": 0.3171, + "step": 3343 + }, + { + "epoch": 1.5978137506720027, + "grad_norm": 0.5523584158121128, + "learning_rate": 9.755122454277723e-06, + "loss": 0.3274, + "step": 3344 + }, + { + "epoch": 1.598291619377576, + "grad_norm": 0.4845078185424916, + "learning_rate": 9.749857357579667e-06, + "loss": 0.3149, + "step": 3345 + }, + { + "epoch": 1.5987694880831491, + "grad_norm": 0.5024697204031072, + "learning_rate": 9.744592330266769e-06, + "loss": 0.314, + "step": 3346 + }, + { + "epoch": 1.5992473567887222, + "grad_norm": 0.45934959722680274, + "learning_rate": 9.739327373799454e-06, + "loss": 0.3142, + "step": 3347 + }, + { + "epoch": 1.5997252254942955, + "grad_norm": 0.464086055829073, + "learning_rate": 9.734062489638127e-06, + "loss": 0.323, + "step": 3348 + }, + { + "epoch": 1.6002030941998686, + "grad_norm": 0.4831840810555116, + "learning_rate": 9.728797679243172e-06, + "loss": 0.3306, + "step": 3349 + }, + { + "epoch": 1.6006809629054417, + "grad_norm": 0.4764342274703041, + "learning_rate": 9.723532944074961e-06, + "loss": 0.3201, + "step": 3350 + }, + { + "epoch": 1.601158831611015, + "grad_norm": 0.48070401161567705, + "learning_rate": 9.71826828559383e-06, + "loss": 0.3281, + "step": 3351 + }, + { + "epoch": 1.601636700316588, + "grad_norm": 0.4708155298765893, + "learning_rate": 9.71300370526011e-06, + "loss": 0.322, + "step": 3352 + }, + { + "epoch": 1.6021145690221612, + "grad_norm": 0.4767194553320532, + "learning_rate": 9.7077392045341e-06, + "loss": 0.3258, + "step": 3353 + }, + { + "epoch": 1.6025924377277343, + "grad_norm": 0.479523768892733, + "learning_rate": 9.702474784876075e-06, + "loss": 0.3199, + "step": 3354 + }, + { + "epoch": 1.6030703064333074, + "grad_norm": 0.5236359911668674, + "learning_rate": 9.6972104477463e-06, + "loss": 0.3237, + "step": 3355 + }, + { + "epoch": 1.6035481751388807, + "grad_norm": 0.46485496717447494, + "learning_rate": 9.691946194605007e-06, + "loss": 0.3084, + "step": 3356 + }, + { + "epoch": 1.6040260438444536, + "grad_norm": 0.48134675159435425, + "learning_rate": 9.686682026912402e-06, + "loss": 0.3084, + "step": 3357 + }, + { + "epoch": 1.604503912550027, + "grad_norm": 0.49397380566853843, + "learning_rate": 9.681417946128677e-06, + "loss": 0.2973, + "step": 3358 + }, + { + "epoch": 1.6049817812556, + "grad_norm": 0.48607625348138517, + "learning_rate": 9.676153953713996e-06, + "loss": 0.3212, + "step": 3359 + }, + { + "epoch": 1.605459649961173, + "grad_norm": 0.487640811201148, + "learning_rate": 9.670890051128493e-06, + "loss": 0.3181, + "step": 3360 + }, + { + "epoch": 1.6059375186667464, + "grad_norm": 0.47749175931112464, + "learning_rate": 9.665626239832286e-06, + "loss": 0.3211, + "step": 3361 + }, + { + "epoch": 1.6064153873723195, + "grad_norm": 0.7933182151405382, + "learning_rate": 9.660362521285463e-06, + "loss": 0.3277, + "step": 3362 + }, + { + "epoch": 1.6068932560778926, + "grad_norm": 0.47250312135112554, + "learning_rate": 9.655098896948083e-06, + "loss": 0.3167, + "step": 3363 + }, + { + "epoch": 1.6073711247834659, + "grad_norm": 0.4693883987261192, + "learning_rate": 9.649835368280186e-06, + "loss": 0.3241, + "step": 3364 + }, + { + "epoch": 1.6078489934890388, + "grad_norm": 0.47939105139177257, + "learning_rate": 9.644571936741778e-06, + "loss": 0.3253, + "step": 3365 + }, + { + "epoch": 1.608326862194612, + "grad_norm": 0.471799876196094, + "learning_rate": 9.639308603792847e-06, + "loss": 0.2978, + "step": 3366 + }, + { + "epoch": 1.6088047309001852, + "grad_norm": 0.5037783606345849, + "learning_rate": 9.634045370893348e-06, + "loss": 0.3178, + "step": 3367 + }, + { + "epoch": 1.6092825996057583, + "grad_norm": 0.5003960430326154, + "learning_rate": 9.628782239503208e-06, + "loss": 0.3416, + "step": 3368 + }, + { + "epoch": 1.6097604683113316, + "grad_norm": 0.48869672104414125, + "learning_rate": 9.623519211082325e-06, + "loss": 0.3125, + "step": 3369 + }, + { + "epoch": 1.6102383370169044, + "grad_norm": 0.49864918116539536, + "learning_rate": 9.618256287090576e-06, + "loss": 0.3244, + "step": 3370 + }, + { + "epoch": 1.6107162057224778, + "grad_norm": 0.5019807545385679, + "learning_rate": 9.612993468987796e-06, + "loss": 0.3138, + "step": 3371 + }, + { + "epoch": 1.6111940744280508, + "grad_norm": 0.4683993877429079, + "learning_rate": 9.6077307582338e-06, + "loss": 0.332, + "step": 3372 + }, + { + "epoch": 1.611671943133624, + "grad_norm": 0.4605238593535419, + "learning_rate": 9.602468156288374e-06, + "loss": 0.3209, + "step": 3373 + }, + { + "epoch": 1.6121498118391973, + "grad_norm": 0.47258089848935597, + "learning_rate": 9.597205664611269e-06, + "loss": 0.3218, + "step": 3374 + }, + { + "epoch": 1.6126276805447703, + "grad_norm": 0.48987083989083485, + "learning_rate": 9.591943284662206e-06, + "loss": 0.3216, + "step": 3375 + }, + { + "epoch": 1.6131055492503434, + "grad_norm": 0.45509362905511785, + "learning_rate": 9.586681017900881e-06, + "loss": 0.3151, + "step": 3376 + }, + { + "epoch": 1.6135834179559168, + "grad_norm": 0.5226274148001115, + "learning_rate": 9.581418865786948e-06, + "loss": 0.296, + "step": 3377 + }, + { + "epoch": 1.6140612866614896, + "grad_norm": 0.4972753917887101, + "learning_rate": 9.576156829780038e-06, + "loss": 0.3083, + "step": 3378 + }, + { + "epoch": 1.614539155367063, + "grad_norm": 0.5066525482936579, + "learning_rate": 9.570894911339748e-06, + "loss": 0.3135, + "step": 3379 + }, + { + "epoch": 1.615017024072636, + "grad_norm": 0.4594810626516822, + "learning_rate": 9.56563311192564e-06, + "loss": 0.3315, + "step": 3380 + }, + { + "epoch": 1.6154948927782091, + "grad_norm": 0.5042615947206254, + "learning_rate": 9.560371432997244e-06, + "loss": 0.3119, + "step": 3381 + }, + { + "epoch": 1.6159727614837824, + "grad_norm": 0.4914519937789319, + "learning_rate": 9.55510987601406e-06, + "loss": 0.3276, + "step": 3382 + }, + { + "epoch": 1.6164506301893553, + "grad_norm": 0.45163236705607995, + "learning_rate": 9.549848442435547e-06, + "loss": 0.3236, + "step": 3383 + }, + { + "epoch": 1.6169284988949286, + "grad_norm": 0.4853699956930337, + "learning_rate": 9.544587133721133e-06, + "loss": 0.3262, + "step": 3384 + }, + { + "epoch": 1.6174063676005017, + "grad_norm": 0.5027837659953032, + "learning_rate": 9.53932595133022e-06, + "loss": 0.3159, + "step": 3385 + }, + { + "epoch": 1.6178842363060748, + "grad_norm": 0.5154740574278834, + "learning_rate": 9.534064896722157e-06, + "loss": 0.3022, + "step": 3386 + }, + { + "epoch": 1.6183621050116481, + "grad_norm": 0.4571584542414456, + "learning_rate": 9.528803971356275e-06, + "loss": 0.3106, + "step": 3387 + }, + { + "epoch": 1.6188399737172212, + "grad_norm": 0.5686619283974957, + "learning_rate": 9.523543176691861e-06, + "loss": 0.3067, + "step": 3388 + }, + { + "epoch": 1.6193178424227943, + "grad_norm": 0.5191198661164662, + "learning_rate": 9.518282514188163e-06, + "loss": 0.3071, + "step": 3389 + }, + { + "epoch": 1.6197957111283676, + "grad_norm": 0.46818165462038835, + "learning_rate": 9.513021985304399e-06, + "loss": 0.3116, + "step": 3390 + }, + { + "epoch": 1.6202735798339405, + "grad_norm": 0.4948050354348871, + "learning_rate": 9.507761591499747e-06, + "loss": 0.3252, + "step": 3391 + }, + { + "epoch": 1.6207514485395138, + "grad_norm": 0.5123067692293698, + "learning_rate": 9.502501334233343e-06, + "loss": 0.312, + "step": 3392 + }, + { + "epoch": 1.621229317245087, + "grad_norm": 0.4591469898065541, + "learning_rate": 9.497241214964297e-06, + "loss": 0.3208, + "step": 3393 + }, + { + "epoch": 1.62170718595066, + "grad_norm": 0.4756275676531446, + "learning_rate": 9.491981235151669e-06, + "loss": 0.3121, + "step": 3394 + }, + { + "epoch": 1.6221850546562333, + "grad_norm": 0.4827825826648765, + "learning_rate": 9.486721396254484e-06, + "loss": 0.3049, + "step": 3395 + }, + { + "epoch": 1.6226629233618064, + "grad_norm": 0.4915126644495333, + "learning_rate": 9.48146169973173e-06, + "loss": 0.3192, + "step": 3396 + }, + { + "epoch": 1.6231407920673795, + "grad_norm": 0.4610798399908291, + "learning_rate": 9.476202147042354e-06, + "loss": 0.3269, + "step": 3397 + }, + { + "epoch": 1.6236186607729526, + "grad_norm": 0.5599323558551276, + "learning_rate": 9.47094273964526e-06, + "loss": 0.335, + "step": 3398 + }, + { + "epoch": 1.6240965294785257, + "grad_norm": 0.5255208607648829, + "learning_rate": 9.465683478999319e-06, + "loss": 0.3291, + "step": 3399 + }, + { + "epoch": 1.624574398184099, + "grad_norm": 0.4924211708443086, + "learning_rate": 9.460424366563355e-06, + "loss": 0.3217, + "step": 3400 + }, + { + "epoch": 1.625052266889672, + "grad_norm": 0.476959287524869, + "learning_rate": 9.455165403796157e-06, + "loss": 0.3091, + "step": 3401 + }, + { + "epoch": 1.6255301355952452, + "grad_norm": 0.4672843679938778, + "learning_rate": 9.449906592156463e-06, + "loss": 0.3176, + "step": 3402 + }, + { + "epoch": 1.6260080043008185, + "grad_norm": 0.5009267665364574, + "learning_rate": 9.444647933102977e-06, + "loss": 0.3306, + "step": 3403 + }, + { + "epoch": 1.6264858730063914, + "grad_norm": 0.49020359549356995, + "learning_rate": 9.43938942809436e-06, + "loss": 0.3016, + "step": 3404 + }, + { + "epoch": 1.6269637417119647, + "grad_norm": 0.4871954472988403, + "learning_rate": 9.434131078589224e-06, + "loss": 0.3118, + "step": 3405 + }, + { + "epoch": 1.6274416104175378, + "grad_norm": 0.47764178365700216, + "learning_rate": 9.428872886046145e-06, + "loss": 0.3277, + "step": 3406 + }, + { + "epoch": 1.6279194791231109, + "grad_norm": 0.49682895676211, + "learning_rate": 9.423614851923657e-06, + "loss": 0.3052, + "step": 3407 + }, + { + "epoch": 1.6283973478286842, + "grad_norm": 0.4664914179892642, + "learning_rate": 9.418356977680238e-06, + "loss": 0.3182, + "step": 3408 + }, + { + "epoch": 1.6288752165342573, + "grad_norm": 0.46865718754503594, + "learning_rate": 9.413099264774334e-06, + "loss": 0.303, + "step": 3409 + }, + { + "epoch": 1.6293530852398304, + "grad_norm": 0.487385856422879, + "learning_rate": 9.407841714664343e-06, + "loss": 0.3303, + "step": 3410 + }, + { + "epoch": 1.6298309539454034, + "grad_norm": 0.5465692345398058, + "learning_rate": 9.402584328808614e-06, + "loss": 0.3158, + "step": 3411 + }, + { + "epoch": 1.6303088226509765, + "grad_norm": 0.47198199422777404, + "learning_rate": 9.39732710866545e-06, + "loss": 0.3182, + "step": 3412 + }, + { + "epoch": 1.6307866913565499, + "grad_norm": 0.46116344865689524, + "learning_rate": 9.392070055693122e-06, + "loss": 0.3393, + "step": 3413 + }, + { + "epoch": 1.631264560062123, + "grad_norm": 0.4763679490362691, + "learning_rate": 9.38681317134983e-06, + "loss": 0.3161, + "step": 3414 + }, + { + "epoch": 1.631742428767696, + "grad_norm": 0.46624869036989713, + "learning_rate": 9.381556457093752e-06, + "loss": 0.3285, + "step": 3415 + }, + { + "epoch": 1.6322202974732694, + "grad_norm": 0.46060193625574064, + "learning_rate": 9.376299914383004e-06, + "loss": 0.3169, + "step": 3416 + }, + { + "epoch": 1.6326981661788422, + "grad_norm": 0.48033327662418424, + "learning_rate": 9.371043544675656e-06, + "loss": 0.3169, + "step": 3417 + }, + { + "epoch": 1.6331760348844155, + "grad_norm": 0.4802037904197471, + "learning_rate": 9.365787349429734e-06, + "loss": 0.325, + "step": 3418 + }, + { + "epoch": 1.6336539035899886, + "grad_norm": 0.5496693935684847, + "learning_rate": 9.360531330103218e-06, + "loss": 0.3025, + "step": 3419 + }, + { + "epoch": 1.6341317722955617, + "grad_norm": 0.4824651576505537, + "learning_rate": 9.355275488154025e-06, + "loss": 0.3251, + "step": 3420 + }, + { + "epoch": 1.634609641001135, + "grad_norm": 0.49309460242445563, + "learning_rate": 9.350019825040042e-06, + "loss": 0.3023, + "step": 3421 + }, + { + "epoch": 1.6350875097067081, + "grad_norm": 0.471529984373171, + "learning_rate": 9.344764342219096e-06, + "loss": 0.3212, + "step": 3422 + }, + { + "epoch": 1.6355653784122812, + "grad_norm": 0.47065857771956116, + "learning_rate": 9.33950904114896e-06, + "loss": 0.3107, + "step": 3423 + }, + { + "epoch": 1.6360432471178543, + "grad_norm": 0.47686302451072865, + "learning_rate": 9.334253923287364e-06, + "loss": 0.3156, + "step": 3424 + }, + { + "epoch": 1.6365211158234274, + "grad_norm": 0.5127539654610185, + "learning_rate": 9.32899899009199e-06, + "loss": 0.3162, + "step": 3425 + }, + { + "epoch": 1.6369989845290007, + "grad_norm": 0.4747555573267503, + "learning_rate": 9.323744243020458e-06, + "loss": 0.3094, + "step": 3426 + }, + { + "epoch": 1.6374768532345738, + "grad_norm": 2.954360238976603, + "learning_rate": 9.318489683530342e-06, + "loss": 0.2972, + "step": 3427 + }, + { + "epoch": 1.637954721940147, + "grad_norm": 0.5656096550756227, + "learning_rate": 9.313235313079171e-06, + "loss": 0.321, + "step": 3428 + }, + { + "epoch": 1.6384325906457202, + "grad_norm": 0.5225575379322978, + "learning_rate": 9.307981133124407e-06, + "loss": 0.3252, + "step": 3429 + }, + { + "epoch": 1.638910459351293, + "grad_norm": 0.5091222365768304, + "learning_rate": 9.30272714512347e-06, + "loss": 0.3249, + "step": 3430 + }, + { + "epoch": 1.6393883280568664, + "grad_norm": 0.4992499483033707, + "learning_rate": 9.297473350533723e-06, + "loss": 0.3172, + "step": 3431 + }, + { + "epoch": 1.6398661967624395, + "grad_norm": 0.4831370776686128, + "learning_rate": 9.292219750812475e-06, + "loss": 0.3244, + "step": 3432 + }, + { + "epoch": 1.6403440654680126, + "grad_norm": 0.4476338694579795, + "learning_rate": 9.286966347416982e-06, + "loss": 0.3148, + "step": 3433 + }, + { + "epoch": 1.640821934173586, + "grad_norm": 0.43830875865078245, + "learning_rate": 9.281713141804449e-06, + "loss": 0.3182, + "step": 3434 + }, + { + "epoch": 1.641299802879159, + "grad_norm": 0.4809168285139566, + "learning_rate": 9.276460135432019e-06, + "loss": 0.3241, + "step": 3435 + }, + { + "epoch": 1.641777671584732, + "grad_norm": 0.4754134644489622, + "learning_rate": 9.271207329756787e-06, + "loss": 0.2952, + "step": 3436 + }, + { + "epoch": 1.6422555402903054, + "grad_norm": 0.48655404908499644, + "learning_rate": 9.265954726235783e-06, + "loss": 0.312, + "step": 3437 + }, + { + "epoch": 1.6427334089958783, + "grad_norm": 0.4683670052071872, + "learning_rate": 9.26070232632599e-06, + "loss": 0.324, + "step": 3438 + }, + { + "epoch": 1.6432112777014516, + "grad_norm": 0.4890631847154369, + "learning_rate": 9.255450131484334e-06, + "loss": 0.3092, + "step": 3439 + }, + { + "epoch": 1.6436891464070247, + "grad_norm": 0.5288231867382709, + "learning_rate": 9.250198143167675e-06, + "loss": 0.3033, + "step": 3440 + }, + { + "epoch": 1.6441670151125978, + "grad_norm": 0.48338257041337923, + "learning_rate": 9.244946362832825e-06, + "loss": 0.3129, + "step": 3441 + }, + { + "epoch": 1.644644883818171, + "grad_norm": 0.4614188770486306, + "learning_rate": 9.239694791936536e-06, + "loss": 0.3121, + "step": 3442 + }, + { + "epoch": 1.645122752523744, + "grad_norm": 0.48639433398453336, + "learning_rate": 9.234443431935498e-06, + "loss": 0.3359, + "step": 3443 + }, + { + "epoch": 1.6456006212293173, + "grad_norm": 0.454810660207841, + "learning_rate": 9.22919228428635e-06, + "loss": 0.3307, + "step": 3444 + }, + { + "epoch": 1.6460784899348904, + "grad_norm": 0.4813257235091246, + "learning_rate": 9.223941350445666e-06, + "loss": 0.3322, + "step": 3445 + }, + { + "epoch": 1.6465563586404635, + "grad_norm": 0.466364711126299, + "learning_rate": 9.218690631869961e-06, + "loss": 0.3192, + "step": 3446 + }, + { + "epoch": 1.6470342273460368, + "grad_norm": 0.5066591698413345, + "learning_rate": 9.213440130015692e-06, + "loss": 0.3273, + "step": 3447 + }, + { + "epoch": 1.6475120960516099, + "grad_norm": 0.49776839266177914, + "learning_rate": 9.208189846339259e-06, + "loss": 0.3338, + "step": 3448 + }, + { + "epoch": 1.647989964757183, + "grad_norm": 0.480237463551773, + "learning_rate": 9.202939782296992e-06, + "loss": 0.3249, + "step": 3449 + }, + { + "epoch": 1.6484678334627563, + "grad_norm": 0.46449483591155677, + "learning_rate": 9.19768993934517e-06, + "loss": 0.2952, + "step": 3450 + }, + { + "epoch": 1.6489457021683291, + "grad_norm": 0.46253453287395435, + "learning_rate": 9.192440318940009e-06, + "loss": 0.3152, + "step": 3451 + }, + { + "epoch": 1.6494235708739025, + "grad_norm": 0.8458069073772643, + "learning_rate": 9.187190922537654e-06, + "loss": 0.3168, + "step": 3452 + }, + { + "epoch": 1.6499014395794755, + "grad_norm": 0.6969051435404784, + "learning_rate": 9.181941751594203e-06, + "loss": 0.3276, + "step": 3453 + }, + { + "epoch": 1.6503793082850486, + "grad_norm": 0.461496229769572, + "learning_rate": 9.176692807565679e-06, + "loss": 0.326, + "step": 3454 + }, + { + "epoch": 1.650857176990622, + "grad_norm": 0.483293334225472, + "learning_rate": 9.171444091908046e-06, + "loss": 0.3164, + "step": 3455 + }, + { + "epoch": 1.6513350456961948, + "grad_norm": 0.4776240975371244, + "learning_rate": 9.166195606077205e-06, + "loss": 0.3153, + "step": 3456 + }, + { + "epoch": 1.6518129144017681, + "grad_norm": 0.47229526671977295, + "learning_rate": 9.160947351529001e-06, + "loss": 0.3199, + "step": 3457 + }, + { + "epoch": 1.6522907831073412, + "grad_norm": 0.5146895918341302, + "learning_rate": 9.155699329719196e-06, + "loss": 0.3286, + "step": 3458 + }, + { + "epoch": 1.6527686518129143, + "grad_norm": 0.4384463475806529, + "learning_rate": 9.150451542103505e-06, + "loss": 0.3301, + "step": 3459 + }, + { + "epoch": 1.6532465205184876, + "grad_norm": 0.4703968423103728, + "learning_rate": 9.145203990137571e-06, + "loss": 0.3096, + "step": 3460 + }, + { + "epoch": 1.6537243892240607, + "grad_norm": 0.4793369058069929, + "learning_rate": 9.13995667527697e-06, + "loss": 0.3284, + "step": 3461 + }, + { + "epoch": 1.6542022579296338, + "grad_norm": 0.45236867666956726, + "learning_rate": 9.134709598977218e-06, + "loss": 0.3234, + "step": 3462 + }, + { + "epoch": 1.6546801266352071, + "grad_norm": 0.47207424023467076, + "learning_rate": 9.129462762693759e-06, + "loss": 0.3096, + "step": 3463 + }, + { + "epoch": 1.65515799534078, + "grad_norm": 0.4750207016386637, + "learning_rate": 9.124216167881974e-06, + "loss": 0.3025, + "step": 3464 + }, + { + "epoch": 1.6556358640463533, + "grad_norm": 0.45011316729860085, + "learning_rate": 9.118969815997174e-06, + "loss": 0.3117, + "step": 3465 + }, + { + "epoch": 1.6561137327519264, + "grad_norm": 0.45982543099231327, + "learning_rate": 9.11372370849461e-06, + "loss": 0.338, + "step": 3466 + }, + { + "epoch": 1.6565916014574995, + "grad_norm": 0.47099246855087673, + "learning_rate": 9.108477846829447e-06, + "loss": 0.313, + "step": 3467 + }, + { + "epoch": 1.6570694701630728, + "grad_norm": 0.4872573696860828, + "learning_rate": 9.103232232456812e-06, + "loss": 0.3257, + "step": 3468 + }, + { + "epoch": 1.6575473388686457, + "grad_norm": 0.44856712782419816, + "learning_rate": 9.097986866831733e-06, + "loss": 0.321, + "step": 3469 + }, + { + "epoch": 1.658025207574219, + "grad_norm": 0.4420962792302268, + "learning_rate": 9.092741751409186e-06, + "loss": 0.3273, + "step": 3470 + }, + { + "epoch": 1.658503076279792, + "grad_norm": 0.4687035556845388, + "learning_rate": 9.087496887644075e-06, + "loss": 0.3249, + "step": 3471 + }, + { + "epoch": 1.6589809449853652, + "grad_norm": 0.49495908912270503, + "learning_rate": 9.08225227699123e-06, + "loss": 0.3054, + "step": 3472 + }, + { + "epoch": 1.6594588136909385, + "grad_norm": 0.48042565301421647, + "learning_rate": 9.077007920905413e-06, + "loss": 0.3207, + "step": 3473 + }, + { + "epoch": 1.6599366823965116, + "grad_norm": 0.46644981695972604, + "learning_rate": 9.071763820841322e-06, + "loss": 0.3226, + "step": 3474 + }, + { + "epoch": 1.6604145511020847, + "grad_norm": 0.4810549358732364, + "learning_rate": 9.06651997825357e-06, + "loss": 0.3275, + "step": 3475 + }, + { + "epoch": 1.660892419807658, + "grad_norm": 0.4889810661635855, + "learning_rate": 9.061276394596712e-06, + "loss": 0.3238, + "step": 3476 + }, + { + "epoch": 1.6613702885132309, + "grad_norm": 0.4429594397309242, + "learning_rate": 9.05603307132522e-06, + "loss": 0.3114, + "step": 3477 + }, + { + "epoch": 1.6618481572188042, + "grad_norm": 0.46379425469030755, + "learning_rate": 9.050790009893505e-06, + "loss": 0.313, + "step": 3478 + }, + { + "epoch": 1.6623260259243773, + "grad_norm": 0.5433136463867893, + "learning_rate": 9.0455472117559e-06, + "loss": 0.3061, + "step": 3479 + }, + { + "epoch": 1.6628038946299504, + "grad_norm": 0.4514939829237272, + "learning_rate": 9.040304678366658e-06, + "loss": 0.3147, + "step": 3480 + }, + { + "epoch": 1.6632817633355237, + "grad_norm": 0.471795348976342, + "learning_rate": 9.03506241117997e-06, + "loss": 0.3277, + "step": 3481 + }, + { + "epoch": 1.6637596320410966, + "grad_norm": 0.480378485496424, + "learning_rate": 9.02982041164995e-06, + "loss": 0.3062, + "step": 3482 + }, + { + "epoch": 1.6642375007466699, + "grad_norm": 0.44413418724919074, + "learning_rate": 9.024578681230632e-06, + "loss": 0.3231, + "step": 3483 + }, + { + "epoch": 1.664715369452243, + "grad_norm": 0.495040097592052, + "learning_rate": 9.01933722137598e-06, + "loss": 0.3113, + "step": 3484 + }, + { + "epoch": 1.665193238157816, + "grad_norm": 0.5093870178221196, + "learning_rate": 9.014096033539889e-06, + "loss": 0.3206, + "step": 3485 + }, + { + "epoch": 1.6656711068633894, + "grad_norm": 0.5133513477654141, + "learning_rate": 9.008855119176165e-06, + "loss": 0.3057, + "step": 3486 + }, + { + "epoch": 1.6661489755689625, + "grad_norm": 0.4699567002694254, + "learning_rate": 9.003614479738544e-06, + "loss": 0.3287, + "step": 3487 + }, + { + "epoch": 1.6666268442745356, + "grad_norm": 0.4892249229390645, + "learning_rate": 8.998374116680697e-06, + "loss": 0.3274, + "step": 3488 + }, + { + "epoch": 1.6671047129801089, + "grad_norm": 0.4946263872577261, + "learning_rate": 8.993134031456198e-06, + "loss": 0.3179, + "step": 3489 + }, + { + "epoch": 1.6675825816856817, + "grad_norm": 0.48264814521862553, + "learning_rate": 8.987894225518556e-06, + "loss": 0.3097, + "step": 3490 + }, + { + "epoch": 1.668060450391255, + "grad_norm": 0.4605926310411199, + "learning_rate": 8.982654700321207e-06, + "loss": 0.3218, + "step": 3491 + }, + { + "epoch": 1.6685383190968281, + "grad_norm": 0.4807584069153299, + "learning_rate": 8.977415457317495e-06, + "loss": 0.3295, + "step": 3492 + }, + { + "epoch": 1.6690161878024012, + "grad_norm": 0.5126134343679848, + "learning_rate": 8.972176497960698e-06, + "loss": 0.3023, + "step": 3493 + }, + { + "epoch": 1.6694940565079746, + "grad_norm": 0.4866870303998035, + "learning_rate": 8.966937823704013e-06, + "loss": 0.3246, + "step": 3494 + }, + { + "epoch": 1.6699719252135474, + "grad_norm": 0.4701808785486881, + "learning_rate": 8.961699436000548e-06, + "loss": 0.3138, + "step": 3495 + }, + { + "epoch": 1.6704497939191207, + "grad_norm": 0.48437050470284443, + "learning_rate": 8.956461336303345e-06, + "loss": 0.3309, + "step": 3496 + }, + { + "epoch": 1.6709276626246938, + "grad_norm": 0.4915042033611595, + "learning_rate": 8.951223526065363e-06, + "loss": 0.3056, + "step": 3497 + }, + { + "epoch": 1.671405531330267, + "grad_norm": 0.48853602825271614, + "learning_rate": 8.945986006739472e-06, + "loss": 0.3398, + "step": 3498 + }, + { + "epoch": 1.6718834000358402, + "grad_norm": 0.47166819435850216, + "learning_rate": 8.94074877977847e-06, + "loss": 0.32, + "step": 3499 + }, + { + "epoch": 1.6723612687414133, + "grad_norm": 0.4848661547844431, + "learning_rate": 8.93551184663507e-06, + "loss": 0.3163, + "step": 3500 + }, + { + "epoch": 1.6728391374469864, + "grad_norm": 0.5013003475484796, + "learning_rate": 8.930275208761908e-06, + "loss": 0.3187, + "step": 3501 + }, + { + "epoch": 1.6733170061525597, + "grad_norm": 0.5469587538744264, + "learning_rate": 8.925038867611536e-06, + "loss": 0.3181, + "step": 3502 + }, + { + "epoch": 1.6737948748581326, + "grad_norm": 0.4669973541608491, + "learning_rate": 8.919802824636418e-06, + "loss": 0.3089, + "step": 3503 + }, + { + "epoch": 1.674272743563706, + "grad_norm": 0.6631093786446882, + "learning_rate": 8.914567081288943e-06, + "loss": 0.3196, + "step": 3504 + }, + { + "epoch": 1.674750612269279, + "grad_norm": 0.4870905723140869, + "learning_rate": 8.909331639021414e-06, + "loss": 0.3002, + "step": 3505 + }, + { + "epoch": 1.675228480974852, + "grad_norm": 0.5011009424411035, + "learning_rate": 8.904096499286047e-06, + "loss": 0.3197, + "step": 3506 + }, + { + "epoch": 1.6757063496804254, + "grad_norm": 0.49155633759059203, + "learning_rate": 8.89886166353498e-06, + "loss": 0.3081, + "step": 3507 + }, + { + "epoch": 1.6761842183859983, + "grad_norm": 0.49765551877135294, + "learning_rate": 8.893627133220266e-06, + "loss": 0.3117, + "step": 3508 + }, + { + "epoch": 1.6766620870915716, + "grad_norm": 0.4911618547326789, + "learning_rate": 8.888392909793866e-06, + "loss": 0.3098, + "step": 3509 + }, + { + "epoch": 1.6771399557971447, + "grad_norm": 0.5063806765110067, + "learning_rate": 8.883158994707666e-06, + "loss": 0.3335, + "step": 3510 + }, + { + "epoch": 1.6776178245027178, + "grad_norm": 0.5734707096493096, + "learning_rate": 8.87792538941346e-06, + "loss": 0.3082, + "step": 3511 + }, + { + "epoch": 1.678095693208291, + "grad_norm": 0.5067693296298389, + "learning_rate": 8.872692095362957e-06, + "loss": 0.3061, + "step": 3512 + }, + { + "epoch": 1.6785735619138642, + "grad_norm": 0.5479709311181276, + "learning_rate": 8.867459114007784e-06, + "loss": 0.3164, + "step": 3513 + }, + { + "epoch": 1.6790514306194373, + "grad_norm": 0.5287134929809657, + "learning_rate": 8.862226446799474e-06, + "loss": 0.3307, + "step": 3514 + }, + { + "epoch": 1.6795292993250106, + "grad_norm": 0.4649713213931607, + "learning_rate": 8.856994095189477e-06, + "loss": 0.3155, + "step": 3515 + }, + { + "epoch": 1.6800071680305835, + "grad_norm": 0.49222649083051145, + "learning_rate": 8.851762060629155e-06, + "loss": 0.3309, + "step": 3516 + }, + { + "epoch": 1.6804850367361568, + "grad_norm": 0.5143659759951493, + "learning_rate": 8.846530344569785e-06, + "loss": 0.3245, + "step": 3517 + }, + { + "epoch": 1.6809629054417299, + "grad_norm": 0.49180603565206343, + "learning_rate": 8.841298948462548e-06, + "loss": 0.319, + "step": 3518 + }, + { + "epoch": 1.681440774147303, + "grad_norm": 0.4904305397252043, + "learning_rate": 8.836067873758544e-06, + "loss": 0.3076, + "step": 3519 + }, + { + "epoch": 1.6819186428528763, + "grad_norm": 0.5069444940408103, + "learning_rate": 8.830837121908783e-06, + "loss": 0.3142, + "step": 3520 + }, + { + "epoch": 1.6823965115584492, + "grad_norm": 0.47329267540124315, + "learning_rate": 8.825606694364178e-06, + "loss": 0.3116, + "step": 3521 + }, + { + "epoch": 1.6828743802640225, + "grad_norm": 0.5502456613469378, + "learning_rate": 8.820376592575562e-06, + "loss": 0.3266, + "step": 3522 + }, + { + "epoch": 1.6833522489695956, + "grad_norm": 0.5242294700558009, + "learning_rate": 8.81514681799367e-06, + "loss": 0.3171, + "step": 3523 + }, + { + "epoch": 1.6838301176751687, + "grad_norm": 0.4587826429514902, + "learning_rate": 8.80991737206915e-06, + "loss": 0.3128, + "step": 3524 + }, + { + "epoch": 1.684307986380742, + "grad_norm": 0.5210196583488915, + "learning_rate": 8.804688256252557e-06, + "loss": 0.3201, + "step": 3525 + }, + { + "epoch": 1.684785855086315, + "grad_norm": 0.511732360451873, + "learning_rate": 8.79945947199436e-06, + "loss": 0.3213, + "step": 3526 + }, + { + "epoch": 1.6852637237918882, + "grad_norm": 0.6266517581717058, + "learning_rate": 8.794231020744926e-06, + "loss": 0.308, + "step": 3527 + }, + { + "epoch": 1.6857415924974615, + "grad_norm": 0.4306673990518024, + "learning_rate": 8.789002903954538e-06, + "loss": 0.3062, + "step": 3528 + }, + { + "epoch": 1.6862194612030343, + "grad_norm": 0.5201991430967379, + "learning_rate": 8.783775123073383e-06, + "loss": 0.2972, + "step": 3529 + }, + { + "epoch": 1.6866973299086077, + "grad_norm": 0.4789566466601809, + "learning_rate": 8.778547679551555e-06, + "loss": 0.3287, + "step": 3530 + }, + { + "epoch": 1.6871751986141807, + "grad_norm": 0.4503867654809924, + "learning_rate": 8.773320574839055e-06, + "loss": 0.3158, + "step": 3531 + }, + { + "epoch": 1.6876530673197538, + "grad_norm": 0.5592132809330164, + "learning_rate": 8.76809381038579e-06, + "loss": 0.2928, + "step": 3532 + }, + { + "epoch": 1.6881309360253272, + "grad_norm": 0.5248394969953822, + "learning_rate": 8.762867387641569e-06, + "loss": 0.3192, + "step": 3533 + }, + { + "epoch": 1.6886088047309, + "grad_norm": 0.473584910793579, + "learning_rate": 8.757641308056111e-06, + "loss": 0.315, + "step": 3534 + }, + { + "epoch": 1.6890866734364733, + "grad_norm": 0.5181876158952626, + "learning_rate": 8.75241557307904e-06, + "loss": 0.3034, + "step": 3535 + }, + { + "epoch": 1.6895645421420464, + "grad_norm": 0.4871622666683337, + "learning_rate": 8.747190184159883e-06, + "loss": 0.3042, + "step": 3536 + }, + { + "epoch": 1.6900424108476195, + "grad_norm": 0.4857165458002748, + "learning_rate": 8.741965142748072e-06, + "loss": 0.3125, + "step": 3537 + }, + { + "epoch": 1.6905202795531928, + "grad_norm": 0.47357991306741637, + "learning_rate": 8.736740450292937e-06, + "loss": 0.3216, + "step": 3538 + }, + { + "epoch": 1.690998148258766, + "grad_norm": 0.45787328331002924, + "learning_rate": 8.731516108243717e-06, + "loss": 0.3252, + "step": 3539 + }, + { + "epoch": 1.691476016964339, + "grad_norm": 0.479337722950027, + "learning_rate": 8.726292118049555e-06, + "loss": 0.3217, + "step": 3540 + }, + { + "epoch": 1.6919538856699123, + "grad_norm": 0.4770400698452951, + "learning_rate": 8.721068481159491e-06, + "loss": 0.3332, + "step": 3541 + }, + { + "epoch": 1.6924317543754852, + "grad_norm": 0.5242710945344221, + "learning_rate": 8.715845199022468e-06, + "loss": 0.3065, + "step": 3542 + }, + { + "epoch": 1.6929096230810585, + "grad_norm": 0.47188838291316537, + "learning_rate": 8.710622273087338e-06, + "loss": 0.3181, + "step": 3543 + }, + { + "epoch": 1.6933874917866316, + "grad_norm": 0.4916479471672269, + "learning_rate": 8.705399704802844e-06, + "loss": 0.3124, + "step": 3544 + }, + { + "epoch": 1.6938653604922047, + "grad_norm": 0.4864876674585418, + "learning_rate": 8.700177495617635e-06, + "loss": 0.303, + "step": 3545 + }, + { + "epoch": 1.694343229197778, + "grad_norm": 0.48143706434962735, + "learning_rate": 8.694955646980261e-06, + "loss": 0.3244, + "step": 3546 + }, + { + "epoch": 1.694821097903351, + "grad_norm": 0.5024128714258727, + "learning_rate": 8.68973416033917e-06, + "loss": 0.3214, + "step": 3547 + }, + { + "epoch": 1.6952989666089242, + "grad_norm": 0.4701290161733491, + "learning_rate": 8.684513037142708e-06, + "loss": 0.319, + "step": 3548 + }, + { + "epoch": 1.6957768353144973, + "grad_norm": 0.5196310295903178, + "learning_rate": 8.67929227883913e-06, + "loss": 0.3304, + "step": 3549 + }, + { + "epoch": 1.6962547040200704, + "grad_norm": 0.4470857047416331, + "learning_rate": 8.674071886876572e-06, + "loss": 0.3116, + "step": 3550 + }, + { + "epoch": 1.6967325727256437, + "grad_norm": 0.49139633093819357, + "learning_rate": 8.668851862703084e-06, + "loss": 0.3147, + "step": 3551 + }, + { + "epoch": 1.6972104414312168, + "grad_norm": 0.4533593202968756, + "learning_rate": 8.66363220776661e-06, + "loss": 0.3117, + "step": 3552 + }, + { + "epoch": 1.69768831013679, + "grad_norm": 0.5016957872077247, + "learning_rate": 8.658412923514987e-06, + "loss": 0.3058, + "step": 3553 + }, + { + "epoch": 1.6981661788423632, + "grad_norm": 0.49422695943685363, + "learning_rate": 8.653194011395955e-06, + "loss": 0.3166, + "step": 3554 + }, + { + "epoch": 1.698644047547936, + "grad_norm": 0.5041371009037742, + "learning_rate": 8.647975472857148e-06, + "loss": 0.3058, + "step": 3555 + }, + { + "epoch": 1.6991219162535094, + "grad_norm": 0.46401191058629204, + "learning_rate": 8.642757309346092e-06, + "loss": 0.2981, + "step": 3556 + }, + { + "epoch": 1.6995997849590825, + "grad_norm": 0.47462647906742783, + "learning_rate": 8.637539522310219e-06, + "loss": 0.3158, + "step": 3557 + }, + { + "epoch": 1.7000776536646556, + "grad_norm": 0.48560418398414035, + "learning_rate": 8.63232211319685e-06, + "loss": 0.3101, + "step": 3558 + }, + { + "epoch": 1.700555522370229, + "grad_norm": 0.4924892083480203, + "learning_rate": 8.6271050834532e-06, + "loss": 0.3175, + "step": 3559 + }, + { + "epoch": 1.7010333910758018, + "grad_norm": 0.4739306925770209, + "learning_rate": 8.621888434526382e-06, + "loss": 0.3055, + "step": 3560 + }, + { + "epoch": 1.701511259781375, + "grad_norm": 0.5668938530252869, + "learning_rate": 8.616672167863406e-06, + "loss": 0.3001, + "step": 3561 + }, + { + "epoch": 1.7019891284869482, + "grad_norm": 0.5258055525735766, + "learning_rate": 8.611456284911167e-06, + "loss": 0.3317, + "step": 3562 + }, + { + "epoch": 1.7024669971925213, + "grad_norm": 0.4731593091810082, + "learning_rate": 8.606240787116459e-06, + "loss": 0.305, + "step": 3563 + }, + { + "epoch": 1.7029448658980946, + "grad_norm": 0.47298302795830727, + "learning_rate": 8.601025675925976e-06, + "loss": 0.3518, + "step": 3564 + }, + { + "epoch": 1.7034227346036677, + "grad_norm": 0.5044823125447081, + "learning_rate": 8.595810952786289e-06, + "loss": 0.316, + "step": 3565 + }, + { + "epoch": 1.7039006033092408, + "grad_norm": 0.48834528388610515, + "learning_rate": 8.590596619143874e-06, + "loss": 0.3185, + "step": 3566 + }, + { + "epoch": 1.704378472014814, + "grad_norm": 0.47503311635279094, + "learning_rate": 8.585382676445099e-06, + "loss": 0.3122, + "step": 3567 + }, + { + "epoch": 1.704856340720387, + "grad_norm": 0.4779679121742256, + "learning_rate": 8.580169126136211e-06, + "loss": 0.3024, + "step": 3568 + }, + { + "epoch": 1.7053342094259603, + "grad_norm": 0.4881996436437521, + "learning_rate": 8.574955969663364e-06, + "loss": 0.308, + "step": 3569 + }, + { + "epoch": 1.7058120781315333, + "grad_norm": 0.49243529307480594, + "learning_rate": 8.569743208472594e-06, + "loss": 0.3211, + "step": 3570 + }, + { + "epoch": 1.7062899468371064, + "grad_norm": 0.47769625174462077, + "learning_rate": 8.56453084400983e-06, + "loss": 0.3022, + "step": 3571 + }, + { + "epoch": 1.7067678155426798, + "grad_norm": 0.4687274182843692, + "learning_rate": 8.559318877720889e-06, + "loss": 0.3026, + "step": 3572 + }, + { + "epoch": 1.7072456842482526, + "grad_norm": 0.48608349498390174, + "learning_rate": 8.554107311051477e-06, + "loss": 0.3178, + "step": 3573 + }, + { + "epoch": 1.707723552953826, + "grad_norm": 0.48723365794182133, + "learning_rate": 8.548896145447191e-06, + "loss": 0.3041, + "step": 3574 + }, + { + "epoch": 1.708201421659399, + "grad_norm": 0.5215881012424272, + "learning_rate": 8.543685382353518e-06, + "loss": 0.3147, + "step": 3575 + }, + { + "epoch": 1.7086792903649721, + "grad_norm": 0.46461361060388295, + "learning_rate": 8.538475023215831e-06, + "loss": 0.3296, + "step": 3576 + }, + { + "epoch": 1.7091571590705454, + "grad_norm": 0.48227283989606085, + "learning_rate": 8.533265069479393e-06, + "loss": 0.3143, + "step": 3577 + }, + { + "epoch": 1.7096350277761185, + "grad_norm": 0.4427755116826597, + "learning_rate": 8.52805552258935e-06, + "loss": 0.2946, + "step": 3578 + }, + { + "epoch": 1.7101128964816916, + "grad_norm": 0.4557912224968937, + "learning_rate": 8.52284638399074e-06, + "loss": 0.3117, + "step": 3579 + }, + { + "epoch": 1.710590765187265, + "grad_norm": 0.48947216925097453, + "learning_rate": 8.51763765512849e-06, + "loss": 0.3135, + "step": 3580 + }, + { + "epoch": 1.7110686338928378, + "grad_norm": 0.48750224853053237, + "learning_rate": 8.512429337447404e-06, + "loss": 0.3152, + "step": 3581 + }, + { + "epoch": 1.7115465025984111, + "grad_norm": 0.5424080934153999, + "learning_rate": 8.507221432392177e-06, + "loss": 0.3185, + "step": 3582 + }, + { + "epoch": 1.7120243713039842, + "grad_norm": 0.5243992703363829, + "learning_rate": 8.502013941407396e-06, + "loss": 0.2851, + "step": 3583 + }, + { + "epoch": 1.7125022400095573, + "grad_norm": 0.47093855520770667, + "learning_rate": 8.496806865937523e-06, + "loss": 0.3141, + "step": 3584 + }, + { + "epoch": 1.7129801087151306, + "grad_norm": 0.5163527516479693, + "learning_rate": 8.491600207426907e-06, + "loss": 0.3048, + "step": 3585 + }, + { + "epoch": 1.7134579774207037, + "grad_norm": 0.46552143826305653, + "learning_rate": 8.48639396731979e-06, + "loss": 0.3002, + "step": 3586 + }, + { + "epoch": 1.7139358461262768, + "grad_norm": 0.47841277412531025, + "learning_rate": 8.481188147060283e-06, + "loss": 0.3095, + "step": 3587 + }, + { + "epoch": 1.71441371483185, + "grad_norm": 0.5911896029327872, + "learning_rate": 8.475982748092395e-06, + "loss": 0.3257, + "step": 3588 + }, + { + "epoch": 1.714891583537423, + "grad_norm": 0.4716246559506084, + "learning_rate": 8.470777771860009e-06, + "loss": 0.3108, + "step": 3589 + }, + { + "epoch": 1.7153694522429963, + "grad_norm": 0.4544478697048045, + "learning_rate": 8.465573219806893e-06, + "loss": 0.3144, + "step": 3590 + }, + { + "epoch": 1.7158473209485694, + "grad_norm": 0.48108714707741906, + "learning_rate": 8.460369093376698e-06, + "loss": 0.2957, + "step": 3591 + }, + { + "epoch": 1.7163251896541425, + "grad_norm": 0.8635235335475422, + "learning_rate": 8.455165394012962e-06, + "loss": 0.3218, + "step": 3592 + }, + { + "epoch": 1.7168030583597158, + "grad_norm": 0.5054277355892234, + "learning_rate": 8.449962123159092e-06, + "loss": 0.3072, + "step": 3593 + }, + { + "epoch": 1.7172809270652887, + "grad_norm": 0.4829413421685776, + "learning_rate": 8.444759282258387e-06, + "loss": 0.3052, + "step": 3594 + }, + { + "epoch": 1.717758795770862, + "grad_norm": 0.4494055048503989, + "learning_rate": 8.439556872754025e-06, + "loss": 0.3111, + "step": 3595 + }, + { + "epoch": 1.718236664476435, + "grad_norm": 0.4736205047237309, + "learning_rate": 8.434354896089058e-06, + "loss": 0.3211, + "step": 3596 + }, + { + "epoch": 1.7187145331820082, + "grad_norm": 0.47081905982842837, + "learning_rate": 8.429153353706427e-06, + "loss": 0.3341, + "step": 3597 + }, + { + "epoch": 1.7191924018875815, + "grad_norm": 0.4748503377753521, + "learning_rate": 8.423952247048948e-06, + "loss": 0.3215, + "step": 3598 + }, + { + "epoch": 1.7196702705931546, + "grad_norm": 0.4944887369523441, + "learning_rate": 8.418751577559313e-06, + "loss": 0.3099, + "step": 3599 + }, + { + "epoch": 1.7201481392987277, + "grad_norm": 0.47254363092097545, + "learning_rate": 8.413551346680095e-06, + "loss": 0.3067, + "step": 3600 + }, + { + "epoch": 1.7206260080043008, + "grad_norm": 0.47647885914472093, + "learning_rate": 8.40835155585375e-06, + "loss": 0.3129, + "step": 3601 + }, + { + "epoch": 1.7211038767098739, + "grad_norm": 0.48835317263293404, + "learning_rate": 8.403152206522607e-06, + "loss": 0.3107, + "step": 3602 + }, + { + "epoch": 1.7215817454154472, + "grad_norm": 0.4596088980636602, + "learning_rate": 8.397953300128879e-06, + "loss": 0.3157, + "step": 3603 + }, + { + "epoch": 1.7220596141210203, + "grad_norm": 0.49182267500267884, + "learning_rate": 8.39275483811464e-06, + "loss": 0.3254, + "step": 3604 + }, + { + "epoch": 1.7225374828265934, + "grad_norm": 0.48328627094269644, + "learning_rate": 8.38755682192186e-06, + "loss": 0.3112, + "step": 3605 + }, + { + "epoch": 1.7230153515321667, + "grad_norm": 0.47304365223535483, + "learning_rate": 8.382359252992377e-06, + "loss": 0.3069, + "step": 3606 + }, + { + "epoch": 1.7234932202377395, + "grad_norm": 0.4813058108065857, + "learning_rate": 8.3771621327679e-06, + "loss": 0.3198, + "step": 3607 + }, + { + "epoch": 1.7239710889433129, + "grad_norm": 0.47699476225663495, + "learning_rate": 8.371965462690021e-06, + "loss": 0.32, + "step": 3608 + }, + { + "epoch": 1.724448957648886, + "grad_norm": 0.48478741000916475, + "learning_rate": 8.366769244200206e-06, + "loss": 0.3123, + "step": 3609 + }, + { + "epoch": 1.724926826354459, + "grad_norm": 0.4912926079455751, + "learning_rate": 8.36157347873979e-06, + "loss": 0.3116, + "step": 3610 + }, + { + "epoch": 1.7254046950600324, + "grad_norm": 0.5174639765845415, + "learning_rate": 8.356378167749993e-06, + "loss": 0.298, + "step": 3611 + }, + { + "epoch": 1.7258825637656054, + "grad_norm": 0.4793465538100503, + "learning_rate": 8.351183312671898e-06, + "loss": 0.31, + "step": 3612 + }, + { + "epoch": 1.7263604324711785, + "grad_norm": 0.4814025970072797, + "learning_rate": 8.345988914946467e-06, + "loss": 0.3103, + "step": 3613 + }, + { + "epoch": 1.7268383011767516, + "grad_norm": 0.46966946318183, + "learning_rate": 8.340794976014535e-06, + "loss": 0.3055, + "step": 3614 + }, + { + "epoch": 1.7273161698823247, + "grad_norm": 0.48421924623191237, + "learning_rate": 8.335601497316809e-06, + "loss": 0.3147, + "step": 3615 + }, + { + "epoch": 1.727794038587898, + "grad_norm": 0.47415453106644484, + "learning_rate": 8.330408480293867e-06, + "loss": 0.3189, + "step": 3616 + }, + { + "epoch": 1.7282719072934711, + "grad_norm": 0.5311247241943203, + "learning_rate": 8.325215926386164e-06, + "loss": 0.3081, + "step": 3617 + }, + { + "epoch": 1.7287497759990442, + "grad_norm": 0.45880552685400533, + "learning_rate": 8.32002383703402e-06, + "loss": 0.3079, + "step": 3618 + }, + { + "epoch": 1.7292276447046175, + "grad_norm": 0.7527896953526857, + "learning_rate": 8.314832213677627e-06, + "loss": 0.3276, + "step": 3619 + }, + { + "epoch": 1.7297055134101904, + "grad_norm": 0.47577023953198666, + "learning_rate": 8.309641057757052e-06, + "loss": 0.3193, + "step": 3620 + }, + { + "epoch": 1.7301833821157637, + "grad_norm": 0.4600219319465241, + "learning_rate": 8.304450370712234e-06, + "loss": 0.3247, + "step": 3621 + }, + { + "epoch": 1.7306612508213368, + "grad_norm": 0.49059880424437236, + "learning_rate": 8.299260153982969e-06, + "loss": 0.3199, + "step": 3622 + }, + { + "epoch": 1.73113911952691, + "grad_norm": 0.4882276209446351, + "learning_rate": 8.29407040900894e-06, + "loss": 0.3041, + "step": 3623 + }, + { + "epoch": 1.7316169882324832, + "grad_norm": 0.44643071797150663, + "learning_rate": 8.288881137229687e-06, + "loss": 0.3095, + "step": 3624 + }, + { + "epoch": 1.7320948569380563, + "grad_norm": 0.46790133259886285, + "learning_rate": 8.283692340084623e-06, + "loss": 0.3146, + "step": 3625 + }, + { + "epoch": 1.7325727256436294, + "grad_norm": 0.48231263905937954, + "learning_rate": 8.278504019013026e-06, + "loss": 0.3126, + "step": 3626 + }, + { + "epoch": 1.7330505943492027, + "grad_norm": 0.4774119989895905, + "learning_rate": 8.273316175454052e-06, + "loss": 0.3261, + "step": 3627 + }, + { + "epoch": 1.7335284630547756, + "grad_norm": 0.45700207528874304, + "learning_rate": 8.26812881084671e-06, + "loss": 0.3016, + "step": 3628 + }, + { + "epoch": 1.734006331760349, + "grad_norm": 0.8002514747178684, + "learning_rate": 8.262941926629888e-06, + "loss": 0.311, + "step": 3629 + }, + { + "epoch": 1.734484200465922, + "grad_norm": 0.4762774044807064, + "learning_rate": 8.257755524242333e-06, + "loss": 0.3037, + "step": 3630 + }, + { + "epoch": 1.734962069171495, + "grad_norm": 0.4694744914517388, + "learning_rate": 8.252569605122662e-06, + "loss": 0.3129, + "step": 3631 + }, + { + "epoch": 1.7354399378770684, + "grad_norm": 0.4991347812581403, + "learning_rate": 8.24738417070936e-06, + "loss": 0.3395, + "step": 3632 + }, + { + "epoch": 1.7359178065826413, + "grad_norm": 0.504295162908251, + "learning_rate": 8.242199222440773e-06, + "loss": 0.3125, + "step": 3633 + }, + { + "epoch": 1.7363956752882146, + "grad_norm": 0.6877943879571605, + "learning_rate": 8.237014761755116e-06, + "loss": 0.3096, + "step": 3634 + }, + { + "epoch": 1.7368735439937877, + "grad_norm": 0.4672144021586577, + "learning_rate": 8.231830790090461e-06, + "loss": 0.3082, + "step": 3635 + }, + { + "epoch": 1.7373514126993608, + "grad_norm": 0.44504887155749057, + "learning_rate": 8.226647308884756e-06, + "loss": 0.3228, + "step": 3636 + }, + { + "epoch": 1.737829281404934, + "grad_norm": 0.4640809118349916, + "learning_rate": 8.221464319575808e-06, + "loss": 0.3251, + "step": 3637 + }, + { + "epoch": 1.7383071501105072, + "grad_norm": 0.46673343701161357, + "learning_rate": 8.216281823601286e-06, + "loss": 0.3146, + "step": 3638 + }, + { + "epoch": 1.7387850188160803, + "grad_norm": 0.4820552462095371, + "learning_rate": 8.211099822398721e-06, + "loss": 0.3118, + "step": 3639 + }, + { + "epoch": 1.7392628875216536, + "grad_norm": 0.49498233493294336, + "learning_rate": 8.205918317405508e-06, + "loss": 0.3249, + "step": 3640 + }, + { + "epoch": 1.7397407562272265, + "grad_norm": 0.48843945985854864, + "learning_rate": 8.20073731005891e-06, + "loss": 0.3164, + "step": 3641 + }, + { + "epoch": 1.7402186249327998, + "grad_norm": 0.4814877105665985, + "learning_rate": 8.195556801796041e-06, + "loss": 0.3267, + "step": 3642 + }, + { + "epoch": 1.7406964936383729, + "grad_norm": 0.4683984421194578, + "learning_rate": 8.190376794053886e-06, + "loss": 0.3048, + "step": 3643 + }, + { + "epoch": 1.741174362343946, + "grad_norm": 0.4713542128746563, + "learning_rate": 8.185197288269289e-06, + "loss": 0.3114, + "step": 3644 + }, + { + "epoch": 1.7416522310495193, + "grad_norm": 0.5033578491109322, + "learning_rate": 8.18001828587895e-06, + "loss": 0.3356, + "step": 3645 + }, + { + "epoch": 1.7421300997550921, + "grad_norm": 0.48757140141481903, + "learning_rate": 8.174839788319432e-06, + "loss": 0.2978, + "step": 3646 + }, + { + "epoch": 1.7426079684606655, + "grad_norm": 0.5198704975496817, + "learning_rate": 8.169661797027167e-06, + "loss": 0.3215, + "step": 3647 + }, + { + "epoch": 1.7430858371662385, + "grad_norm": 0.5077562055422693, + "learning_rate": 8.164484313438427e-06, + "loss": 0.3096, + "step": 3648 + }, + { + "epoch": 1.7435637058718116, + "grad_norm": 0.476581737348631, + "learning_rate": 8.159307338989364e-06, + "loss": 0.3266, + "step": 3649 + }, + { + "epoch": 1.744041574577385, + "grad_norm": 0.474540506795891, + "learning_rate": 8.154130875115978e-06, + "loss": 0.3041, + "step": 3650 + }, + { + "epoch": 1.744519443282958, + "grad_norm": 0.4801751996496636, + "learning_rate": 8.148954923254122e-06, + "loss": 0.3127, + "step": 3651 + }, + { + "epoch": 1.7449973119885311, + "grad_norm": 0.4852373646526146, + "learning_rate": 8.14377948483952e-06, + "loss": 0.3295, + "step": 3652 + }, + { + "epoch": 1.7454751806941045, + "grad_norm": 0.47761777592380195, + "learning_rate": 8.138604561307748e-06, + "loss": 0.3239, + "step": 3653 + }, + { + "epoch": 1.7459530493996773, + "grad_norm": 0.5234441445204633, + "learning_rate": 8.133430154094232e-06, + "loss": 0.3189, + "step": 3654 + }, + { + "epoch": 1.7464309181052506, + "grad_norm": 0.49427462068157, + "learning_rate": 8.12825626463427e-06, + "loss": 0.3126, + "step": 3655 + }, + { + "epoch": 1.7469087868108237, + "grad_norm": 0.4470937189715777, + "learning_rate": 8.123082894363001e-06, + "loss": 0.3129, + "step": 3656 + }, + { + "epoch": 1.7473866555163968, + "grad_norm": 0.5522576075572359, + "learning_rate": 8.11791004471543e-06, + "loss": 0.3012, + "step": 3657 + }, + { + "epoch": 1.7478645242219701, + "grad_norm": 0.49371091975472053, + "learning_rate": 8.11273771712641e-06, + "loss": 0.3192, + "step": 3658 + }, + { + "epoch": 1.748342392927543, + "grad_norm": 0.45307522023914715, + "learning_rate": 8.10756591303066e-06, + "loss": 0.3204, + "step": 3659 + }, + { + "epoch": 1.7488202616331163, + "grad_norm": 0.46977495936479996, + "learning_rate": 8.102394633862743e-06, + "loss": 0.3327, + "step": 3660 + }, + { + "epoch": 1.7492981303386894, + "grad_norm": 0.49827574100114275, + "learning_rate": 8.097223881057079e-06, + "loss": 0.3308, + "step": 3661 + }, + { + "epoch": 1.7497759990442625, + "grad_norm": 0.46928949793764024, + "learning_rate": 8.09205365604795e-06, + "loss": 0.3085, + "step": 3662 + }, + { + "epoch": 1.7502538677498358, + "grad_norm": 0.46261108160255604, + "learning_rate": 8.086883960269477e-06, + "loss": 0.3281, + "step": 3663 + }, + { + "epoch": 1.750731736455409, + "grad_norm": 0.45709628622401643, + "learning_rate": 8.081714795155648e-06, + "loss": 0.3195, + "step": 3664 + }, + { + "epoch": 1.751209605160982, + "grad_norm": 0.5065744246500246, + "learning_rate": 8.0765461621403e-06, + "loss": 0.3302, + "step": 3665 + }, + { + "epoch": 1.7516874738665553, + "grad_norm": 0.46058472348695123, + "learning_rate": 8.071378062657114e-06, + "loss": 0.3293, + "step": 3666 + }, + { + "epoch": 1.7521653425721282, + "grad_norm": 0.5009271488564606, + "learning_rate": 8.066210498139632e-06, + "loss": 0.3049, + "step": 3667 + }, + { + "epoch": 1.7526432112777015, + "grad_norm": 0.5227592677426378, + "learning_rate": 8.061043470021251e-06, + "loss": 0.3135, + "step": 3668 + }, + { + "epoch": 1.7531210799832746, + "grad_norm": 0.5018329300564572, + "learning_rate": 8.055876979735203e-06, + "loss": 0.3071, + "step": 3669 + }, + { + "epoch": 1.7535989486888477, + "grad_norm": 0.4680917060285861, + "learning_rate": 8.050711028714589e-06, + "loss": 0.2993, + "step": 3670 + }, + { + "epoch": 1.754076817394421, + "grad_norm": 0.5167520768333849, + "learning_rate": 8.045545618392352e-06, + "loss": 0.3084, + "step": 3671 + }, + { + "epoch": 1.7545546860999939, + "grad_norm": 0.4654107230776164, + "learning_rate": 8.040380750201286e-06, + "loss": 0.3149, + "step": 3672 + }, + { + "epoch": 1.7550325548055672, + "grad_norm": 0.5273472331111888, + "learning_rate": 8.035216425574031e-06, + "loss": 0.3195, + "step": 3673 + }, + { + "epoch": 1.7555104235111403, + "grad_norm": 0.4671145108043513, + "learning_rate": 8.03005264594308e-06, + "loss": 0.3249, + "step": 3674 + }, + { + "epoch": 1.7559882922167134, + "grad_norm": 0.6794461125069341, + "learning_rate": 8.02488941274078e-06, + "loss": 0.3101, + "step": 3675 + }, + { + "epoch": 1.7564661609222867, + "grad_norm": 0.46888345908111245, + "learning_rate": 8.019726727399313e-06, + "loss": 0.3028, + "step": 3676 + }, + { + "epoch": 1.7569440296278598, + "grad_norm": 0.4913169627797939, + "learning_rate": 8.01456459135072e-06, + "loss": 0.3357, + "step": 3677 + }, + { + "epoch": 1.7574218983334329, + "grad_norm": 0.475145978227504, + "learning_rate": 8.00940300602689e-06, + "loss": 0.3149, + "step": 3678 + }, + { + "epoch": 1.7578997670390062, + "grad_norm": 0.44803279971539967, + "learning_rate": 8.004241972859552e-06, + "loss": 0.3347, + "step": 3679 + }, + { + "epoch": 1.758377635744579, + "grad_norm": 0.4787438847451073, + "learning_rate": 7.999081493280283e-06, + "loss": 0.3168, + "step": 3680 + }, + { + "epoch": 1.7588555044501524, + "grad_norm": 0.47005813353883635, + "learning_rate": 7.993921568720515e-06, + "loss": 0.3322, + "step": 3681 + }, + { + "epoch": 1.7593333731557255, + "grad_norm": 0.4661035524219349, + "learning_rate": 7.988762200611517e-06, + "loss": 0.322, + "step": 3682 + }, + { + "epoch": 1.7598112418612986, + "grad_norm": 0.45866488888688306, + "learning_rate": 7.983603390384405e-06, + "loss": 0.3227, + "step": 3683 + }, + { + "epoch": 1.7602891105668719, + "grad_norm": 0.5137297024673525, + "learning_rate": 7.978445139470147e-06, + "loss": 0.323, + "step": 3684 + }, + { + "epoch": 1.7607669792724447, + "grad_norm": 0.9472143703913082, + "learning_rate": 7.973287449299545e-06, + "loss": 0.3266, + "step": 3685 + }, + { + "epoch": 1.761244847978018, + "grad_norm": 0.46102852447781484, + "learning_rate": 7.968130321303254e-06, + "loss": 0.3103, + "step": 3686 + }, + { + "epoch": 1.7617227166835912, + "grad_norm": 0.5664763306205914, + "learning_rate": 7.962973756911773e-06, + "loss": 0.3173, + "step": 3687 + }, + { + "epoch": 1.7622005853891642, + "grad_norm": 0.48417061898824687, + "learning_rate": 7.957817757555438e-06, + "loss": 0.3044, + "step": 3688 + }, + { + "epoch": 1.7626784540947376, + "grad_norm": 0.4891312388302578, + "learning_rate": 7.95266232466443e-06, + "loss": 0.3075, + "step": 3689 + }, + { + "epoch": 1.7631563228003106, + "grad_norm": 0.4839119830817586, + "learning_rate": 7.947507459668784e-06, + "loss": 0.3227, + "step": 3690 + }, + { + "epoch": 1.7636341915058837, + "grad_norm": 0.5122896682596106, + "learning_rate": 7.94235316399836e-06, + "loss": 0.3229, + "step": 3691 + }, + { + "epoch": 1.764112060211457, + "grad_norm": 0.4895385220427247, + "learning_rate": 7.937199439082874e-06, + "loss": 0.2932, + "step": 3692 + }, + { + "epoch": 1.76458992891703, + "grad_norm": 0.47779205504554345, + "learning_rate": 7.932046286351877e-06, + "loss": 0.3098, + "step": 3693 + }, + { + "epoch": 1.7650677976226032, + "grad_norm": 0.46886659878011405, + "learning_rate": 7.92689370723476e-06, + "loss": 0.3195, + "step": 3694 + }, + { + "epoch": 1.7655456663281763, + "grad_norm": 0.47588844827515836, + "learning_rate": 7.921741703160758e-06, + "loss": 0.3178, + "step": 3695 + }, + { + "epoch": 1.7660235350337494, + "grad_norm": 0.48249496323108887, + "learning_rate": 7.916590275558953e-06, + "loss": 0.3135, + "step": 3696 + }, + { + "epoch": 1.7665014037393227, + "grad_norm": 0.5528175204761951, + "learning_rate": 7.91143942585825e-06, + "loss": 0.3119, + "step": 3697 + }, + { + "epoch": 1.7669792724448956, + "grad_norm": 0.49703375415055845, + "learning_rate": 7.90628915548741e-06, + "loss": 0.319, + "step": 3698 + }, + { + "epoch": 1.767457141150469, + "grad_norm": 0.4797540216428928, + "learning_rate": 7.901139465875029e-06, + "loss": 0.3216, + "step": 3699 + }, + { + "epoch": 1.767935009856042, + "grad_norm": 0.4685398215123699, + "learning_rate": 7.895990358449533e-06, + "loss": 0.2973, + "step": 3700 + }, + { + "epoch": 1.768412878561615, + "grad_norm": 0.47072717607559234, + "learning_rate": 7.890841834639198e-06, + "loss": 0.3204, + "step": 3701 + }, + { + "epoch": 1.7688907472671884, + "grad_norm": 0.47672448748601165, + "learning_rate": 7.88569389587213e-06, + "loss": 0.3201, + "step": 3702 + }, + { + "epoch": 1.7693686159727615, + "grad_norm": 0.4417548032639278, + "learning_rate": 7.880546543576283e-06, + "loss": 0.3278, + "step": 3703 + }, + { + "epoch": 1.7698464846783346, + "grad_norm": 0.5159968343841332, + "learning_rate": 7.875399779179442e-06, + "loss": 0.3204, + "step": 3704 + }, + { + "epoch": 1.770324353383908, + "grad_norm": 0.5117769103897961, + "learning_rate": 7.87025360410922e-06, + "loss": 0.3129, + "step": 3705 + }, + { + "epoch": 1.7708022220894808, + "grad_norm": 0.514433283638377, + "learning_rate": 7.865108019793082e-06, + "loss": 0.3174, + "step": 3706 + }, + { + "epoch": 1.771280090795054, + "grad_norm": 0.58396258202816, + "learning_rate": 7.859963027658322e-06, + "loss": 0.3238, + "step": 3707 + }, + { + "epoch": 1.7717579595006272, + "grad_norm": 0.4542920457799816, + "learning_rate": 7.854818629132065e-06, + "loss": 0.3169, + "step": 3708 + }, + { + "epoch": 1.7722358282062003, + "grad_norm": 0.49838845584475117, + "learning_rate": 7.849674825641282e-06, + "loss": 0.323, + "step": 3709 + }, + { + "epoch": 1.7727136969117736, + "grad_norm": 0.4707270211274528, + "learning_rate": 7.844531618612772e-06, + "loss": 0.3196, + "step": 3710 + }, + { + "epoch": 1.7731915656173465, + "grad_norm": 0.45205689040938446, + "learning_rate": 7.839389009473167e-06, + "loss": 0.3276, + "step": 3711 + }, + { + "epoch": 1.7736694343229198, + "grad_norm": 0.4717638415399161, + "learning_rate": 7.83424699964894e-06, + "loss": 0.3403, + "step": 3712 + }, + { + "epoch": 1.7741473030284929, + "grad_norm": 0.5086223507316545, + "learning_rate": 7.829105590566393e-06, + "loss": 0.3124, + "step": 3713 + }, + { + "epoch": 1.774625171734066, + "grad_norm": 0.49314322583865455, + "learning_rate": 7.823964783651659e-06, + "loss": 0.3089, + "step": 3714 + }, + { + "epoch": 1.7751030404396393, + "grad_norm": 0.4623851784080253, + "learning_rate": 7.81882458033071e-06, + "loss": 0.3048, + "step": 3715 + }, + { + "epoch": 1.7755809091452124, + "grad_norm": 0.5172646679609034, + "learning_rate": 7.813684982029347e-06, + "loss": 0.3164, + "step": 3716 + }, + { + "epoch": 1.7760587778507855, + "grad_norm": 0.4947812594178152, + "learning_rate": 7.808545990173204e-06, + "loss": 0.3134, + "step": 3717 + }, + { + "epoch": 1.7765366465563588, + "grad_norm": 0.4779432224978824, + "learning_rate": 7.803407606187748e-06, + "loss": 0.3098, + "step": 3718 + }, + { + "epoch": 1.7770145152619317, + "grad_norm": 0.5012122088224422, + "learning_rate": 7.798269831498275e-06, + "loss": 0.3096, + "step": 3719 + }, + { + "epoch": 1.777492383967505, + "grad_norm": 0.47084003336822555, + "learning_rate": 7.79313266752991e-06, + "loss": 0.312, + "step": 3720 + }, + { + "epoch": 1.777970252673078, + "grad_norm": 0.47286383043551417, + "learning_rate": 7.787996115707617e-06, + "loss": 0.3197, + "step": 3721 + }, + { + "epoch": 1.7784481213786512, + "grad_norm": 0.4780990261307311, + "learning_rate": 7.782860177456183e-06, + "loss": 0.3041, + "step": 3722 + }, + { + "epoch": 1.7789259900842245, + "grad_norm": 0.5239288813334391, + "learning_rate": 7.777724854200224e-06, + "loss": 0.3107, + "step": 3723 + }, + { + "epoch": 1.7794038587897973, + "grad_norm": 0.47039194395335954, + "learning_rate": 7.77259014736419e-06, + "loss": 0.3097, + "step": 3724 + }, + { + "epoch": 1.7798817274953707, + "grad_norm": 0.45970590084025953, + "learning_rate": 7.767456058372362e-06, + "loss": 0.3211, + "step": 3725 + }, + { + "epoch": 1.7803595962009438, + "grad_norm": 0.4734707205616713, + "learning_rate": 7.762322588648839e-06, + "loss": 0.3074, + "step": 3726 + }, + { + "epoch": 1.7808374649065168, + "grad_norm": 0.5154833143648376, + "learning_rate": 7.757189739617556e-06, + "loss": 0.3048, + "step": 3727 + }, + { + "epoch": 1.7813153336120902, + "grad_norm": 0.47648033493201797, + "learning_rate": 7.75205751270228e-06, + "loss": 0.325, + "step": 3728 + }, + { + "epoch": 1.7817932023176632, + "grad_norm": 0.46731827189098973, + "learning_rate": 7.746925909326597e-06, + "loss": 0.3175, + "step": 3729 + }, + { + "epoch": 1.7822710710232363, + "grad_norm": 0.4600389521064629, + "learning_rate": 7.741794930913922e-06, + "loss": 0.3058, + "step": 3730 + }, + { + "epoch": 1.7827489397288097, + "grad_norm": 0.4471951076152281, + "learning_rate": 7.7366645788875e-06, + "loss": 0.3226, + "step": 3731 + }, + { + "epoch": 1.7832268084343825, + "grad_norm": 0.45824019393467236, + "learning_rate": 7.731534854670398e-06, + "loss": 0.3184, + "step": 3732 + }, + { + "epoch": 1.7837046771399558, + "grad_norm": 0.5039947063633161, + "learning_rate": 7.726405759685512e-06, + "loss": 0.3294, + "step": 3733 + }, + { + "epoch": 1.784182545845529, + "grad_norm": 0.48151111030694355, + "learning_rate": 7.721277295355566e-06, + "loss": 0.3135, + "step": 3734 + }, + { + "epoch": 1.784660414551102, + "grad_norm": 0.47276411179310734, + "learning_rate": 7.7161494631031e-06, + "loss": 0.3087, + "step": 3735 + }, + { + "epoch": 1.7851382832566753, + "grad_norm": 0.5222088309394991, + "learning_rate": 7.711022264350483e-06, + "loss": 0.3032, + "step": 3736 + }, + { + "epoch": 1.7856161519622482, + "grad_norm": 0.45961959884253906, + "learning_rate": 7.705895700519915e-06, + "loss": 0.324, + "step": 3737 + }, + { + "epoch": 1.7860940206678215, + "grad_norm": 0.45367078593289256, + "learning_rate": 7.700769773033414e-06, + "loss": 0.312, + "step": 3738 + }, + { + "epoch": 1.7865718893733946, + "grad_norm": 0.48592430357796496, + "learning_rate": 7.69564448331282e-06, + "loss": 0.3064, + "step": 3739 + }, + { + "epoch": 1.7870497580789677, + "grad_norm": 0.49401805489694345, + "learning_rate": 7.690519832779799e-06, + "loss": 0.3158, + "step": 3740 + }, + { + "epoch": 1.787527626784541, + "grad_norm": 0.4920841853592163, + "learning_rate": 7.685395822855837e-06, + "loss": 0.3193, + "step": 3741 + }, + { + "epoch": 1.7880054954901141, + "grad_norm": 0.48732165094819957, + "learning_rate": 7.680272454962245e-06, + "loss": 0.3163, + "step": 3742 + }, + { + "epoch": 1.7884833641956872, + "grad_norm": 0.49308527781981676, + "learning_rate": 7.675149730520155e-06, + "loss": 0.3173, + "step": 3743 + }, + { + "epoch": 1.7889612329012605, + "grad_norm": 0.48955038846140736, + "learning_rate": 7.670027650950519e-06, + "loss": 0.3266, + "step": 3744 + }, + { + "epoch": 1.7894391016068334, + "grad_norm": 0.4710450744507011, + "learning_rate": 7.664906217674115e-06, + "loss": 0.309, + "step": 3745 + }, + { + "epoch": 1.7899169703124067, + "grad_norm": 0.4738846393248144, + "learning_rate": 7.659785432111533e-06, + "loss": 0.3105, + "step": 3746 + }, + { + "epoch": 1.7903948390179798, + "grad_norm": 0.48221267868714024, + "learning_rate": 7.654665295683192e-06, + "loss": 0.3188, + "step": 3747 + }, + { + "epoch": 1.790872707723553, + "grad_norm": 0.46375474845886444, + "learning_rate": 7.649545809809329e-06, + "loss": 0.2975, + "step": 3748 + }, + { + "epoch": 1.7913505764291262, + "grad_norm": 0.47721809669352233, + "learning_rate": 7.644426975909995e-06, + "loss": 0.3113, + "step": 3749 + }, + { + "epoch": 1.791828445134699, + "grad_norm": 4.805146189592445, + "learning_rate": 7.639308795405066e-06, + "loss": 0.3143, + "step": 3750 + }, + { + "epoch": 1.7923063138402724, + "grad_norm": 0.5455951254365826, + "learning_rate": 7.634191269714238e-06, + "loss": 0.2981, + "step": 3751 + }, + { + "epoch": 1.7927841825458455, + "grad_norm": 0.4744407650385994, + "learning_rate": 7.6290744002570176e-06, + "loss": 0.3249, + "step": 3752 + }, + { + "epoch": 1.7932620512514186, + "grad_norm": 0.45267368874034103, + "learning_rate": 7.6239581884527354e-06, + "loss": 0.3169, + "step": 3753 + }, + { + "epoch": 1.793739919956992, + "grad_norm": 0.46081369771470404, + "learning_rate": 7.618842635720542e-06, + "loss": 0.3122, + "step": 3754 + }, + { + "epoch": 1.794217788662565, + "grad_norm": 0.46409053301581027, + "learning_rate": 7.613727743479395e-06, + "loss": 0.317, + "step": 3755 + }, + { + "epoch": 1.794695657368138, + "grad_norm": 0.45315984581264734, + "learning_rate": 7.608613513148081e-06, + "loss": 0.3253, + "step": 3756 + }, + { + "epoch": 1.7951735260737114, + "grad_norm": 0.44114117490408994, + "learning_rate": 7.6034999461451956e-06, + "loss": 0.3272, + "step": 3757 + }, + { + "epoch": 1.7956513947792843, + "grad_norm": 1.5565443010930862, + "learning_rate": 7.5983870438891505e-06, + "loss": 0.3197, + "step": 3758 + }, + { + "epoch": 1.7961292634848576, + "grad_norm": 0.4911118507351176, + "learning_rate": 7.593274807798175e-06, + "loss": 0.3245, + "step": 3759 + }, + { + "epoch": 1.7966071321904307, + "grad_norm": 0.44995132281699324, + "learning_rate": 7.588163239290316e-06, + "loss": 0.3186, + "step": 3760 + }, + { + "epoch": 1.7970850008960038, + "grad_norm": 0.46456788831578155, + "learning_rate": 7.583052339783428e-06, + "loss": 0.3172, + "step": 3761 + }, + { + "epoch": 1.797562869601577, + "grad_norm": 0.4734482643803076, + "learning_rate": 7.5779421106951874e-06, + "loss": 0.3086, + "step": 3762 + }, + { + "epoch": 1.7980407383071502, + "grad_norm": 0.46205250286552624, + "learning_rate": 7.572832553443083e-06, + "loss": 0.309, + "step": 3763 + }, + { + "epoch": 1.7985186070127233, + "grad_norm": 0.5474167526568322, + "learning_rate": 7.567723669444411e-06, + "loss": 0.3263, + "step": 3764 + }, + { + "epoch": 1.7989964757182964, + "grad_norm": 0.4807279141914673, + "learning_rate": 7.562615460116289e-06, + "loss": 0.3114, + "step": 3765 + }, + { + "epoch": 1.7994743444238694, + "grad_norm": 0.47642414056800825, + "learning_rate": 7.557507926875646e-06, + "loss": 0.3141, + "step": 3766 + }, + { + "epoch": 1.7999522131294428, + "grad_norm": 0.44512012977708126, + "learning_rate": 7.552401071139217e-06, + "loss": 0.3202, + "step": 3767 + }, + { + "epoch": 1.8004300818350158, + "grad_norm": 0.48382096788679013, + "learning_rate": 7.547294894323556e-06, + "loss": 0.3183, + "step": 3768 + }, + { + "epoch": 1.800907950540589, + "grad_norm": 0.4631932373502014, + "learning_rate": 7.542189397845028e-06, + "loss": 0.3163, + "step": 3769 + }, + { + "epoch": 1.8013858192461623, + "grad_norm": 0.4661580950852657, + "learning_rate": 7.537084583119802e-06, + "loss": 0.3009, + "step": 3770 + }, + { + "epoch": 1.8018636879517351, + "grad_norm": 0.44890347173646833, + "learning_rate": 7.531980451563869e-06, + "loss": 0.324, + "step": 3771 + }, + { + "epoch": 1.8023415566573084, + "grad_norm": 0.46211335360678135, + "learning_rate": 7.5268770045930255e-06, + "loss": 0.3063, + "step": 3772 + }, + { + "epoch": 1.8028194253628815, + "grad_norm": 0.47324725173384913, + "learning_rate": 7.521774243622875e-06, + "loss": 0.3232, + "step": 3773 + }, + { + "epoch": 1.8032972940684546, + "grad_norm": 0.4769086546795592, + "learning_rate": 7.516672170068835e-06, + "loss": 0.3081, + "step": 3774 + }, + { + "epoch": 1.803775162774028, + "grad_norm": 0.4987810815938432, + "learning_rate": 7.511570785346129e-06, + "loss": 0.3331, + "step": 3775 + }, + { + "epoch": 1.804253031479601, + "grad_norm": 0.515528221050649, + "learning_rate": 7.506470090869793e-06, + "loss": 0.3081, + "step": 3776 + }, + { + "epoch": 1.8047309001851741, + "grad_norm": 0.48258513179828666, + "learning_rate": 7.501370088054667e-06, + "loss": 0.3253, + "step": 3777 + }, + { + "epoch": 1.8052087688907472, + "grad_norm": 0.4687979828089521, + "learning_rate": 7.496270778315404e-06, + "loss": 0.3038, + "step": 3778 + }, + { + "epoch": 1.8056866375963203, + "grad_norm": 0.4675700871564693, + "learning_rate": 7.4911721630664644e-06, + "loss": 0.3131, + "step": 3779 + }, + { + "epoch": 1.8061645063018936, + "grad_norm": 0.4488321146024742, + "learning_rate": 7.486074243722109e-06, + "loss": 0.3097, + "step": 3780 + }, + { + "epoch": 1.8066423750074667, + "grad_norm": 0.4731548586712247, + "learning_rate": 7.480977021696414e-06, + "loss": 0.3036, + "step": 3781 + }, + { + "epoch": 1.8071202437130398, + "grad_norm": 0.5469397778562964, + "learning_rate": 7.475880498403261e-06, + "loss": 0.32, + "step": 3782 + }, + { + "epoch": 1.8075981124186131, + "grad_norm": 0.45764608523635797, + "learning_rate": 7.470784675256329e-06, + "loss": 0.3237, + "step": 3783 + }, + { + "epoch": 1.808075981124186, + "grad_norm": 0.4850459958523287, + "learning_rate": 7.4656895536691154e-06, + "loss": 0.3132, + "step": 3784 + }, + { + "epoch": 1.8085538498297593, + "grad_norm": 0.45406496016389747, + "learning_rate": 7.460595135054916e-06, + "loss": 0.3124, + "step": 3785 + }, + { + "epoch": 1.8090317185353324, + "grad_norm": 0.45950621992569934, + "learning_rate": 7.455501420826831e-06, + "loss": 0.3218, + "step": 3786 + }, + { + "epoch": 1.8095095872409055, + "grad_norm": 0.4591525516581785, + "learning_rate": 7.450408412397767e-06, + "loss": 0.3206, + "step": 3787 + }, + { + "epoch": 1.8099874559464788, + "grad_norm": 0.508129994813578, + "learning_rate": 7.445316111180436e-06, + "loss": 0.3265, + "step": 3788 + }, + { + "epoch": 1.810465324652052, + "grad_norm": 0.4487238152879312, + "learning_rate": 7.440224518587353e-06, + "loss": 0.317, + "step": 3789 + }, + { + "epoch": 1.810943193357625, + "grad_norm": 0.44328561901531993, + "learning_rate": 7.435133636030831e-06, + "loss": 0.3091, + "step": 3790 + }, + { + "epoch": 1.811421062063198, + "grad_norm": 0.4386069882959427, + "learning_rate": 7.430043464923e-06, + "loss": 0.33, + "step": 3791 + }, + { + "epoch": 1.8118989307687712, + "grad_norm": 0.4737344270055403, + "learning_rate": 7.424954006675775e-06, + "loss": 0.3075, + "step": 3792 + }, + { + "epoch": 1.8123767994743445, + "grad_norm": 0.46437704276405334, + "learning_rate": 7.419865262700887e-06, + "loss": 0.3163, + "step": 3793 + }, + { + "epoch": 1.8128546681799176, + "grad_norm": 0.4582800308955872, + "learning_rate": 7.414777234409863e-06, + "loss": 0.336, + "step": 3794 + }, + { + "epoch": 1.8133325368854907, + "grad_norm": 0.4639778390365935, + "learning_rate": 7.4096899232140295e-06, + "loss": 0.3253, + "step": 3795 + }, + { + "epoch": 1.813810405591064, + "grad_norm": 0.47191555510498095, + "learning_rate": 7.40460333052452e-06, + "loss": 0.3164, + "step": 3796 + }, + { + "epoch": 1.8142882742966369, + "grad_norm": 0.8754820249606956, + "learning_rate": 7.399517457752266e-06, + "loss": 0.3068, + "step": 3797 + }, + { + "epoch": 1.8147661430022102, + "grad_norm": 0.4647598211303201, + "learning_rate": 7.394432306307997e-06, + "loss": 0.3236, + "step": 3798 + }, + { + "epoch": 1.8152440117077833, + "grad_norm": 0.8061057528954818, + "learning_rate": 7.389347877602242e-06, + "loss": 0.3204, + "step": 3799 + }, + { + "epoch": 1.8157218804133564, + "grad_norm": 0.4832339467960314, + "learning_rate": 7.384264173045339e-06, + "loss": 0.3068, + "step": 3800 + }, + { + "epoch": 1.8161997491189297, + "grad_norm": 0.48537677126279627, + "learning_rate": 7.379181194047412e-06, + "loss": 0.3044, + "step": 3801 + }, + { + "epoch": 1.8166776178245028, + "grad_norm": 0.4370504478039206, + "learning_rate": 7.374098942018388e-06, + "loss": 0.3103, + "step": 3802 + }, + { + "epoch": 1.8171554865300759, + "grad_norm": 0.4540403617689952, + "learning_rate": 7.3690174183680015e-06, + "loss": 0.293, + "step": 3803 + }, + { + "epoch": 1.817633355235649, + "grad_norm": 0.44834138741434454, + "learning_rate": 7.363936624505767e-06, + "loss": 0.3038, + "step": 3804 + }, + { + "epoch": 1.818111223941222, + "grad_norm": 0.5071526227296054, + "learning_rate": 7.358856561841021e-06, + "loss": 0.33, + "step": 3805 + }, + { + "epoch": 1.8185890926467954, + "grad_norm": 0.45524356000949795, + "learning_rate": 7.353777231782873e-06, + "loss": 0.3037, + "step": 3806 + }, + { + "epoch": 1.8190669613523685, + "grad_norm": 0.4323954513417616, + "learning_rate": 7.3486986357402414e-06, + "loss": 0.2984, + "step": 3807 + }, + { + "epoch": 1.8195448300579415, + "grad_norm": 0.4748222174160758, + "learning_rate": 7.343620775121842e-06, + "loss": 0.3197, + "step": 3808 + }, + { + "epoch": 1.8200226987635149, + "grad_norm": 0.5434867730758338, + "learning_rate": 7.338543651336181e-06, + "loss": 0.3048, + "step": 3809 + }, + { + "epoch": 1.8205005674690877, + "grad_norm": 0.5261663632567056, + "learning_rate": 7.333467265791563e-06, + "loss": 0.3269, + "step": 3810 + }, + { + "epoch": 1.820978436174661, + "grad_norm": 0.4668224192467844, + "learning_rate": 7.328391619896092e-06, + "loss": 0.3256, + "step": 3811 + }, + { + "epoch": 1.8214563048802341, + "grad_norm": 0.47668206195798435, + "learning_rate": 7.3233167150576554e-06, + "loss": 0.3268, + "step": 3812 + }, + { + "epoch": 1.8219341735858072, + "grad_norm": 0.5070836849852632, + "learning_rate": 7.318242552683948e-06, + "loss": 0.321, + "step": 3813 + }, + { + "epoch": 1.8224120422913805, + "grad_norm": 0.494039037408748, + "learning_rate": 7.3131691341824515e-06, + "loss": 0.3083, + "step": 3814 + }, + { + "epoch": 1.8228899109969536, + "grad_norm": 0.4701617746326885, + "learning_rate": 7.308096460960441e-06, + "loss": 0.3036, + "step": 3815 + }, + { + "epoch": 1.8233677797025267, + "grad_norm": 0.48182454253938445, + "learning_rate": 7.303024534424987e-06, + "loss": 0.3191, + "step": 3816 + }, + { + "epoch": 1.8238456484081, + "grad_norm": 0.4968254147423399, + "learning_rate": 7.297953355982956e-06, + "loss": 0.3228, + "step": 3817 + }, + { + "epoch": 1.824323517113673, + "grad_norm": 0.48879138443250414, + "learning_rate": 7.292882927040999e-06, + "loss": 0.3083, + "step": 3818 + }, + { + "epoch": 1.8248013858192462, + "grad_norm": 0.47334912056885264, + "learning_rate": 7.287813249005565e-06, + "loss": 0.3111, + "step": 3819 + }, + { + "epoch": 1.8252792545248193, + "grad_norm": 0.4884790884097843, + "learning_rate": 7.282744323282895e-06, + "loss": 0.3256, + "step": 3820 + }, + { + "epoch": 1.8257571232303924, + "grad_norm": 0.5022837404885262, + "learning_rate": 7.277676151279019e-06, + "loss": 0.3076, + "step": 3821 + }, + { + "epoch": 1.8262349919359657, + "grad_norm": 0.47479766132527035, + "learning_rate": 7.272608734399754e-06, + "loss": 0.3033, + "step": 3822 + }, + { + "epoch": 1.8267128606415386, + "grad_norm": 1.3020109975876173, + "learning_rate": 7.26754207405072e-06, + "loss": 0.3079, + "step": 3823 + }, + { + "epoch": 1.827190729347112, + "grad_norm": 0.4919024454500918, + "learning_rate": 7.262476171637311e-06, + "loss": 0.3095, + "step": 3824 + }, + { + "epoch": 1.827668598052685, + "grad_norm": 0.44665992779272773, + "learning_rate": 7.2574110285647244e-06, + "loss": 0.3103, + "step": 3825 + }, + { + "epoch": 1.828146466758258, + "grad_norm": 0.44838920186117176, + "learning_rate": 7.252346646237942e-06, + "loss": 0.3092, + "step": 3826 + }, + { + "epoch": 1.8286243354638314, + "grad_norm": 0.7199341984258678, + "learning_rate": 7.24728302606173e-06, + "loss": 0.3233, + "step": 3827 + }, + { + "epoch": 1.8291022041694045, + "grad_norm": 0.4594979265995351, + "learning_rate": 7.242220169440649e-06, + "loss": 0.3203, + "step": 3828 + }, + { + "epoch": 1.8295800728749776, + "grad_norm": 0.46434411184296537, + "learning_rate": 7.2371580777790494e-06, + "loss": 0.3297, + "step": 3829 + }, + { + "epoch": 1.830057941580551, + "grad_norm": 0.4661421339585899, + "learning_rate": 7.232096752481061e-06, + "loss": 0.31, + "step": 3830 + }, + { + "epoch": 1.8305358102861238, + "grad_norm": 0.4796291545667761, + "learning_rate": 7.2270361949506075e-06, + "loss": 0.3196, + "step": 3831 + }, + { + "epoch": 1.831013678991697, + "grad_norm": 0.46983415973458353, + "learning_rate": 7.2219764065914024e-06, + "loss": 0.317, + "step": 3832 + }, + { + "epoch": 1.8314915476972702, + "grad_norm": 0.5194541122421759, + "learning_rate": 7.216917388806936e-06, + "loss": 0.323, + "step": 3833 + }, + { + "epoch": 1.8319694164028433, + "grad_norm": 0.483395653309714, + "learning_rate": 7.211859143000495e-06, + "loss": 0.3084, + "step": 3834 + }, + { + "epoch": 1.8324472851084166, + "grad_norm": 0.4802816137671634, + "learning_rate": 7.206801670575145e-06, + "loss": 0.3213, + "step": 3835 + }, + { + "epoch": 1.8329251538139895, + "grad_norm": 0.46676119411435246, + "learning_rate": 7.2017449729337396e-06, + "loss": 0.2915, + "step": 3836 + }, + { + "epoch": 1.8334030225195628, + "grad_norm": 0.4751615080956812, + "learning_rate": 7.196689051478917e-06, + "loss": 0.3172, + "step": 3837 + }, + { + "epoch": 1.8338808912251359, + "grad_norm": 0.4518355207103295, + "learning_rate": 7.191633907613103e-06, + "loss": 0.3037, + "step": 3838 + }, + { + "epoch": 1.834358759930709, + "grad_norm": 0.4964989199148571, + "learning_rate": 7.186579542738507e-06, + "loss": 0.3241, + "step": 3839 + }, + { + "epoch": 1.8348366286362823, + "grad_norm": 0.45654528244066295, + "learning_rate": 7.181525958257116e-06, + "loss": 0.3127, + "step": 3840 + }, + { + "epoch": 1.8353144973418554, + "grad_norm": 0.46918542594424667, + "learning_rate": 7.176473155570707e-06, + "loss": 0.3119, + "step": 3841 + }, + { + "epoch": 1.8357923660474285, + "grad_norm": 0.4507255821310943, + "learning_rate": 7.171421136080841e-06, + "loss": 0.3096, + "step": 3842 + }, + { + "epoch": 1.8362702347530018, + "grad_norm": 0.4345435682976803, + "learning_rate": 7.1663699011888524e-06, + "loss": 0.3327, + "step": 3843 + }, + { + "epoch": 1.8367481034585746, + "grad_norm": 0.46364207609499564, + "learning_rate": 7.1613194522958705e-06, + "loss": 0.3321, + "step": 3844 + }, + { + "epoch": 1.837225972164148, + "grad_norm": 1.3350277825716896, + "learning_rate": 7.156269790802801e-06, + "loss": 0.3215, + "step": 3845 + }, + { + "epoch": 1.837703840869721, + "grad_norm": 0.45482610525005635, + "learning_rate": 7.151220918110326e-06, + "loss": 0.3122, + "step": 3846 + }, + { + "epoch": 1.8381817095752941, + "grad_norm": 0.4799378572791614, + "learning_rate": 7.146172835618919e-06, + "loss": 0.3202, + "step": 3847 + }, + { + "epoch": 1.8386595782808675, + "grad_norm": 0.4560594548210913, + "learning_rate": 7.1411255447288266e-06, + "loss": 0.3178, + "step": 3848 + }, + { + "epoch": 1.8391374469864403, + "grad_norm": 0.42536499972455255, + "learning_rate": 7.136079046840078e-06, + "loss": 0.3098, + "step": 3849 + }, + { + "epoch": 1.8396153156920136, + "grad_norm": 0.4728281050415246, + "learning_rate": 7.131033343352483e-06, + "loss": 0.3162, + "step": 3850 + }, + { + "epoch": 1.8400931843975867, + "grad_norm": 0.45137859761216986, + "learning_rate": 7.125988435665632e-06, + "loss": 0.3258, + "step": 3851 + }, + { + "epoch": 1.8405710531031598, + "grad_norm": 0.4305757718398639, + "learning_rate": 7.120944325178889e-06, + "loss": 0.3034, + "step": 3852 + }, + { + "epoch": 1.8410489218087331, + "grad_norm": 0.46513614861758734, + "learning_rate": 7.1159010132914065e-06, + "loss": 0.3233, + "step": 3853 + }, + { + "epoch": 1.8415267905143062, + "grad_norm": 0.4516843311861331, + "learning_rate": 7.1108585014021095e-06, + "loss": 0.3123, + "step": 3854 + }, + { + "epoch": 1.8420046592198793, + "grad_norm": 0.44599301993670454, + "learning_rate": 7.105816790909699e-06, + "loss": 0.303, + "step": 3855 + }, + { + "epoch": 1.8424825279254526, + "grad_norm": 0.45407721798005035, + "learning_rate": 7.100775883212658e-06, + "loss": 0.2951, + "step": 3856 + }, + { + "epoch": 1.8429603966310255, + "grad_norm": 0.6330334384592656, + "learning_rate": 7.095735779709248e-06, + "loss": 0.3187, + "step": 3857 + }, + { + "epoch": 1.8434382653365988, + "grad_norm": 0.45481681430926496, + "learning_rate": 7.0906964817974984e-06, + "loss": 0.3068, + "step": 3858 + }, + { + "epoch": 1.843916134042172, + "grad_norm": 0.4552031236149239, + "learning_rate": 7.085657990875227e-06, + "loss": 0.3038, + "step": 3859 + }, + { + "epoch": 1.844394002747745, + "grad_norm": 0.4834807925984325, + "learning_rate": 7.080620308340024e-06, + "loss": 0.3047, + "step": 3860 + }, + { + "epoch": 1.8448718714533183, + "grad_norm": 0.46556812455338153, + "learning_rate": 7.075583435589248e-06, + "loss": 0.3085, + "step": 3861 + }, + { + "epoch": 1.8453497401588912, + "grad_norm": 0.43412355998395485, + "learning_rate": 7.07054737402004e-06, + "loss": 0.297, + "step": 3862 + }, + { + "epoch": 1.8458276088644645, + "grad_norm": 0.4346076047173381, + "learning_rate": 7.065512125029318e-06, + "loss": 0.3116, + "step": 3863 + }, + { + "epoch": 1.8463054775700376, + "grad_norm": 0.4604136030349345, + "learning_rate": 7.060477690013767e-06, + "loss": 0.3257, + "step": 3864 + }, + { + "epoch": 1.8467833462756107, + "grad_norm": 0.4900954658380756, + "learning_rate": 7.055444070369852e-06, + "loss": 0.3204, + "step": 3865 + }, + { + "epoch": 1.847261214981184, + "grad_norm": 0.4813520085061644, + "learning_rate": 7.050411267493815e-06, + "loss": 0.3139, + "step": 3866 + }, + { + "epoch": 1.847739083686757, + "grad_norm": 0.4567730976405721, + "learning_rate": 7.045379282781659e-06, + "loss": 0.3125, + "step": 3867 + }, + { + "epoch": 1.8482169523923302, + "grad_norm": 0.474550906247094, + "learning_rate": 7.040348117629172e-06, + "loss": 0.3149, + "step": 3868 + }, + { + "epoch": 1.8486948210979035, + "grad_norm": 0.4432230508973788, + "learning_rate": 7.035317773431911e-06, + "loss": 0.3055, + "step": 3869 + }, + { + "epoch": 1.8491726898034764, + "grad_norm": 0.5522942357232046, + "learning_rate": 7.0302882515852025e-06, + "loss": 0.3366, + "step": 3870 + }, + { + "epoch": 1.8496505585090497, + "grad_norm": 0.4752790460554777, + "learning_rate": 7.025259553484145e-06, + "loss": 0.3324, + "step": 3871 + }, + { + "epoch": 1.8501284272146228, + "grad_norm": 0.4643972132496278, + "learning_rate": 7.020231680523616e-06, + "loss": 0.3093, + "step": 3872 + }, + { + "epoch": 1.8506062959201959, + "grad_norm": 0.4554789111097097, + "learning_rate": 7.015204634098256e-06, + "loss": 0.3319, + "step": 3873 + }, + { + "epoch": 1.8510841646257692, + "grad_norm": 0.45867714400719684, + "learning_rate": 7.010178415602485e-06, + "loss": 0.3133, + "step": 3874 + }, + { + "epoch": 1.851562033331342, + "grad_norm": 0.45519094876743393, + "learning_rate": 7.005153026430476e-06, + "loss": 0.311, + "step": 3875 + }, + { + "epoch": 1.8520399020369154, + "grad_norm": 0.4456318199658115, + "learning_rate": 7.00012846797619e-06, + "loss": 0.3298, + "step": 3876 + }, + { + "epoch": 1.8525177707424885, + "grad_norm": 0.4877288837711452, + "learning_rate": 6.995104741633354e-06, + "loss": 0.302, + "step": 3877 + }, + { + "epoch": 1.8529956394480616, + "grad_norm": 0.45490188442789403, + "learning_rate": 6.990081848795453e-06, + "loss": 0.3225, + "step": 3878 + }, + { + "epoch": 1.8534735081536349, + "grad_norm": 0.46268349832494854, + "learning_rate": 6.985059790855755e-06, + "loss": 0.31, + "step": 3879 + }, + { + "epoch": 1.853951376859208, + "grad_norm": 0.47114477836168356, + "learning_rate": 6.980038569207291e-06, + "loss": 0.3215, + "step": 3880 + }, + { + "epoch": 1.854429245564781, + "grad_norm": 0.4627024705877978, + "learning_rate": 6.975018185242852e-06, + "loss": 0.322, + "step": 3881 + }, + { + "epoch": 1.8549071142703544, + "grad_norm": 0.4403644932273325, + "learning_rate": 6.969998640355011e-06, + "loss": 0.3075, + "step": 3882 + }, + { + "epoch": 1.8553849829759272, + "grad_norm": 0.4734619620341228, + "learning_rate": 6.9649799359361e-06, + "loss": 0.3153, + "step": 3883 + }, + { + "epoch": 1.8558628516815006, + "grad_norm": 0.45177620940516144, + "learning_rate": 6.959962073378216e-06, + "loss": 0.3014, + "step": 3884 + }, + { + "epoch": 1.8563407203870737, + "grad_norm": 0.4898115665474695, + "learning_rate": 6.954945054073228e-06, + "loss": 0.3159, + "step": 3885 + }, + { + "epoch": 1.8568185890926467, + "grad_norm": 0.49590283970632837, + "learning_rate": 6.949928879412768e-06, + "loss": 0.3122, + "step": 3886 + }, + { + "epoch": 1.85729645779822, + "grad_norm": 0.483933886138119, + "learning_rate": 6.944913550788235e-06, + "loss": 0.3134, + "step": 3887 + }, + { + "epoch": 1.857774326503793, + "grad_norm": 0.503142035037706, + "learning_rate": 6.939899069590791e-06, + "loss": 0.327, + "step": 3888 + }, + { + "epoch": 1.8582521952093662, + "grad_norm": 0.47557368760354807, + "learning_rate": 6.934885437211367e-06, + "loss": 0.3218, + "step": 3889 + }, + { + "epoch": 1.8587300639149393, + "grad_norm": 0.4620961694190902, + "learning_rate": 6.929872655040655e-06, + "loss": 0.2842, + "step": 3890 + }, + { + "epoch": 1.8592079326205124, + "grad_norm": 0.4673591975296846, + "learning_rate": 6.924860724469111e-06, + "loss": 0.315, + "step": 3891 + }, + { + "epoch": 1.8596858013260857, + "grad_norm": 0.46945354634024067, + "learning_rate": 6.9198496468869605e-06, + "loss": 0.3005, + "step": 3892 + }, + { + "epoch": 1.8601636700316588, + "grad_norm": 0.4851381015060144, + "learning_rate": 6.914839423684183e-06, + "loss": 0.3195, + "step": 3893 + }, + { + "epoch": 1.860641538737232, + "grad_norm": 0.5201132587365642, + "learning_rate": 6.909830056250527e-06, + "loss": 0.3026, + "step": 3894 + }, + { + "epoch": 1.8611194074428052, + "grad_norm": 0.5134723697086168, + "learning_rate": 6.904821545975507e-06, + "loss": 0.3245, + "step": 3895 + }, + { + "epoch": 1.861597276148378, + "grad_norm": 0.4518202226167905, + "learning_rate": 6.899813894248388e-06, + "loss": 0.3088, + "step": 3896 + }, + { + "epoch": 1.8620751448539514, + "grad_norm": 0.4683222226158601, + "learning_rate": 6.894807102458211e-06, + "loss": 0.3239, + "step": 3897 + }, + { + "epoch": 1.8625530135595245, + "grad_norm": 0.48007891114431994, + "learning_rate": 6.889801171993769e-06, + "loss": 0.2836, + "step": 3898 + }, + { + "epoch": 1.8630308822650976, + "grad_norm": 0.4772065271771579, + "learning_rate": 6.8847961042436185e-06, + "loss": 0.3176, + "step": 3899 + }, + { + "epoch": 1.863508750970671, + "grad_norm": 0.45177407887015353, + "learning_rate": 6.879791900596077e-06, + "loss": 0.3081, + "step": 3900 + }, + { + "epoch": 1.8639866196762438, + "grad_norm": 0.4729590065660319, + "learning_rate": 6.874788562439225e-06, + "loss": 0.3102, + "step": 3901 + }, + { + "epoch": 1.864464488381817, + "grad_norm": 0.49303248211065703, + "learning_rate": 6.869786091160895e-06, + "loss": 0.3371, + "step": 3902 + }, + { + "epoch": 1.8649423570873902, + "grad_norm": 0.4798724268373024, + "learning_rate": 6.864784488148688e-06, + "loss": 0.3172, + "step": 3903 + }, + { + "epoch": 1.8654202257929633, + "grad_norm": 0.4566461806579496, + "learning_rate": 6.859783754789962e-06, + "loss": 0.3159, + "step": 3904 + }, + { + "epoch": 1.8658980944985366, + "grad_norm": 0.45266075225313984, + "learning_rate": 6.854783892471823e-06, + "loss": 0.3336, + "step": 3905 + }, + { + "epoch": 1.8663759632041097, + "grad_norm": 0.47703082895473686, + "learning_rate": 6.849784902581158e-06, + "loss": 0.2831, + "step": 3906 + }, + { + "epoch": 1.8668538319096828, + "grad_norm": 0.5127241356341654, + "learning_rate": 6.8447867865045905e-06, + "loss": 0.3055, + "step": 3907 + }, + { + "epoch": 1.867331700615256, + "grad_norm": 0.46983255513156474, + "learning_rate": 6.83978954562851e-06, + "loss": 0.3053, + "step": 3908 + }, + { + "epoch": 1.867809569320829, + "grad_norm": 0.44944233352974683, + "learning_rate": 6.834793181339068e-06, + "loss": 0.312, + "step": 3909 + }, + { + "epoch": 1.8682874380264023, + "grad_norm": 0.48300744589768124, + "learning_rate": 6.829797695022163e-06, + "loss": 0.2879, + "step": 3910 + }, + { + "epoch": 1.8687653067319754, + "grad_norm": 0.49195148134310035, + "learning_rate": 6.824803088063454e-06, + "loss": 0.3086, + "step": 3911 + }, + { + "epoch": 1.8692431754375485, + "grad_norm": 0.48226235431323466, + "learning_rate": 6.819809361848362e-06, + "loss": 0.3161, + "step": 3912 + }, + { + "epoch": 1.8697210441431218, + "grad_norm": 0.4673032858019702, + "learning_rate": 6.814816517762053e-06, + "loss": 0.3009, + "step": 3913 + }, + { + "epoch": 1.8701989128486947, + "grad_norm": 0.5120888668650547, + "learning_rate": 6.809824557189456e-06, + "loss": 0.3047, + "step": 3914 + }, + { + "epoch": 1.870676781554268, + "grad_norm": 0.5188704505048501, + "learning_rate": 6.804833481515256e-06, + "loss": 0.2996, + "step": 3915 + }, + { + "epoch": 1.871154650259841, + "grad_norm": 0.4586738347773163, + "learning_rate": 6.799843292123883e-06, + "loss": 0.3216, + "step": 3916 + }, + { + "epoch": 1.8716325189654142, + "grad_norm": 0.4514115914431411, + "learning_rate": 6.794853990399533e-06, + "loss": 0.3157, + "step": 3917 + }, + { + "epoch": 1.8721103876709875, + "grad_norm": 0.4567742118666483, + "learning_rate": 6.7898655777261494e-06, + "loss": 0.3228, + "step": 3918 + }, + { + "epoch": 1.8725882563765606, + "grad_norm": 0.5130560266812821, + "learning_rate": 6.784878055487425e-06, + "loss": 0.3059, + "step": 3919 + }, + { + "epoch": 1.8730661250821337, + "grad_norm": 0.471031405151132, + "learning_rate": 6.7798914250668154e-06, + "loss": 0.3208, + "step": 3920 + }, + { + "epoch": 1.873543993787707, + "grad_norm": 0.4488894073471434, + "learning_rate": 6.774905687847526e-06, + "loss": 0.3082, + "step": 3921 + }, + { + "epoch": 1.8740218624932798, + "grad_norm": 0.4837073839553914, + "learning_rate": 6.769920845212506e-06, + "loss": 0.3114, + "step": 3922 + }, + { + "epoch": 1.8744997311988532, + "grad_norm": 0.45927942054259496, + "learning_rate": 6.764936898544466e-06, + "loss": 0.3162, + "step": 3923 + }, + { + "epoch": 1.8749775999044263, + "grad_norm": 0.517741398437042, + "learning_rate": 6.759953849225867e-06, + "loss": 0.3065, + "step": 3924 + }, + { + "epoch": 1.8754554686099993, + "grad_norm": 0.4830767260559844, + "learning_rate": 6.7549716986389146e-06, + "loss": 0.3142, + "step": 3925 + }, + { + "epoch": 1.8759333373155727, + "grad_norm": 0.4686254460663425, + "learning_rate": 6.749990448165572e-06, + "loss": 0.3139, + "step": 3926 + }, + { + "epoch": 1.8764112060211455, + "grad_norm": 0.4460888626954049, + "learning_rate": 6.745010099187552e-06, + "loss": 0.3315, + "step": 3927 + }, + { + "epoch": 1.8768890747267188, + "grad_norm": 0.4643706696696984, + "learning_rate": 6.740030653086311e-06, + "loss": 0.2954, + "step": 3928 + }, + { + "epoch": 1.877366943432292, + "grad_norm": 0.5033306648528414, + "learning_rate": 6.735052111243061e-06, + "loss": 0.298, + "step": 3929 + }, + { + "epoch": 1.877844812137865, + "grad_norm": 0.44669150190220186, + "learning_rate": 6.730074475038766e-06, + "loss": 0.3019, + "step": 3930 + }, + { + "epoch": 1.8783226808434383, + "grad_norm": 0.4653710125052024, + "learning_rate": 6.72509774585413e-06, + "loss": 0.307, + "step": 3931 + }, + { + "epoch": 1.8788005495490114, + "grad_norm": 0.48207706455379085, + "learning_rate": 6.720121925069609e-06, + "loss": 0.3321, + "step": 3932 + }, + { + "epoch": 1.8792784182545845, + "grad_norm": 0.446573686847004, + "learning_rate": 6.715147014065413e-06, + "loss": 0.3118, + "step": 3933 + }, + { + "epoch": 1.8797562869601578, + "grad_norm": 0.4360305190566346, + "learning_rate": 6.710173014221489e-06, + "loss": 0.3305, + "step": 3934 + }, + { + "epoch": 1.8802341556657307, + "grad_norm": 0.4928534869463769, + "learning_rate": 6.7051999269175405e-06, + "loss": 0.3335, + "step": 3935 + }, + { + "epoch": 1.880712024371304, + "grad_norm": 0.5334772505119653, + "learning_rate": 6.700227753533013e-06, + "loss": 0.3203, + "step": 3936 + }, + { + "epoch": 1.8811898930768771, + "grad_norm": 0.49357554615411325, + "learning_rate": 6.695256495447099e-06, + "loss": 0.3086, + "step": 3937 + }, + { + "epoch": 1.8816677617824502, + "grad_norm": 0.46109123255173334, + "learning_rate": 6.690286154038736e-06, + "loss": 0.3013, + "step": 3938 + }, + { + "epoch": 1.8821456304880235, + "grad_norm": 0.449128708951584, + "learning_rate": 6.685316730686614e-06, + "loss": 0.3243, + "step": 3939 + }, + { + "epoch": 1.8826234991935964, + "grad_norm": 0.46653691376464573, + "learning_rate": 6.680348226769162e-06, + "loss": 0.3299, + "step": 3940 + }, + { + "epoch": 1.8831013678991697, + "grad_norm": 0.4647819337168596, + "learning_rate": 6.675380643664553e-06, + "loss": 0.2945, + "step": 3941 + }, + { + "epoch": 1.8835792366047428, + "grad_norm": 0.4644806300385637, + "learning_rate": 6.670413982750709e-06, + "loss": 0.2997, + "step": 3942 + }, + { + "epoch": 1.884057105310316, + "grad_norm": 0.46165409315562544, + "learning_rate": 6.6654482454052936e-06, + "loss": 0.2979, + "step": 3943 + }, + { + "epoch": 1.8845349740158892, + "grad_norm": 0.4604824497984356, + "learning_rate": 6.660483433005714e-06, + "loss": 0.3179, + "step": 3944 + }, + { + "epoch": 1.8850128427214623, + "grad_norm": 0.46168391343184206, + "learning_rate": 6.655519546929121e-06, + "loss": 0.303, + "step": 3945 + }, + { + "epoch": 1.8854907114270354, + "grad_norm": 0.4678840236583196, + "learning_rate": 6.650556588552413e-06, + "loss": 0.3107, + "step": 3946 + }, + { + "epoch": 1.8859685801326087, + "grad_norm": 0.4594155584535815, + "learning_rate": 6.64559455925222e-06, + "loss": 0.3178, + "step": 3947 + }, + { + "epoch": 1.8864464488381816, + "grad_norm": 0.45341402313704277, + "learning_rate": 6.640633460404927e-06, + "loss": 0.3231, + "step": 3948 + }, + { + "epoch": 1.886924317543755, + "grad_norm": 0.5076384285562465, + "learning_rate": 6.635673293386656e-06, + "loss": 0.3197, + "step": 3949 + }, + { + "epoch": 1.887402186249328, + "grad_norm": 0.4643275428624748, + "learning_rate": 6.630714059573267e-06, + "loss": 0.3217, + "step": 3950 + }, + { + "epoch": 1.887880054954901, + "grad_norm": 0.49068283754529113, + "learning_rate": 6.625755760340362e-06, + "loss": 0.2889, + "step": 3951 + }, + { + "epoch": 1.8883579236604744, + "grad_norm": 0.47460107283399794, + "learning_rate": 6.620798397063291e-06, + "loss": 0.3271, + "step": 3952 + }, + { + "epoch": 1.8888357923660475, + "grad_norm": 0.48812107695821716, + "learning_rate": 6.615841971117136e-06, + "loss": 0.3138, + "step": 3953 + }, + { + "epoch": 1.8893136610716206, + "grad_norm": 0.5734196274573817, + "learning_rate": 6.610886483876721e-06, + "loss": 0.3044, + "step": 3954 + }, + { + "epoch": 1.8897915297771937, + "grad_norm": 0.4617227039696627, + "learning_rate": 6.6059319367166165e-06, + "loss": 0.3175, + "step": 3955 + }, + { + "epoch": 1.8902693984827668, + "grad_norm": 0.6100117271945517, + "learning_rate": 6.600978331011118e-06, + "loss": 0.3084, + "step": 3956 + }, + { + "epoch": 1.89074726718834, + "grad_norm": 0.46971941183014404, + "learning_rate": 6.596025668134276e-06, + "loss": 0.2983, + "step": 3957 + }, + { + "epoch": 1.8912251358939132, + "grad_norm": 0.461607197188674, + "learning_rate": 6.5910739494598675e-06, + "loss": 0.314, + "step": 3958 + }, + { + "epoch": 1.8917030045994863, + "grad_norm": 0.4654393182079575, + "learning_rate": 6.586123176361412e-06, + "loss": 0.3093, + "step": 3959 + }, + { + "epoch": 1.8921808733050596, + "grad_norm": 0.49908066713941035, + "learning_rate": 6.581173350212169e-06, + "loss": 0.3159, + "step": 3960 + }, + { + "epoch": 1.8926587420106324, + "grad_norm": 0.4512238380490708, + "learning_rate": 6.576224472385132e-06, + "loss": 0.3226, + "step": 3961 + }, + { + "epoch": 1.8931366107162058, + "grad_norm": 0.4496277486152005, + "learning_rate": 6.5712765442530305e-06, + "loss": 0.3107, + "step": 3962 + }, + { + "epoch": 1.8936144794217789, + "grad_norm": 0.4489167401241315, + "learning_rate": 6.566329567188334e-06, + "loss": 0.3026, + "step": 3963 + }, + { + "epoch": 1.894092348127352, + "grad_norm": 0.45699132086998806, + "learning_rate": 6.5613835425632475e-06, + "loss": 0.3145, + "step": 3964 + }, + { + "epoch": 1.8945702168329253, + "grad_norm": 0.45425664515009867, + "learning_rate": 6.55643847174971e-06, + "loss": 0.3174, + "step": 3965 + }, + { + "epoch": 1.8950480855384984, + "grad_norm": 0.500523026242252, + "learning_rate": 6.551494356119395e-06, + "loss": 0.3115, + "step": 3966 + }, + { + "epoch": 1.8955259542440714, + "grad_norm": 0.44931814929891095, + "learning_rate": 6.546551197043719e-06, + "loss": 0.3027, + "step": 3967 + }, + { + "epoch": 1.8960038229496445, + "grad_norm": 0.4626590116957062, + "learning_rate": 6.54160899589382e-06, + "loss": 0.2975, + "step": 3968 + }, + { + "epoch": 1.8964816916552176, + "grad_norm": 0.6053375872302748, + "learning_rate": 6.536667754040581e-06, + "loss": 0.3172, + "step": 3969 + }, + { + "epoch": 1.896959560360791, + "grad_norm": 0.46082649951002486, + "learning_rate": 6.531727472854617e-06, + "loss": 0.3041, + "step": 3970 + }, + { + "epoch": 1.897437429066364, + "grad_norm": 0.4647416997290529, + "learning_rate": 6.52678815370627e-06, + "loss": 0.312, + "step": 3971 + }, + { + "epoch": 1.8979152977719371, + "grad_norm": 0.8690617380399219, + "learning_rate": 6.521849797965623e-06, + "loss": 0.3076, + "step": 3972 + }, + { + "epoch": 1.8983931664775104, + "grad_norm": 0.44236729928797086, + "learning_rate": 6.516912407002487e-06, + "loss": 0.3154, + "step": 3973 + }, + { + "epoch": 1.8988710351830833, + "grad_norm": 0.48835977208241177, + "learning_rate": 6.511975982186412e-06, + "loss": 0.3152, + "step": 3974 + }, + { + "epoch": 1.8993489038886566, + "grad_norm": 0.44854340512081925, + "learning_rate": 6.507040524886672e-06, + "loss": 0.305, + "step": 3975 + }, + { + "epoch": 1.8998267725942297, + "grad_norm": 0.46881260069389524, + "learning_rate": 6.502106036472274e-06, + "loss": 0.3059, + "step": 3976 + }, + { + "epoch": 1.9003046412998028, + "grad_norm": 0.4534629224226534, + "learning_rate": 6.4971725183119596e-06, + "loss": 0.3133, + "step": 3977 + }, + { + "epoch": 1.9007825100053761, + "grad_norm": 0.4727954464086862, + "learning_rate": 6.492239971774201e-06, + "loss": 0.3112, + "step": 3978 + }, + { + "epoch": 1.9012603787109492, + "grad_norm": 0.4905488487960794, + "learning_rate": 6.487308398227198e-06, + "loss": 0.3084, + "step": 3979 + }, + { + "epoch": 1.9017382474165223, + "grad_norm": 0.45969932453025913, + "learning_rate": 6.482377799038882e-06, + "loss": 0.3074, + "step": 3980 + }, + { + "epoch": 1.9022161161220954, + "grad_norm": 0.4503140751242902, + "learning_rate": 6.477448175576917e-06, + "loss": 0.3112, + "step": 3981 + }, + { + "epoch": 1.9026939848276685, + "grad_norm": 0.5162485390503506, + "learning_rate": 6.472519529208688e-06, + "loss": 0.3116, + "step": 3982 + }, + { + "epoch": 1.9031718535332418, + "grad_norm": 0.4393300520762453, + "learning_rate": 6.467591861301319e-06, + "loss": 0.3186, + "step": 3983 + }, + { + "epoch": 1.903649722238815, + "grad_norm": 0.4404302931504677, + "learning_rate": 6.462665173221658e-06, + "loss": 0.3027, + "step": 3984 + }, + { + "epoch": 1.904127590944388, + "grad_norm": 0.4513739623364187, + "learning_rate": 6.45773946633628e-06, + "loss": 0.3191, + "step": 3985 + }, + { + "epoch": 1.9046054596499613, + "grad_norm": 0.43862099527361414, + "learning_rate": 6.45281474201149e-06, + "loss": 0.3186, + "step": 3986 + }, + { + "epoch": 1.9050833283555342, + "grad_norm": 0.48035117229015456, + "learning_rate": 6.44789100161332e-06, + "loss": 0.2902, + "step": 3987 + }, + { + "epoch": 1.9055611970611075, + "grad_norm": 0.46306152743712947, + "learning_rate": 6.442968246507526e-06, + "loss": 0.3228, + "step": 3988 + }, + { + "epoch": 1.9060390657666806, + "grad_norm": 0.4547005976928245, + "learning_rate": 6.438046478059597e-06, + "loss": 0.3081, + "step": 3989 + }, + { + "epoch": 1.9065169344722537, + "grad_norm": 0.455512028698371, + "learning_rate": 6.4331256976347434e-06, + "loss": 0.3136, + "step": 3990 + }, + { + "epoch": 1.906994803177827, + "grad_norm": 0.45184250650864355, + "learning_rate": 6.4282059065979e-06, + "loss": 0.3234, + "step": 3991 + }, + { + "epoch": 1.9074726718834, + "grad_norm": 0.4589810256657876, + "learning_rate": 6.423287106313734e-06, + "loss": 0.3139, + "step": 3992 + }, + { + "epoch": 1.9079505405889732, + "grad_norm": 0.4469761574276576, + "learning_rate": 6.4183692981466354e-06, + "loss": 0.3178, + "step": 3993 + }, + { + "epoch": 1.9084284092945463, + "grad_norm": 0.4819073592214982, + "learning_rate": 6.413452483460712e-06, + "loss": 0.3078, + "step": 3994 + }, + { + "epoch": 1.9089062780001194, + "grad_norm": 0.473783423003341, + "learning_rate": 6.408536663619803e-06, + "loss": 0.3232, + "step": 3995 + }, + { + "epoch": 1.9093841467056927, + "grad_norm": 0.46190041854892633, + "learning_rate": 6.403621839987475e-06, + "loss": 0.2947, + "step": 3996 + }, + { + "epoch": 1.9098620154112658, + "grad_norm": 0.4853262011211858, + "learning_rate": 6.398708013927006e-06, + "loss": 0.3191, + "step": 3997 + }, + { + "epoch": 1.9103398841168389, + "grad_norm": 0.46613152697547133, + "learning_rate": 6.393795186801408e-06, + "loss": 0.3148, + "step": 3998 + }, + { + "epoch": 1.9108177528224122, + "grad_norm": 0.478468525695969, + "learning_rate": 6.3888833599734164e-06, + "loss": 0.3088, + "step": 3999 + }, + { + "epoch": 1.911295621527985, + "grad_norm": 0.4807025101803059, + "learning_rate": 6.383972534805478e-06, + "loss": 0.3137, + "step": 4000 + }, + { + "epoch": 1.9117734902335584, + "grad_norm": 0.4699617352574663, + "learning_rate": 6.379062712659775e-06, + "loss": 0.3151, + "step": 4001 + }, + { + "epoch": 1.9122513589391315, + "grad_norm": 0.4432981759624267, + "learning_rate": 6.374153894898204e-06, + "loss": 0.301, + "step": 4002 + }, + { + "epoch": 1.9127292276447045, + "grad_norm": 0.4591365732275299, + "learning_rate": 6.369246082882381e-06, + "loss": 0.3183, + "step": 4003 + }, + { + "epoch": 1.9132070963502779, + "grad_norm": 0.4866953393750346, + "learning_rate": 6.36433927797365e-06, + "loss": 0.3077, + "step": 4004 + }, + { + "epoch": 1.913684965055851, + "grad_norm": 0.4474609520754289, + "learning_rate": 6.359433481533074e-06, + "loss": 0.314, + "step": 4005 + }, + { + "epoch": 1.914162833761424, + "grad_norm": 0.48773485797623917, + "learning_rate": 6.3545286949214245e-06, + "loss": 0.3329, + "step": 4006 + }, + { + "epoch": 1.9146407024669974, + "grad_norm": 0.47016604797082867, + "learning_rate": 6.349624919499218e-06, + "loss": 0.302, + "step": 4007 + }, + { + "epoch": 1.9151185711725702, + "grad_norm": 0.4760285819352319, + "learning_rate": 6.344722156626663e-06, + "loss": 0.2967, + "step": 4008 + }, + { + "epoch": 1.9155964398781435, + "grad_norm": 0.4759007373191583, + "learning_rate": 6.3398204076637035e-06, + "loss": 0.3126, + "step": 4009 + }, + { + "epoch": 1.9160743085837166, + "grad_norm": 0.45794278531506144, + "learning_rate": 6.3349196739700024e-06, + "loss": 0.3035, + "step": 4010 + }, + { + "epoch": 1.9165521772892897, + "grad_norm": 0.4692457739911771, + "learning_rate": 6.3300199569049305e-06, + "loss": 0.3215, + "step": 4011 + }, + { + "epoch": 1.917030045994863, + "grad_norm": 0.4630100738986396, + "learning_rate": 6.325121257827584e-06, + "loss": 0.2977, + "step": 4012 + }, + { + "epoch": 1.917507914700436, + "grad_norm": 0.44301606576563307, + "learning_rate": 6.32022357809678e-06, + "loss": 0.3033, + "step": 4013 + }, + { + "epoch": 1.9179857834060092, + "grad_norm": 0.4878278270456394, + "learning_rate": 6.3153269190710435e-06, + "loss": 0.3061, + "step": 4014 + }, + { + "epoch": 1.9184636521115823, + "grad_norm": 0.46093038914319545, + "learning_rate": 6.310431282108622e-06, + "loss": 0.3094, + "step": 4015 + }, + { + "epoch": 1.9189415208171554, + "grad_norm": 0.4342745654994972, + "learning_rate": 6.305536668567482e-06, + "loss": 0.2909, + "step": 4016 + }, + { + "epoch": 1.9194193895227287, + "grad_norm": 0.46848755594806835, + "learning_rate": 6.3006430798053e-06, + "loss": 0.317, + "step": 4017 + }, + { + "epoch": 1.9198972582283018, + "grad_norm": 0.45778293002207393, + "learning_rate": 6.295750517179471e-06, + "loss": 0.3253, + "step": 4018 + }, + { + "epoch": 1.920375126933875, + "grad_norm": 0.48174882670851016, + "learning_rate": 6.29085898204711e-06, + "loss": 0.2912, + "step": 4019 + }, + { + "epoch": 1.9208529956394482, + "grad_norm": 0.44572761526104454, + "learning_rate": 6.2859684757650365e-06, + "loss": 0.3104, + "step": 4020 + }, + { + "epoch": 1.921330864345021, + "grad_norm": 0.4477457574600347, + "learning_rate": 6.281078999689794e-06, + "loss": 0.3135, + "step": 4021 + }, + { + "epoch": 1.9218087330505944, + "grad_norm": 0.4612358982666258, + "learning_rate": 6.276190555177637e-06, + "loss": 0.3121, + "step": 4022 + }, + { + "epoch": 1.9222866017561675, + "grad_norm": 0.4575159149820703, + "learning_rate": 6.271303143584532e-06, + "loss": 0.3016, + "step": 4023 + }, + { + "epoch": 1.9227644704617406, + "grad_norm": 0.4482684891108602, + "learning_rate": 6.266416766266161e-06, + "loss": 0.3073, + "step": 4024 + }, + { + "epoch": 1.923242339167314, + "grad_norm": 0.4520583335071011, + "learning_rate": 6.261531424577923e-06, + "loss": 0.2965, + "step": 4025 + }, + { + "epoch": 1.9237202078728868, + "grad_norm": 0.47269951843491825, + "learning_rate": 6.256647119874919e-06, + "loss": 0.3121, + "step": 4026 + }, + { + "epoch": 1.92419807657846, + "grad_norm": 0.4758823780817171, + "learning_rate": 6.251763853511974e-06, + "loss": 0.315, + "step": 4027 + }, + { + "epoch": 1.9246759452840332, + "grad_norm": 0.4471003165029847, + "learning_rate": 6.24688162684362e-06, + "loss": 0.308, + "step": 4028 + }, + { + "epoch": 1.9251538139896063, + "grad_norm": 0.4842554500937279, + "learning_rate": 6.242000441224096e-06, + "loss": 0.3197, + "step": 4029 + }, + { + "epoch": 1.9256316826951796, + "grad_norm": 0.4766633160283862, + "learning_rate": 6.2371202980073596e-06, + "loss": 0.3098, + "step": 4030 + }, + { + "epoch": 1.9261095514007527, + "grad_norm": 0.4753968836250539, + "learning_rate": 6.23224119854708e-06, + "loss": 0.3108, + "step": 4031 + }, + { + "epoch": 1.9265874201063258, + "grad_norm": 0.5527062509603616, + "learning_rate": 6.227363144196625e-06, + "loss": 0.2904, + "step": 4032 + }, + { + "epoch": 1.927065288811899, + "grad_norm": 0.4689169847484964, + "learning_rate": 6.222486136309087e-06, + "loss": 0.2986, + "step": 4033 + }, + { + "epoch": 1.927543157517472, + "grad_norm": 0.45259176341385676, + "learning_rate": 6.217610176237263e-06, + "loss": 0.3121, + "step": 4034 + }, + { + "epoch": 1.9280210262230453, + "grad_norm": 0.45564864703780517, + "learning_rate": 6.212735265333655e-06, + "loss": 0.3016, + "step": 4035 + }, + { + "epoch": 1.9284988949286184, + "grad_norm": 0.4559492608859359, + "learning_rate": 6.207861404950477e-06, + "loss": 0.3143, + "step": 4036 + }, + { + "epoch": 1.9289767636341915, + "grad_norm": 0.4451863912439992, + "learning_rate": 6.2029885964396544e-06, + "loss": 0.3317, + "step": 4037 + }, + { + "epoch": 1.9294546323397648, + "grad_norm": 0.4535151095832485, + "learning_rate": 6.198116841152816e-06, + "loss": 0.3132, + "step": 4038 + }, + { + "epoch": 1.9299325010453376, + "grad_norm": 0.4459931426857373, + "learning_rate": 6.1932461404412994e-06, + "loss": 0.3109, + "step": 4039 + }, + { + "epoch": 1.930410369750911, + "grad_norm": 0.5319828701956664, + "learning_rate": 6.188376495656156e-06, + "loss": 0.3035, + "step": 4040 + }, + { + "epoch": 1.930888238456484, + "grad_norm": 0.46969694056011513, + "learning_rate": 6.183507908148137e-06, + "loss": 0.3163, + "step": 4041 + }, + { + "epoch": 1.9313661071620571, + "grad_norm": 0.45244458398728504, + "learning_rate": 6.178640379267702e-06, + "loss": 0.3025, + "step": 4042 + }, + { + "epoch": 1.9318439758676305, + "grad_norm": 0.45648090587965856, + "learning_rate": 6.173773910365018e-06, + "loss": 0.2901, + "step": 4043 + }, + { + "epoch": 1.9323218445732036, + "grad_norm": 0.4610212804634707, + "learning_rate": 6.168908502789961e-06, + "loss": 0.3086, + "step": 4044 + }, + { + "epoch": 1.9327997132787766, + "grad_norm": 0.46276231755271585, + "learning_rate": 6.164044157892102e-06, + "loss": 0.3272, + "step": 4045 + }, + { + "epoch": 1.93327758198435, + "grad_norm": 0.4494641795334654, + "learning_rate": 6.15918087702073e-06, + "loss": 0.3214, + "step": 4046 + }, + { + "epoch": 1.9337554506899228, + "grad_norm": 0.4495205402291461, + "learning_rate": 6.154318661524832e-06, + "loss": 0.3209, + "step": 4047 + }, + { + "epoch": 1.9342333193954961, + "grad_norm": 0.4707004184620095, + "learning_rate": 6.149457512753101e-06, + "loss": 0.3053, + "step": 4048 + }, + { + "epoch": 1.9347111881010692, + "grad_norm": 0.5357951441151997, + "learning_rate": 6.144597432053932e-06, + "loss": 0.3119, + "step": 4049 + }, + { + "epoch": 1.9351890568066423, + "grad_norm": 0.4535876834445935, + "learning_rate": 6.13973842077543e-06, + "loss": 0.3023, + "step": 4050 + }, + { + "epoch": 1.9356669255122156, + "grad_norm": 0.48376924175039243, + "learning_rate": 6.134880480265393e-06, + "loss": 0.3059, + "step": 4051 + }, + { + "epoch": 1.9361447942177885, + "grad_norm": 0.49754708572204154, + "learning_rate": 6.130023611871332e-06, + "loss": 0.3139, + "step": 4052 + }, + { + "epoch": 1.9366226629233618, + "grad_norm": 0.6147957148377768, + "learning_rate": 6.1251678169404585e-06, + "loss": 0.3017, + "step": 4053 + }, + { + "epoch": 1.937100531628935, + "grad_norm": 0.4597512390463644, + "learning_rate": 6.120313096819679e-06, + "loss": 0.3184, + "step": 4054 + }, + { + "epoch": 1.937578400334508, + "grad_norm": 0.4559692746363391, + "learning_rate": 6.11545945285561e-06, + "loss": 0.3085, + "step": 4055 + }, + { + "epoch": 1.9380562690400813, + "grad_norm": 0.47922658966713283, + "learning_rate": 6.110606886394568e-06, + "loss": 0.3254, + "step": 4056 + }, + { + "epoch": 1.9385341377456544, + "grad_norm": 0.4534671409403072, + "learning_rate": 6.105755398782567e-06, + "loss": 0.324, + "step": 4057 + }, + { + "epoch": 1.9390120064512275, + "grad_norm": 0.4545090220212905, + "learning_rate": 6.100904991365324e-06, + "loss": 0.3079, + "step": 4058 + }, + { + "epoch": 1.9394898751568008, + "grad_norm": 0.4762232060665426, + "learning_rate": 6.096055665488261e-06, + "loss": 0.3051, + "step": 4059 + }, + { + "epoch": 1.9399677438623737, + "grad_norm": 0.5066682145376573, + "learning_rate": 6.091207422496489e-06, + "loss": 0.3159, + "step": 4060 + }, + { + "epoch": 1.940445612567947, + "grad_norm": 0.44386055788270473, + "learning_rate": 6.0863602637348284e-06, + "loss": 0.3119, + "step": 4061 + }, + { + "epoch": 1.94092348127352, + "grad_norm": 0.43903421860827463, + "learning_rate": 6.081514190547797e-06, + "loss": 0.3078, + "step": 4062 + }, + { + "epoch": 1.9414013499790932, + "grad_norm": 0.47742457334875266, + "learning_rate": 6.076669204279606e-06, + "loss": 0.3176, + "step": 4063 + }, + { + "epoch": 1.9418792186846665, + "grad_norm": 0.4665930593611481, + "learning_rate": 6.071825306274173e-06, + "loss": 0.3036, + "step": 4064 + }, + { + "epoch": 1.9423570873902394, + "grad_norm": 0.5253515300391597, + "learning_rate": 6.066982497875109e-06, + "loss": 0.3198, + "step": 4065 + }, + { + "epoch": 1.9428349560958127, + "grad_norm": 0.5049405090302596, + "learning_rate": 6.0621407804257205e-06, + "loss": 0.3048, + "step": 4066 + }, + { + "epoch": 1.9433128248013858, + "grad_norm": 0.4700459384897942, + "learning_rate": 6.057300155269017e-06, + "loss": 0.3106, + "step": 4067 + }, + { + "epoch": 1.9437906935069589, + "grad_norm": 0.4434734524599051, + "learning_rate": 6.052460623747705e-06, + "loss": 0.3237, + "step": 4068 + }, + { + "epoch": 1.9442685622125322, + "grad_norm": 0.46014030921295707, + "learning_rate": 6.0476221872041794e-06, + "loss": 0.3223, + "step": 4069 + }, + { + "epoch": 1.9447464309181053, + "grad_norm": 0.468750131948572, + "learning_rate": 6.042784846980542e-06, + "loss": 0.2991, + "step": 4070 + }, + { + "epoch": 1.9452242996236784, + "grad_norm": 0.47243399384340334, + "learning_rate": 6.037948604418584e-06, + "loss": 0.3229, + "step": 4071 + }, + { + "epoch": 1.9457021683292517, + "grad_norm": 0.48404534391010834, + "learning_rate": 6.033113460859794e-06, + "loss": 0.3042, + "step": 4072 + }, + { + "epoch": 1.9461800370348246, + "grad_norm": 0.459036713881152, + "learning_rate": 6.028279417645351e-06, + "loss": 0.3125, + "step": 4073 + }, + { + "epoch": 1.9466579057403979, + "grad_norm": 0.7112099246861842, + "learning_rate": 6.023446476116141e-06, + "loss": 0.3112, + "step": 4074 + }, + { + "epoch": 1.947135774445971, + "grad_norm": 0.4643938545098871, + "learning_rate": 6.018614637612733e-06, + "loss": 0.2963, + "step": 4075 + }, + { + "epoch": 1.947613643151544, + "grad_norm": 0.44678142004677796, + "learning_rate": 6.013783903475396e-06, + "loss": 0.3037, + "step": 4076 + }, + { + "epoch": 1.9480915118571174, + "grad_norm": 0.5656950978702013, + "learning_rate": 6.008954275044088e-06, + "loss": 0.2962, + "step": 4077 + }, + { + "epoch": 1.9485693805626902, + "grad_norm": 0.47423542716615724, + "learning_rate": 6.004125753658461e-06, + "loss": 0.3106, + "step": 4078 + }, + { + "epoch": 1.9490472492682636, + "grad_norm": 0.45972148442045957, + "learning_rate": 5.9992983406578666e-06, + "loss": 0.3016, + "step": 4079 + }, + { + "epoch": 1.9495251179738367, + "grad_norm": 0.48436905917421313, + "learning_rate": 5.99447203738134e-06, + "loss": 0.3036, + "step": 4080 + }, + { + "epoch": 1.9500029866794097, + "grad_norm": 0.44553011226551475, + "learning_rate": 5.989646845167614e-06, + "loss": 0.2932, + "step": 4081 + }, + { + "epoch": 1.950480855384983, + "grad_norm": 0.4607346503392114, + "learning_rate": 5.984822765355113e-06, + "loss": 0.3358, + "step": 4082 + }, + { + "epoch": 1.9509587240905562, + "grad_norm": 0.4952981445916362, + "learning_rate": 5.979999799281948e-06, + "loss": 0.3151, + "step": 4083 + }, + { + "epoch": 1.9514365927961292, + "grad_norm": 0.4675121455424015, + "learning_rate": 5.975177948285929e-06, + "loss": 0.3148, + "step": 4084 + }, + { + "epoch": 1.9519144615017026, + "grad_norm": 0.45369885707604624, + "learning_rate": 5.9703572137045495e-06, + "loss": 0.3034, + "step": 4085 + }, + { + "epoch": 1.9523923302072754, + "grad_norm": 0.4579342776702539, + "learning_rate": 5.965537596874997e-06, + "loss": 0.313, + "step": 4086 + }, + { + "epoch": 1.9528701989128487, + "grad_norm": 0.6942835585119591, + "learning_rate": 5.960719099134149e-06, + "loss": 0.317, + "step": 4087 + }, + { + "epoch": 1.9533480676184218, + "grad_norm": 0.46918542083573794, + "learning_rate": 5.9559017218185724e-06, + "loss": 0.309, + "step": 4088 + }, + { + "epoch": 1.953825936323995, + "grad_norm": 0.45277876411651685, + "learning_rate": 5.951085466264519e-06, + "loss": 0.3032, + "step": 4089 + }, + { + "epoch": 1.9543038050295682, + "grad_norm": 0.4690874120536212, + "learning_rate": 5.946270333807937e-06, + "loss": 0.3039, + "step": 4090 + }, + { + "epoch": 1.9547816737351411, + "grad_norm": 0.4507823330535709, + "learning_rate": 5.94145632578446e-06, + "loss": 0.3077, + "step": 4091 + }, + { + "epoch": 1.9552595424407144, + "grad_norm": 0.44925892593784034, + "learning_rate": 5.9366434435294026e-06, + "loss": 0.3155, + "step": 4092 + }, + { + "epoch": 1.9557374111462875, + "grad_norm": 0.48240656700569656, + "learning_rate": 5.9318316883777795e-06, + "loss": 0.3208, + "step": 4093 + }, + { + "epoch": 1.9562152798518606, + "grad_norm": 0.4604534040175604, + "learning_rate": 5.927021061664287e-06, + "loss": 0.2998, + "step": 4094 + }, + { + "epoch": 1.956693148557434, + "grad_norm": 0.4526034134761243, + "learning_rate": 5.922211564723302e-06, + "loss": 0.2946, + "step": 4095 + }, + { + "epoch": 1.957171017263007, + "grad_norm": 0.46416284670050006, + "learning_rate": 5.9174031988888995e-06, + "loss": 0.3118, + "step": 4096 + }, + { + "epoch": 1.95764888596858, + "grad_norm": 0.45215047247388573, + "learning_rate": 5.912595965494835e-06, + "loss": 0.3182, + "step": 4097 + }, + { + "epoch": 1.9581267546741534, + "grad_norm": 0.5546875852085521, + "learning_rate": 5.907789865874547e-06, + "loss": 0.3064, + "step": 4098 + }, + { + "epoch": 1.9586046233797263, + "grad_norm": 0.4752323586160744, + "learning_rate": 5.902984901361166e-06, + "loss": 0.3278, + "step": 4099 + }, + { + "epoch": 1.9590824920852996, + "grad_norm": 0.452399975076052, + "learning_rate": 5.898181073287504e-06, + "loss": 0.3028, + "step": 4100 + }, + { + "epoch": 1.9595603607908727, + "grad_norm": 0.4578845518507173, + "learning_rate": 5.893378382986057e-06, + "loss": 0.2949, + "step": 4101 + }, + { + "epoch": 1.9600382294964458, + "grad_norm": 0.4390145249730691, + "learning_rate": 5.8885768317890054e-06, + "loss": 0.3016, + "step": 4102 + }, + { + "epoch": 1.960516098202019, + "grad_norm": 0.9622036180396034, + "learning_rate": 5.883776421028219e-06, + "loss": 0.3221, + "step": 4103 + }, + { + "epoch": 1.960993966907592, + "grad_norm": 0.45023967591580155, + "learning_rate": 5.878977152035243e-06, + "loss": 0.3117, + "step": 4104 + }, + { + "epoch": 1.9614718356131653, + "grad_norm": 0.44945352983330045, + "learning_rate": 5.87417902614131e-06, + "loss": 0.2991, + "step": 4105 + }, + { + "epoch": 1.9619497043187384, + "grad_norm": 0.4507418181158918, + "learning_rate": 5.869382044677341e-06, + "loss": 0.3181, + "step": 4106 + }, + { + "epoch": 1.9624275730243115, + "grad_norm": 0.44350954473838944, + "learning_rate": 5.8645862089739215e-06, + "loss": 0.3151, + "step": 4107 + }, + { + "epoch": 1.9629054417298848, + "grad_norm": 0.5341361831039657, + "learning_rate": 5.859791520361348e-06, + "loss": 0.3072, + "step": 4108 + }, + { + "epoch": 1.9633833104354579, + "grad_norm": 0.46709165414809023, + "learning_rate": 5.854997980169572e-06, + "loss": 0.3031, + "step": 4109 + }, + { + "epoch": 1.963861179141031, + "grad_norm": 0.4280650147245011, + "learning_rate": 5.850205589728239e-06, + "loss": 0.3127, + "step": 4110 + }, + { + "epoch": 1.9643390478466043, + "grad_norm": 0.44278861997127844, + "learning_rate": 5.845414350366679e-06, + "loss": 0.3177, + "step": 4111 + }, + { + "epoch": 1.9648169165521772, + "grad_norm": 0.43260620835576497, + "learning_rate": 5.8406242634138875e-06, + "loss": 0.3133, + "step": 4112 + }, + { + "epoch": 1.9652947852577505, + "grad_norm": 0.46599928724656164, + "learning_rate": 5.835835330198558e-06, + "loss": 0.317, + "step": 4113 + }, + { + "epoch": 1.9657726539633236, + "grad_norm": 0.4868101146764809, + "learning_rate": 5.83104755204905e-06, + "loss": 0.3072, + "step": 4114 + }, + { + "epoch": 1.9662505226688967, + "grad_norm": 0.43340141877422556, + "learning_rate": 5.826260930293417e-06, + "loss": 0.3041, + "step": 4115 + }, + { + "epoch": 1.96672839137447, + "grad_norm": 0.4737663310328414, + "learning_rate": 5.8214754662593765e-06, + "loss": 0.3196, + "step": 4116 + }, + { + "epoch": 1.9672062600800428, + "grad_norm": 0.4510243883089674, + "learning_rate": 5.81669116127433e-06, + "loss": 0.2846, + "step": 4117 + }, + { + "epoch": 1.9676841287856162, + "grad_norm": 0.4721590552134255, + "learning_rate": 5.811908016665369e-06, + "loss": 0.2991, + "step": 4118 + }, + { + "epoch": 1.9681619974911893, + "grad_norm": 0.4485701829219417, + "learning_rate": 5.807126033759245e-06, + "loss": 0.3201, + "step": 4119 + }, + { + "epoch": 1.9686398661967623, + "grad_norm": 0.42971050420170925, + "learning_rate": 5.802345213882396e-06, + "loss": 0.2848, + "step": 4120 + }, + { + "epoch": 1.9691177349023357, + "grad_norm": 0.5060676790854636, + "learning_rate": 5.797565558360943e-06, + "loss": 0.3189, + "step": 4121 + }, + { + "epoch": 1.9695956036079088, + "grad_norm": 0.5368619185936723, + "learning_rate": 5.792787068520674e-06, + "loss": 0.3272, + "step": 4122 + }, + { + "epoch": 1.9700734723134818, + "grad_norm": 0.4969544005475237, + "learning_rate": 5.788009745687053e-06, + "loss": 0.3044, + "step": 4123 + }, + { + "epoch": 1.9705513410190552, + "grad_norm": 0.5192317905793113, + "learning_rate": 5.7832335911852355e-06, + "loss": 0.3073, + "step": 4124 + }, + { + "epoch": 1.971029209724628, + "grad_norm": 0.4584957853344697, + "learning_rate": 5.778458606340037e-06, + "loss": 0.3182, + "step": 4125 + }, + { + "epoch": 1.9715070784302013, + "grad_norm": 0.4826482249055619, + "learning_rate": 5.7736847924759505e-06, + "loss": 0.2936, + "step": 4126 + }, + { + "epoch": 1.9719849471357744, + "grad_norm": 0.4565375353417649, + "learning_rate": 5.7689121509171564e-06, + "loss": 0.3002, + "step": 4127 + }, + { + "epoch": 1.9724628158413475, + "grad_norm": 0.4524472919502672, + "learning_rate": 5.764140682987496e-06, + "loss": 0.3091, + "step": 4128 + }, + { + "epoch": 1.9729406845469208, + "grad_norm": 0.47905618351860524, + "learning_rate": 5.759370390010487e-06, + "loss": 0.3123, + "step": 4129 + }, + { + "epoch": 1.9734185532524937, + "grad_norm": 0.4588694980027561, + "learning_rate": 5.754601273309333e-06, + "loss": 0.3194, + "step": 4130 + }, + { + "epoch": 1.973896421958067, + "grad_norm": 0.45707604771801996, + "learning_rate": 5.749833334206897e-06, + "loss": 0.3094, + "step": 4131 + }, + { + "epoch": 1.9743742906636401, + "grad_norm": 0.4587355771749119, + "learning_rate": 5.745066574025718e-06, + "loss": 0.302, + "step": 4132 + }, + { + "epoch": 1.9748521593692132, + "grad_norm": 0.4740061736075553, + "learning_rate": 5.740300994088022e-06, + "loss": 0.3304, + "step": 4133 + }, + { + "epoch": 1.9753300280747865, + "grad_norm": 0.4566089421929211, + "learning_rate": 5.735536595715687e-06, + "loss": 0.3113, + "step": 4134 + }, + { + "epoch": 1.9758078967803596, + "grad_norm": 0.4663581459744624, + "learning_rate": 5.730773380230276e-06, + "loss": 0.2976, + "step": 4135 + }, + { + "epoch": 1.9762857654859327, + "grad_norm": 0.44354111986279265, + "learning_rate": 5.726011348953023e-06, + "loss": 0.303, + "step": 4136 + }, + { + "epoch": 1.976763634191506, + "grad_norm": 0.43956310394463377, + "learning_rate": 5.7212505032048315e-06, + "loss": 0.3212, + "step": 4137 + }, + { + "epoch": 1.977241502897079, + "grad_norm": 0.43772616497346545, + "learning_rate": 5.716490844306271e-06, + "loss": 0.3071, + "step": 4138 + }, + { + "epoch": 1.9777193716026522, + "grad_norm": 0.5091670239004066, + "learning_rate": 5.711732373577592e-06, + "loss": 0.3224, + "step": 4139 + }, + { + "epoch": 1.9781972403082253, + "grad_norm": 0.47000699212819913, + "learning_rate": 5.70697509233871e-06, + "loss": 0.29, + "step": 4140 + }, + { + "epoch": 1.9786751090137984, + "grad_norm": 0.4704139528256369, + "learning_rate": 5.702219001909206e-06, + "loss": 0.2997, + "step": 4141 + }, + { + "epoch": 1.9791529777193717, + "grad_norm": 0.4614015610752338, + "learning_rate": 5.697464103608339e-06, + "loss": 0.3103, + "step": 4142 + }, + { + "epoch": 1.9796308464249448, + "grad_norm": 0.45164231657830634, + "learning_rate": 5.692710398755039e-06, + "loss": 0.3092, + "step": 4143 + }, + { + "epoch": 1.980108715130518, + "grad_norm": 0.47238767638550566, + "learning_rate": 5.687957888667894e-06, + "loss": 0.3087, + "step": 4144 + }, + { + "epoch": 1.980586583836091, + "grad_norm": 0.4513243311661097, + "learning_rate": 5.683206574665165e-06, + "loss": 0.3104, + "step": 4145 + }, + { + "epoch": 1.981064452541664, + "grad_norm": 0.4721048544965712, + "learning_rate": 5.678456458064788e-06, + "loss": 0.306, + "step": 4146 + }, + { + "epoch": 1.9815423212472374, + "grad_norm": 0.5125862122325965, + "learning_rate": 5.673707540184359e-06, + "loss": 0.2986, + "step": 4147 + }, + { + "epoch": 1.9820201899528105, + "grad_norm": 0.4389466347186533, + "learning_rate": 5.66895982234114e-06, + "loss": 0.3081, + "step": 4148 + }, + { + "epoch": 1.9824980586583836, + "grad_norm": 0.4407487284192881, + "learning_rate": 5.664213305852073e-06, + "loss": 0.3238, + "step": 4149 + }, + { + "epoch": 1.982975927363957, + "grad_norm": 0.47364655963546964, + "learning_rate": 5.6594679920337514e-06, + "loss": 0.3118, + "step": 4150 + }, + { + "epoch": 1.9834537960695298, + "grad_norm": 0.4304623563483301, + "learning_rate": 5.6547238822024395e-06, + "loss": 0.3229, + "step": 4151 + }, + { + "epoch": 1.983931664775103, + "grad_norm": 0.45174215596394307, + "learning_rate": 5.649980977674079e-06, + "loss": 0.3041, + "step": 4152 + }, + { + "epoch": 1.9844095334806762, + "grad_norm": 0.45178499648893194, + "learning_rate": 5.6452392797642605e-06, + "loss": 0.3035, + "step": 4153 + }, + { + "epoch": 1.9848874021862493, + "grad_norm": 0.4521203750930736, + "learning_rate": 5.640498789788246e-06, + "loss": 0.3094, + "step": 4154 + }, + { + "epoch": 1.9853652708918226, + "grad_norm": 0.47602827989218915, + "learning_rate": 5.635759509060969e-06, + "loss": 0.3116, + "step": 4155 + }, + { + "epoch": 1.9858431395973957, + "grad_norm": 0.4801852786233523, + "learning_rate": 5.631021438897023e-06, + "loss": 0.3241, + "step": 4156 + }, + { + "epoch": 1.9863210083029688, + "grad_norm": 0.44827145290236947, + "learning_rate": 5.626284580610657e-06, + "loss": 0.3137, + "step": 4157 + }, + { + "epoch": 1.9867988770085419, + "grad_norm": 0.520461667707988, + "learning_rate": 5.621548935515801e-06, + "loss": 0.3203, + "step": 4158 + }, + { + "epoch": 1.987276745714115, + "grad_norm": 0.4662181204209844, + "learning_rate": 5.616814504926037e-06, + "loss": 0.3011, + "step": 4159 + }, + { + "epoch": 1.9877546144196883, + "grad_norm": 0.4531917716602397, + "learning_rate": 5.612081290154607e-06, + "loss": 0.3324, + "step": 4160 + }, + { + "epoch": 1.9882324831252614, + "grad_norm": 0.47179765398325363, + "learning_rate": 5.607349292514429e-06, + "loss": 0.3167, + "step": 4161 + }, + { + "epoch": 1.9887103518308344, + "grad_norm": 0.7180803991243624, + "learning_rate": 5.602618513318072e-06, + "loss": 0.3052, + "step": 4162 + }, + { + "epoch": 1.9891882205364078, + "grad_norm": 0.4606351292564438, + "learning_rate": 5.597888953877768e-06, + "loss": 0.3148, + "step": 4163 + }, + { + "epoch": 1.9896660892419806, + "grad_norm": 0.44908296173819723, + "learning_rate": 5.5931606155054195e-06, + "loss": 0.3084, + "step": 4164 + }, + { + "epoch": 1.990143957947554, + "grad_norm": 0.5095275810168982, + "learning_rate": 5.58843349951258e-06, + "loss": 0.2959, + "step": 4165 + }, + { + "epoch": 1.990621826653127, + "grad_norm": 0.46103156732406936, + "learning_rate": 5.583707607210467e-06, + "loss": 0.3, + "step": 4166 + }, + { + "epoch": 1.9910996953587001, + "grad_norm": 0.45327848630075884, + "learning_rate": 5.578982939909965e-06, + "loss": 0.3107, + "step": 4167 + }, + { + "epoch": 1.9915775640642734, + "grad_norm": 0.47076365416712834, + "learning_rate": 5.574259498921608e-06, + "loss": 0.292, + "step": 4168 + }, + { + "epoch": 1.9920554327698465, + "grad_norm": 0.4667403906050238, + "learning_rate": 5.569537285555596e-06, + "loss": 0.3196, + "step": 4169 + }, + { + "epoch": 1.9925333014754196, + "grad_norm": 0.4926455741619648, + "learning_rate": 5.564816301121792e-06, + "loss": 0.3176, + "step": 4170 + }, + { + "epoch": 1.9930111701809927, + "grad_norm": 0.4940948150038382, + "learning_rate": 5.5600965469297105e-06, + "loss": 0.3132, + "step": 4171 + }, + { + "epoch": 1.9934890388865658, + "grad_norm": 0.4516008067632424, + "learning_rate": 5.555378024288525e-06, + "loss": 0.3259, + "step": 4172 + }, + { + "epoch": 1.9939669075921391, + "grad_norm": 0.45739246251961, + "learning_rate": 5.550660734507077e-06, + "loss": 0.3084, + "step": 4173 + }, + { + "epoch": 1.9944447762977122, + "grad_norm": 0.4526553226283521, + "learning_rate": 5.545944678893853e-06, + "loss": 0.3158, + "step": 4174 + }, + { + "epoch": 1.9949226450032853, + "grad_norm": 0.4659888379257696, + "learning_rate": 5.541229858757011e-06, + "loss": 0.2973, + "step": 4175 + }, + { + "epoch": 1.9954005137088586, + "grad_norm": 0.45374769534588466, + "learning_rate": 5.53651627540435e-06, + "loss": 0.3078, + "step": 4176 + }, + { + "epoch": 1.9958783824144315, + "grad_norm": 0.47368493531667, + "learning_rate": 5.531803930143345e-06, + "loss": 0.3152, + "step": 4177 + }, + { + "epoch": 1.9963562511200048, + "grad_norm": 0.470660248933249, + "learning_rate": 5.527092824281111e-06, + "loss": 0.2873, + "step": 4178 + }, + { + "epoch": 1.996834119825578, + "grad_norm": 0.4557800815216673, + "learning_rate": 5.522382959124422e-06, + "loss": 0.2972, + "step": 4179 + }, + { + "epoch": 1.997311988531151, + "grad_norm": 0.4748327062670451, + "learning_rate": 5.517674335979721e-06, + "loss": 0.2962, + "step": 4180 + }, + { + "epoch": 1.9977898572367243, + "grad_norm": 0.4718260147834405, + "learning_rate": 5.512966956153093e-06, + "loss": 0.3072, + "step": 4181 + }, + { + "epoch": 1.9982677259422974, + "grad_norm": 0.4766665255326838, + "learning_rate": 5.508260820950278e-06, + "loss": 0.3006, + "step": 4182 + }, + { + "epoch": 1.9987455946478705, + "grad_norm": 0.43482332106111293, + "learning_rate": 5.503555931676681e-06, + "loss": 0.3002, + "step": 4183 + }, + { + "epoch": 1.9992234633534436, + "grad_norm": 0.5879689150189751, + "learning_rate": 5.498852289637353e-06, + "loss": 0.3064, + "step": 4184 + }, + { + "epoch": 1.9997013320590167, + "grad_norm": 0.6039537333215967, + "learning_rate": 5.494149896136998e-06, + "loss": 0.2986, + "step": 4185 + }, + { + "epoch": 2.0, + "grad_norm": 0.507643331231026, + "learning_rate": 5.489448752479982e-06, + "loss": 0.3016, + "step": 4186 + }, + { + "epoch": 2.0004778687055733, + "grad_norm": 0.5557131392016892, + "learning_rate": 5.484748859970319e-06, + "loss": 0.2706, + "step": 4187 + }, + { + "epoch": 2.000955737411146, + "grad_norm": 0.5129666479233886, + "learning_rate": 5.48005021991167e-06, + "loss": 0.2874, + "step": 4188 + }, + { + "epoch": 2.0014336061167195, + "grad_norm": 0.48363044424144763, + "learning_rate": 5.475352833607363e-06, + "loss": 0.2666, + "step": 4189 + }, + { + "epoch": 2.0019114748222924, + "grad_norm": 0.4571679864809817, + "learning_rate": 5.470656702360367e-06, + "loss": 0.2944, + "step": 4190 + }, + { + "epoch": 2.0023893435278657, + "grad_norm": 0.49923351769769775, + "learning_rate": 5.4659618274733e-06, + "loss": 0.2708, + "step": 4191 + }, + { + "epoch": 2.002867212233439, + "grad_norm": 0.5726085889940755, + "learning_rate": 5.461268210248449e-06, + "loss": 0.2778, + "step": 4192 + }, + { + "epoch": 2.003345080939012, + "grad_norm": 0.5151146352983149, + "learning_rate": 5.4565758519877354e-06, + "loss": 0.2768, + "step": 4193 + }, + { + "epoch": 2.003822949644585, + "grad_norm": 0.5408118208195342, + "learning_rate": 5.45188475399273e-06, + "loss": 0.2789, + "step": 4194 + }, + { + "epoch": 2.0043008183501585, + "grad_norm": 0.5218303636526377, + "learning_rate": 5.447194917564671e-06, + "loss": 0.283, + "step": 4195 + }, + { + "epoch": 2.0047786870557314, + "grad_norm": 0.48975172353959573, + "learning_rate": 5.442506344004433e-06, + "loss": 0.2583, + "step": 4196 + }, + { + "epoch": 2.0052565557613047, + "grad_norm": 0.48789372594335206, + "learning_rate": 5.437819034612536e-06, + "loss": 0.2891, + "step": 4197 + }, + { + "epoch": 2.0057344244668776, + "grad_norm": 0.5173557034209996, + "learning_rate": 5.433132990689168e-06, + "loss": 0.2718, + "step": 4198 + }, + { + "epoch": 2.006212293172451, + "grad_norm": 0.4979488229723115, + "learning_rate": 5.42844821353415e-06, + "loss": 0.2763, + "step": 4199 + }, + { + "epoch": 2.006690161878024, + "grad_norm": 0.46766644811447605, + "learning_rate": 5.423764704446954e-06, + "loss": 0.2718, + "step": 4200 + }, + { + "epoch": 2.007168030583597, + "grad_norm": 0.4858229963672474, + "learning_rate": 5.41908246472671e-06, + "loss": 0.2717, + "step": 4201 + }, + { + "epoch": 2.0076458992891704, + "grad_norm": 0.4871157460335423, + "learning_rate": 5.414401495672183e-06, + "loss": 0.2479, + "step": 4202 + }, + { + "epoch": 2.0081237679947432, + "grad_norm": 0.4719329157303939, + "learning_rate": 5.4097217985817885e-06, + "loss": 0.2707, + "step": 4203 + }, + { + "epoch": 2.0086016367003166, + "grad_norm": 0.4784620215856789, + "learning_rate": 5.4050433747536e-06, + "loss": 0.2793, + "step": 4204 + }, + { + "epoch": 2.00907950540589, + "grad_norm": 0.4480022553241313, + "learning_rate": 5.400366225485326e-06, + "loss": 0.2551, + "step": 4205 + }, + { + "epoch": 2.0095573741114627, + "grad_norm": 0.4862355352173036, + "learning_rate": 5.395690352074321e-06, + "loss": 0.277, + "step": 4206 + }, + { + "epoch": 2.010035242817036, + "grad_norm": 0.4832048787344954, + "learning_rate": 5.391015755817597e-06, + "loss": 0.2741, + "step": 4207 + }, + { + "epoch": 2.0105131115226094, + "grad_norm": 0.5260287493937732, + "learning_rate": 5.386342438011798e-06, + "loss": 0.2737, + "step": 4208 + }, + { + "epoch": 2.0109909802281822, + "grad_norm": 0.46713974738731096, + "learning_rate": 5.3816703999532225e-06, + "loss": 0.2707, + "step": 4209 + }, + { + "epoch": 2.0114688489337555, + "grad_norm": 0.48820679807463374, + "learning_rate": 5.376999642937817e-06, + "loss": 0.271, + "step": 4210 + }, + { + "epoch": 2.0119467176393284, + "grad_norm": 0.4546420853003914, + "learning_rate": 5.372330168261162e-06, + "loss": 0.266, + "step": 4211 + }, + { + "epoch": 2.0124245863449017, + "grad_norm": 0.5092550048537878, + "learning_rate": 5.367661977218484e-06, + "loss": 0.2718, + "step": 4212 + }, + { + "epoch": 2.012902455050475, + "grad_norm": 0.6128690560615392, + "learning_rate": 5.362995071104664e-06, + "loss": 0.2576, + "step": 4213 + }, + { + "epoch": 2.013380323756048, + "grad_norm": 0.45952890722819323, + "learning_rate": 5.358329451214215e-06, + "loss": 0.2868, + "step": 4214 + }, + { + "epoch": 2.0138581924616212, + "grad_norm": 0.48006938509230707, + "learning_rate": 5.353665118841296e-06, + "loss": 0.2816, + "step": 4215 + }, + { + "epoch": 2.014336061167194, + "grad_norm": 0.45754420825780606, + "learning_rate": 5.3490020752797165e-06, + "loss": 0.2846, + "step": 4216 + }, + { + "epoch": 2.0148139298727674, + "grad_norm": 0.48929584524947306, + "learning_rate": 5.344340321822919e-06, + "loss": 0.2877, + "step": 4217 + }, + { + "epoch": 2.0152917985783407, + "grad_norm": 0.4735902467266419, + "learning_rate": 5.339679859763988e-06, + "loss": 0.2622, + "step": 4218 + }, + { + "epoch": 2.0157696672839136, + "grad_norm": 0.4723270046896756, + "learning_rate": 5.33502069039566e-06, + "loss": 0.283, + "step": 4219 + }, + { + "epoch": 2.016247535989487, + "grad_norm": 0.47606202201064185, + "learning_rate": 5.330362815010306e-06, + "loss": 0.2811, + "step": 4220 + }, + { + "epoch": 2.0167254046950602, + "grad_norm": 0.47024447912354794, + "learning_rate": 5.325706234899931e-06, + "loss": 0.2724, + "step": 4221 + }, + { + "epoch": 2.017203273400633, + "grad_norm": 0.46214107281210676, + "learning_rate": 5.321050951356197e-06, + "loss": 0.2671, + "step": 4222 + }, + { + "epoch": 2.0176811421062064, + "grad_norm": 0.4685381460638152, + "learning_rate": 5.316396965670394e-06, + "loss": 0.2763, + "step": 4223 + }, + { + "epoch": 2.0181590108117793, + "grad_norm": 0.45819421363781176, + "learning_rate": 5.3117442791334514e-06, + "loss": 0.264, + "step": 4224 + }, + { + "epoch": 2.0186368795173526, + "grad_norm": 0.45413644290919253, + "learning_rate": 5.307092893035951e-06, + "loss": 0.285, + "step": 4225 + }, + { + "epoch": 2.019114748222926, + "grad_norm": 0.45027926657460016, + "learning_rate": 5.3024428086681e-06, + "loss": 0.2687, + "step": 4226 + }, + { + "epoch": 2.019592616928499, + "grad_norm": 0.4656522036888355, + "learning_rate": 5.297794027319747e-06, + "loss": 0.2688, + "step": 4227 + }, + { + "epoch": 2.020070485634072, + "grad_norm": 0.4763467893183739, + "learning_rate": 5.293146550280388e-06, + "loss": 0.2758, + "step": 4228 + }, + { + "epoch": 2.020548354339645, + "grad_norm": 0.46374013514301876, + "learning_rate": 5.28850037883915e-06, + "loss": 0.2621, + "step": 4229 + }, + { + "epoch": 2.0210262230452183, + "grad_norm": 0.48224561494719265, + "learning_rate": 5.2838555142847925e-06, + "loss": 0.2731, + "step": 4230 + }, + { + "epoch": 2.0215040917507916, + "grad_norm": 0.6679479212555328, + "learning_rate": 5.2792119579057275e-06, + "loss": 0.2578, + "step": 4231 + }, + { + "epoch": 2.0219819604563645, + "grad_norm": 0.49348773245256794, + "learning_rate": 5.274569710989994e-06, + "loss": 0.2573, + "step": 4232 + }, + { + "epoch": 2.022459829161938, + "grad_norm": 0.47356875200053883, + "learning_rate": 5.269928774825261e-06, + "loss": 0.2815, + "step": 4233 + }, + { + "epoch": 2.022937697867511, + "grad_norm": 0.531779784720976, + "learning_rate": 5.265289150698855e-06, + "loss": 0.286, + "step": 4234 + }, + { + "epoch": 2.023415566573084, + "grad_norm": 0.46128402209008207, + "learning_rate": 5.260650839897719e-06, + "loss": 0.2742, + "step": 4235 + }, + { + "epoch": 2.0238934352786573, + "grad_norm": 0.5084306437023877, + "learning_rate": 5.256013843708435e-06, + "loss": 0.3016, + "step": 4236 + }, + { + "epoch": 2.02437130398423, + "grad_norm": 0.5186816576776002, + "learning_rate": 5.251378163417232e-06, + "loss": 0.2646, + "step": 4237 + }, + { + "epoch": 2.0248491726898035, + "grad_norm": 0.7432820985970882, + "learning_rate": 5.246743800309964e-06, + "loss": 0.2893, + "step": 4238 + }, + { + "epoch": 2.025327041395377, + "grad_norm": 0.5031043114302443, + "learning_rate": 5.242110755672114e-06, + "loss": 0.2574, + "step": 4239 + }, + { + "epoch": 2.0258049101009497, + "grad_norm": 0.47460643773168915, + "learning_rate": 5.237479030788817e-06, + "loss": 0.2619, + "step": 4240 + }, + { + "epoch": 2.026282778806523, + "grad_norm": 0.483227882308964, + "learning_rate": 5.232848626944827e-06, + "loss": 0.2796, + "step": 4241 + }, + { + "epoch": 2.026760647512096, + "grad_norm": 0.5588184535169874, + "learning_rate": 5.228219545424533e-06, + "loss": 0.2732, + "step": 4242 + }, + { + "epoch": 2.027238516217669, + "grad_norm": 0.4734462488933155, + "learning_rate": 5.2235917875119656e-06, + "loss": 0.2708, + "step": 4243 + }, + { + "epoch": 2.0277163849232425, + "grad_norm": 0.4781528576455179, + "learning_rate": 5.218965354490786e-06, + "loss": 0.2753, + "step": 4244 + }, + { + "epoch": 2.0281942536288153, + "grad_norm": 0.5011498208217005, + "learning_rate": 5.214340247644278e-06, + "loss": 0.2675, + "step": 4245 + }, + { + "epoch": 2.0286721223343886, + "grad_norm": 0.48552354280981097, + "learning_rate": 5.209716468255367e-06, + "loss": 0.274, + "step": 4246 + }, + { + "epoch": 2.029149991039962, + "grad_norm": 0.5513420020431447, + "learning_rate": 5.205094017606611e-06, + "loss": 0.2806, + "step": 4247 + }, + { + "epoch": 2.029627859745535, + "grad_norm": 0.7169421131443054, + "learning_rate": 5.2004728969801945e-06, + "loss": 0.281, + "step": 4248 + }, + { + "epoch": 2.030105728451108, + "grad_norm": 0.48421888141015607, + "learning_rate": 5.19585310765793e-06, + "loss": 0.2649, + "step": 4249 + }, + { + "epoch": 2.030583597156681, + "grad_norm": 0.4750848932286133, + "learning_rate": 5.191234650921273e-06, + "loss": 0.2617, + "step": 4250 + }, + { + "epoch": 2.0310614658622543, + "grad_norm": 0.45867486142995845, + "learning_rate": 5.1866175280513e-06, + "loss": 0.2652, + "step": 4251 + }, + { + "epoch": 2.0315393345678276, + "grad_norm": 0.4993525355930482, + "learning_rate": 5.182001740328713e-06, + "loss": 0.2535, + "step": 4252 + }, + { + "epoch": 2.0320172032734005, + "grad_norm": 0.478090275533228, + "learning_rate": 5.17738728903386e-06, + "loss": 0.2761, + "step": 4253 + }, + { + "epoch": 2.032495071978974, + "grad_norm": 0.47697353288833244, + "learning_rate": 5.172774175446703e-06, + "loss": 0.2733, + "step": 4254 + }, + { + "epoch": 2.0329729406845467, + "grad_norm": 0.46412526746517374, + "learning_rate": 5.168162400846835e-06, + "loss": 0.2818, + "step": 4255 + }, + { + "epoch": 2.03345080939012, + "grad_norm": 0.4740087446649736, + "learning_rate": 5.16355196651349e-06, + "loss": 0.2696, + "step": 4256 + }, + { + "epoch": 2.0339286780956933, + "grad_norm": 0.4895373321168128, + "learning_rate": 5.158942873725514e-06, + "loss": 0.2909, + "step": 4257 + }, + { + "epoch": 2.034406546801266, + "grad_norm": 0.48823594256633335, + "learning_rate": 5.154335123761387e-06, + "loss": 0.2603, + "step": 4258 + }, + { + "epoch": 2.0348844155068395, + "grad_norm": 0.4403350386861437, + "learning_rate": 5.149728717899225e-06, + "loss": 0.2739, + "step": 4259 + }, + { + "epoch": 2.035362284212413, + "grad_norm": 0.4744290808806091, + "learning_rate": 5.145123657416759e-06, + "loss": 0.2665, + "step": 4260 + }, + { + "epoch": 2.0358401529179857, + "grad_norm": 0.49559161901762544, + "learning_rate": 5.140519943591348e-06, + "loss": 0.2802, + "step": 4261 + }, + { + "epoch": 2.036318021623559, + "grad_norm": 0.4945068985838198, + "learning_rate": 5.135917577699988e-06, + "loss": 0.2627, + "step": 4262 + }, + { + "epoch": 2.036795890329132, + "grad_norm": 0.47314478267085935, + "learning_rate": 5.131316561019293e-06, + "loss": 0.2787, + "step": 4263 + }, + { + "epoch": 2.037273759034705, + "grad_norm": 0.48612430158367564, + "learning_rate": 5.126716894825496e-06, + "loss": 0.2784, + "step": 4264 + }, + { + "epoch": 2.0377516277402785, + "grad_norm": 0.4685580006629703, + "learning_rate": 5.122118580394473e-06, + "loss": 0.2604, + "step": 4265 + }, + { + "epoch": 2.0382294964458514, + "grad_norm": 0.4876824775843487, + "learning_rate": 5.117521619001713e-06, + "loss": 0.2737, + "step": 4266 + }, + { + "epoch": 2.0387073651514247, + "grad_norm": 0.4533392736697577, + "learning_rate": 5.112926011922326e-06, + "loss": 0.2571, + "step": 4267 + }, + { + "epoch": 2.0391852338569976, + "grad_norm": 0.48519263547086117, + "learning_rate": 5.10833176043106e-06, + "loss": 0.2742, + "step": 4268 + }, + { + "epoch": 2.039663102562571, + "grad_norm": 0.4562442726047049, + "learning_rate": 5.103738865802277e-06, + "loss": 0.2658, + "step": 4269 + }, + { + "epoch": 2.040140971268144, + "grad_norm": 0.4672534849486868, + "learning_rate": 5.099147329309959e-06, + "loss": 0.256, + "step": 4270 + }, + { + "epoch": 2.040618839973717, + "grad_norm": 0.4772694807476173, + "learning_rate": 5.0945571522277255e-06, + "loss": 0.2663, + "step": 4271 + }, + { + "epoch": 2.0410967086792904, + "grad_norm": 0.5706905661498267, + "learning_rate": 5.08996833582881e-06, + "loss": 0.2859, + "step": 4272 + }, + { + "epoch": 2.0415745773848637, + "grad_norm": 0.4947667148137507, + "learning_rate": 5.0853808813860616e-06, + "loss": 0.3067, + "step": 4273 + }, + { + "epoch": 2.0420524460904366, + "grad_norm": 0.4851005503979229, + "learning_rate": 5.080794790171968e-06, + "loss": 0.2716, + "step": 4274 + }, + { + "epoch": 2.04253031479601, + "grad_norm": 0.48079461477386043, + "learning_rate": 5.076210063458622e-06, + "loss": 0.2688, + "step": 4275 + }, + { + "epoch": 2.0430081835015828, + "grad_norm": 0.4673487498716285, + "learning_rate": 5.071626702517756e-06, + "loss": 0.2632, + "step": 4276 + }, + { + "epoch": 2.043486052207156, + "grad_norm": 0.4653754050455356, + "learning_rate": 5.067044708620702e-06, + "loss": 0.2827, + "step": 4277 + }, + { + "epoch": 2.0439639209127294, + "grad_norm": 0.4697586401370185, + "learning_rate": 5.062464083038434e-06, + "loss": 0.2667, + "step": 4278 + }, + { + "epoch": 2.0444417896183023, + "grad_norm": 0.5260281606453924, + "learning_rate": 5.057884827041533e-06, + "loss": 0.2515, + "step": 4279 + }, + { + "epoch": 2.0449196583238756, + "grad_norm": 0.4994642856059735, + "learning_rate": 5.0533069419002e-06, + "loss": 0.2671, + "step": 4280 + }, + { + "epoch": 2.045397527029449, + "grad_norm": 0.48493301210780937, + "learning_rate": 5.048730428884268e-06, + "loss": 0.2727, + "step": 4281 + }, + { + "epoch": 2.0458753957350218, + "grad_norm": 0.46353129132177423, + "learning_rate": 5.044155289263174e-06, + "loss": 0.2765, + "step": 4282 + }, + { + "epoch": 2.046353264440595, + "grad_norm": 0.5018313449986281, + "learning_rate": 5.03958152430598e-06, + "loss": 0.2727, + "step": 4283 + }, + { + "epoch": 2.046831133146168, + "grad_norm": 0.4851187115521681, + "learning_rate": 5.035009135281375e-06, + "loss": 0.2652, + "step": 4284 + }, + { + "epoch": 2.0473090018517413, + "grad_norm": 0.4723372968025927, + "learning_rate": 5.030438123457655e-06, + "loss": 0.2729, + "step": 4285 + }, + { + "epoch": 2.0477868705573146, + "grad_norm": 0.47152065145636834, + "learning_rate": 5.025868490102734e-06, + "loss": 0.2724, + "step": 4286 + }, + { + "epoch": 2.0482647392628874, + "grad_norm": 0.5260680519772822, + "learning_rate": 5.021300236484156e-06, + "loss": 0.2648, + "step": 4287 + }, + { + "epoch": 2.0487426079684607, + "grad_norm": 0.47329119567837263, + "learning_rate": 5.016733363869068e-06, + "loss": 0.2713, + "step": 4288 + }, + { + "epoch": 2.0492204766740336, + "grad_norm": 0.50642114847147, + "learning_rate": 5.01216787352424e-06, + "loss": 0.2638, + "step": 4289 + }, + { + "epoch": 2.049698345379607, + "grad_norm": 0.4705941501131214, + "learning_rate": 5.007603766716063e-06, + "loss": 0.2643, + "step": 4290 + }, + { + "epoch": 2.0501762140851802, + "grad_norm": 0.4803945099496459, + "learning_rate": 5.003041044710536e-06, + "loss": 0.2717, + "step": 4291 + }, + { + "epoch": 2.050654082790753, + "grad_norm": 0.4612851976830831, + "learning_rate": 4.998479708773275e-06, + "loss": 0.2805, + "step": 4292 + }, + { + "epoch": 2.0511319514963264, + "grad_norm": 0.4759211681133958, + "learning_rate": 4.993919760169521e-06, + "loss": 0.2659, + "step": 4293 + }, + { + "epoch": 2.0516098202018993, + "grad_norm": 0.4979267001248353, + "learning_rate": 4.98936120016412e-06, + "loss": 0.2764, + "step": 4294 + }, + { + "epoch": 2.0520876889074726, + "grad_norm": 0.5214662750542031, + "learning_rate": 4.984804030021533e-06, + "loss": 0.2707, + "step": 4295 + }, + { + "epoch": 2.052565557613046, + "grad_norm": 0.4913707086962388, + "learning_rate": 4.9802482510058445e-06, + "loss": 0.2687, + "step": 4296 + }, + { + "epoch": 2.053043426318619, + "grad_norm": 0.5027711346349333, + "learning_rate": 4.975693864380744e-06, + "loss": 0.2613, + "step": 4297 + }, + { + "epoch": 2.053521295024192, + "grad_norm": 0.5024104113465125, + "learning_rate": 4.971140871409536e-06, + "loss": 0.2741, + "step": 4298 + }, + { + "epoch": 2.0539991637297654, + "grad_norm": 0.4811287363006302, + "learning_rate": 4.966589273355144e-06, + "loss": 0.2689, + "step": 4299 + }, + { + "epoch": 2.0544770324353383, + "grad_norm": 0.4644489936036945, + "learning_rate": 4.962039071480102e-06, + "loss": 0.2791, + "step": 4300 + }, + { + "epoch": 2.0549549011409116, + "grad_norm": 0.5336660306893748, + "learning_rate": 4.957490267046549e-06, + "loss": 0.2803, + "step": 4301 + }, + { + "epoch": 2.0554327698464845, + "grad_norm": 0.47071851786985547, + "learning_rate": 4.95294286131625e-06, + "loss": 0.2673, + "step": 4302 + }, + { + "epoch": 2.055910638552058, + "grad_norm": 0.7828065837615993, + "learning_rate": 4.948396855550575e-06, + "loss": 0.2633, + "step": 4303 + }, + { + "epoch": 2.056388507257631, + "grad_norm": 0.45464028072528234, + "learning_rate": 4.943852251010498e-06, + "loss": 0.2793, + "step": 4304 + }, + { + "epoch": 2.056866375963204, + "grad_norm": 0.6373445627548838, + "learning_rate": 4.939309048956622e-06, + "loss": 0.2825, + "step": 4305 + }, + { + "epoch": 2.0573442446687773, + "grad_norm": 0.4671574195509635, + "learning_rate": 4.934767250649146e-06, + "loss": 0.2825, + "step": 4306 + }, + { + "epoch": 2.0578221133743506, + "grad_norm": 0.5249600277106131, + "learning_rate": 4.9302268573478825e-06, + "loss": 0.2798, + "step": 4307 + }, + { + "epoch": 2.0582999820799235, + "grad_norm": 0.532751022686781, + "learning_rate": 4.925687870312263e-06, + "loss": 0.2704, + "step": 4308 + }, + { + "epoch": 2.058777850785497, + "grad_norm": 0.4936173702678726, + "learning_rate": 4.921150290801316e-06, + "loss": 0.2791, + "step": 4309 + }, + { + "epoch": 2.0592557194910697, + "grad_norm": 0.46696117374431717, + "learning_rate": 4.9166141200736885e-06, + "loss": 0.2717, + "step": 4310 + }, + { + "epoch": 2.059733588196643, + "grad_norm": 0.4612537280249499, + "learning_rate": 4.912079359387638e-06, + "loss": 0.2712, + "step": 4311 + }, + { + "epoch": 2.0602114569022163, + "grad_norm": 0.44406796450445546, + "learning_rate": 4.907546010001026e-06, + "loss": 0.2772, + "step": 4312 + }, + { + "epoch": 2.060689325607789, + "grad_norm": 0.4685017422306754, + "learning_rate": 4.903014073171315e-06, + "loss": 0.2689, + "step": 4313 + }, + { + "epoch": 2.0611671943133625, + "grad_norm": 0.49168518163616726, + "learning_rate": 4.898483550155595e-06, + "loss": 0.2685, + "step": 4314 + }, + { + "epoch": 2.0616450630189354, + "grad_norm": 0.47506846437092554, + "learning_rate": 4.89395444221055e-06, + "loss": 0.2636, + "step": 4315 + }, + { + "epoch": 2.0621229317245087, + "grad_norm": 0.44749028284198716, + "learning_rate": 4.889426750592469e-06, + "loss": 0.273, + "step": 4316 + }, + { + "epoch": 2.062600800430082, + "grad_norm": 0.4643039012469354, + "learning_rate": 4.884900476557263e-06, + "loss": 0.2843, + "step": 4317 + }, + { + "epoch": 2.063078669135655, + "grad_norm": 0.524850579695799, + "learning_rate": 4.880375621360435e-06, + "loss": 0.2732, + "step": 4318 + }, + { + "epoch": 2.063556537841228, + "grad_norm": 0.4937697172955831, + "learning_rate": 4.8758521862570975e-06, + "loss": 0.2877, + "step": 4319 + }, + { + "epoch": 2.0640344065468015, + "grad_norm": 0.4568437096178462, + "learning_rate": 4.871330172501979e-06, + "loss": 0.2591, + "step": 4320 + }, + { + "epoch": 2.0645122752523744, + "grad_norm": 0.4818485682691764, + "learning_rate": 4.866809581349403e-06, + "loss": 0.2842, + "step": 4321 + }, + { + "epoch": 2.0649901439579477, + "grad_norm": 0.500279053318082, + "learning_rate": 4.862290414053296e-06, + "loss": 0.2689, + "step": 4322 + }, + { + "epoch": 2.0654680126635205, + "grad_norm": 0.4796867445554122, + "learning_rate": 4.857772671867206e-06, + "loss": 0.261, + "step": 4323 + }, + { + "epoch": 2.065945881369094, + "grad_norm": 0.45209725252949634, + "learning_rate": 4.853256356044269e-06, + "loss": 0.263, + "step": 4324 + }, + { + "epoch": 2.066423750074667, + "grad_norm": 0.5262595142130551, + "learning_rate": 4.848741467837228e-06, + "loss": 0.3065, + "step": 4325 + }, + { + "epoch": 2.06690161878024, + "grad_norm": 0.5027222558313732, + "learning_rate": 4.844228008498441e-06, + "loss": 0.2771, + "step": 4326 + }, + { + "epoch": 2.0673794874858133, + "grad_norm": 0.5140923745008394, + "learning_rate": 4.839715979279857e-06, + "loss": 0.2673, + "step": 4327 + }, + { + "epoch": 2.067857356191386, + "grad_norm": 0.6688922617742407, + "learning_rate": 4.835205381433033e-06, + "loss": 0.2759, + "step": 4328 + }, + { + "epoch": 2.0683352248969595, + "grad_norm": 0.5106868534593813, + "learning_rate": 4.830696216209133e-06, + "loss": 0.2707, + "step": 4329 + }, + { + "epoch": 2.068813093602533, + "grad_norm": 0.5125392579142816, + "learning_rate": 4.826188484858918e-06, + "loss": 0.2796, + "step": 4330 + }, + { + "epoch": 2.0692909623081057, + "grad_norm": 0.4731305023489138, + "learning_rate": 4.821682188632749e-06, + "loss": 0.2698, + "step": 4331 + }, + { + "epoch": 2.069768831013679, + "grad_norm": 0.537078693052272, + "learning_rate": 4.8171773287806e-06, + "loss": 0.2763, + "step": 4332 + }, + { + "epoch": 2.0702466997192523, + "grad_norm": 0.48986368836511185, + "learning_rate": 4.812673906552038e-06, + "loss": 0.2682, + "step": 4333 + }, + { + "epoch": 2.070724568424825, + "grad_norm": 0.49970157825319683, + "learning_rate": 4.808171923196227e-06, + "loss": 0.2745, + "step": 4334 + }, + { + "epoch": 2.0712024371303985, + "grad_norm": 0.46279135492922463, + "learning_rate": 4.803671379961945e-06, + "loss": 0.2732, + "step": 4335 + }, + { + "epoch": 2.0716803058359714, + "grad_norm": 0.5228404024805013, + "learning_rate": 4.7991722780975614e-06, + "loss": 0.2569, + "step": 4336 + }, + { + "epoch": 2.0721581745415447, + "grad_norm": 0.47489759603230547, + "learning_rate": 4.794674618851044e-06, + "loss": 0.2702, + "step": 4337 + }, + { + "epoch": 2.072636043247118, + "grad_norm": 0.45711910658365396, + "learning_rate": 4.7901784034699695e-06, + "loss": 0.2581, + "step": 4338 + }, + { + "epoch": 2.073113911952691, + "grad_norm": 0.5062614569880117, + "learning_rate": 4.785683633201507e-06, + "loss": 0.2494, + "step": 4339 + }, + { + "epoch": 2.073591780658264, + "grad_norm": 0.4838987562625956, + "learning_rate": 4.781190309292421e-06, + "loss": 0.2666, + "step": 4340 + }, + { + "epoch": 2.074069649363837, + "grad_norm": 0.48244525999987964, + "learning_rate": 4.776698432989089e-06, + "loss": 0.2641, + "step": 4341 + }, + { + "epoch": 2.0745475180694104, + "grad_norm": 0.481389690872325, + "learning_rate": 4.7722080055374745e-06, + "loss": 0.2766, + "step": 4342 + }, + { + "epoch": 2.0750253867749837, + "grad_norm": 0.547344894388519, + "learning_rate": 4.767719028183139e-06, + "loss": 0.2779, + "step": 4343 + }, + { + "epoch": 2.0755032554805566, + "grad_norm": 0.4969679278468139, + "learning_rate": 4.7632315021712494e-06, + "loss": 0.2767, + "step": 4344 + }, + { + "epoch": 2.07598112418613, + "grad_norm": 0.4969616199208339, + "learning_rate": 4.758745428746569e-06, + "loss": 0.2609, + "step": 4345 + }, + { + "epoch": 2.076458992891703, + "grad_norm": 0.46420623564379426, + "learning_rate": 4.754260809153453e-06, + "loss": 0.2606, + "step": 4346 + }, + { + "epoch": 2.076936861597276, + "grad_norm": 0.4624803013184737, + "learning_rate": 4.749777644635851e-06, + "loss": 0.2658, + "step": 4347 + }, + { + "epoch": 2.0774147303028494, + "grad_norm": 0.4564411009275747, + "learning_rate": 4.745295936437323e-06, + "loss": 0.2893, + "step": 4348 + }, + { + "epoch": 2.0778925990084223, + "grad_norm": 0.45522303982023976, + "learning_rate": 4.74081568580101e-06, + "loss": 0.2888, + "step": 4349 + }, + { + "epoch": 2.0783704677139956, + "grad_norm": 0.45649179256830386, + "learning_rate": 4.736336893969652e-06, + "loss": 0.2627, + "step": 4350 + }, + { + "epoch": 2.078848336419569, + "grad_norm": 0.8720177385632242, + "learning_rate": 4.731859562185593e-06, + "loss": 0.2504, + "step": 4351 + }, + { + "epoch": 2.0793262051251418, + "grad_norm": 0.464019520768861, + "learning_rate": 4.727383691690765e-06, + "loss": 0.2803, + "step": 4352 + }, + { + "epoch": 2.079804073830715, + "grad_norm": 0.4829753872971073, + "learning_rate": 4.722909283726687e-06, + "loss": 0.2747, + "step": 4353 + }, + { + "epoch": 2.080281942536288, + "grad_norm": 0.49238958382928866, + "learning_rate": 4.718436339534493e-06, + "loss": 0.2696, + "step": 4354 + }, + { + "epoch": 2.0807598112418613, + "grad_norm": 0.5331681056474048, + "learning_rate": 4.7139648603548925e-06, + "loss": 0.2803, + "step": 4355 + }, + { + "epoch": 2.0812376799474346, + "grad_norm": 0.5073464839867855, + "learning_rate": 4.709494847428193e-06, + "loss": 0.2841, + "step": 4356 + }, + { + "epoch": 2.0817155486530075, + "grad_norm": 0.45403717199200416, + "learning_rate": 4.7050263019943035e-06, + "loss": 0.2693, + "step": 4357 + }, + { + "epoch": 2.0821934173585808, + "grad_norm": 0.4790899870289159, + "learning_rate": 4.700559225292714e-06, + "loss": 0.2723, + "step": 4358 + }, + { + "epoch": 2.082671286064154, + "grad_norm": 0.5479563382822494, + "learning_rate": 4.696093618562514e-06, + "loss": 0.2827, + "step": 4359 + }, + { + "epoch": 2.083149154769727, + "grad_norm": 0.4645988394382689, + "learning_rate": 4.691629483042387e-06, + "loss": 0.2834, + "step": 4360 + }, + { + "epoch": 2.0836270234753003, + "grad_norm": 0.48819600396553203, + "learning_rate": 4.687166819970605e-06, + "loss": 0.2806, + "step": 4361 + }, + { + "epoch": 2.084104892180873, + "grad_norm": 0.48443281966006396, + "learning_rate": 4.682705630585024e-06, + "loss": 0.2777, + "step": 4362 + }, + { + "epoch": 2.0845827608864465, + "grad_norm": 0.4840904657426456, + "learning_rate": 4.678245916123111e-06, + "loss": 0.2632, + "step": 4363 + }, + { + "epoch": 2.0850606295920198, + "grad_norm": 0.4817924591446688, + "learning_rate": 4.673787677821906e-06, + "loss": 0.2571, + "step": 4364 + }, + { + "epoch": 2.0855384982975926, + "grad_norm": 0.4525499762733216, + "learning_rate": 4.669330916918043e-06, + "loss": 0.2837, + "step": 4365 + }, + { + "epoch": 2.086016367003166, + "grad_norm": 0.4568315340280084, + "learning_rate": 4.664875634647756e-06, + "loss": 0.2785, + "step": 4366 + }, + { + "epoch": 2.086494235708739, + "grad_norm": 0.5183117984656025, + "learning_rate": 4.660421832246858e-06, + "loss": 0.2936, + "step": 4367 + }, + { + "epoch": 2.086972104414312, + "grad_norm": 0.4872870778331395, + "learning_rate": 4.655969510950752e-06, + "loss": 0.2733, + "step": 4368 + }, + { + "epoch": 2.0874499731198854, + "grad_norm": 0.46677867691861946, + "learning_rate": 4.65151867199444e-06, + "loss": 0.2678, + "step": 4369 + }, + { + "epoch": 2.0879278418254583, + "grad_norm": 0.46455148899464205, + "learning_rate": 4.647069316612502e-06, + "loss": 0.2687, + "step": 4370 + }, + { + "epoch": 2.0884057105310316, + "grad_norm": 1.2838537701376698, + "learning_rate": 4.6426214460391095e-06, + "loss": 0.2844, + "step": 4371 + }, + { + "epoch": 2.088883579236605, + "grad_norm": 0.47029125249903925, + "learning_rate": 4.6381750615080275e-06, + "loss": 0.2618, + "step": 4372 + }, + { + "epoch": 2.089361447942178, + "grad_norm": 0.4888788479772587, + "learning_rate": 4.633730164252603e-06, + "loss": 0.2643, + "step": 4373 + }, + { + "epoch": 2.089839316647751, + "grad_norm": 0.4591781777504566, + "learning_rate": 4.629286755505768e-06, + "loss": 0.2754, + "step": 4374 + }, + { + "epoch": 2.090317185353324, + "grad_norm": 0.5110989494551736, + "learning_rate": 4.624844836500052e-06, + "loss": 0.2586, + "step": 4375 + }, + { + "epoch": 2.0907950540588973, + "grad_norm": 0.4951473466562376, + "learning_rate": 4.620404408467559e-06, + "loss": 0.2701, + "step": 4376 + }, + { + "epoch": 2.0912729227644706, + "grad_norm": 0.45662970354071386, + "learning_rate": 4.615965472639992e-06, + "loss": 0.2649, + "step": 4377 + }, + { + "epoch": 2.0917507914700435, + "grad_norm": 0.46953730734656046, + "learning_rate": 4.611528030248629e-06, + "loss": 0.2712, + "step": 4378 + }, + { + "epoch": 2.092228660175617, + "grad_norm": 0.47897826667944887, + "learning_rate": 4.607092082524341e-06, + "loss": 0.2544, + "step": 4379 + }, + { + "epoch": 2.0927065288811897, + "grad_norm": 0.45754772942931315, + "learning_rate": 4.60265763069758e-06, + "loss": 0.2739, + "step": 4380 + }, + { + "epoch": 2.093184397586763, + "grad_norm": 0.47343432218709175, + "learning_rate": 4.598224675998381e-06, + "loss": 0.2632, + "step": 4381 + }, + { + "epoch": 2.0936622662923363, + "grad_norm": 0.5079217054437215, + "learning_rate": 4.593793219656375e-06, + "loss": 0.2884, + "step": 4382 + }, + { + "epoch": 2.094140134997909, + "grad_norm": 0.4755846137336544, + "learning_rate": 4.589363262900767e-06, + "loss": 0.2769, + "step": 4383 + }, + { + "epoch": 2.0946180037034825, + "grad_norm": 0.4624399910521351, + "learning_rate": 4.5849348069603424e-06, + "loss": 0.2833, + "step": 4384 + }, + { + "epoch": 2.095095872409056, + "grad_norm": 0.47379072504138925, + "learning_rate": 4.580507853063487e-06, + "loss": 0.2723, + "step": 4385 + }, + { + "epoch": 2.0955737411146287, + "grad_norm": 0.5404425798815802, + "learning_rate": 4.5760824024381545e-06, + "loss": 0.2836, + "step": 4386 + }, + { + "epoch": 2.096051609820202, + "grad_norm": 0.4744131279778523, + "learning_rate": 4.571658456311885e-06, + "loss": 0.2646, + "step": 4387 + }, + { + "epoch": 2.096529478525775, + "grad_norm": 0.5257445500444607, + "learning_rate": 4.567236015911808e-06, + "loss": 0.2756, + "step": 4388 + }, + { + "epoch": 2.097007347231348, + "grad_norm": 0.46379662230318247, + "learning_rate": 4.562815082464628e-06, + "loss": 0.2577, + "step": 4389 + }, + { + "epoch": 2.0974852159369215, + "grad_norm": 0.4711856915631454, + "learning_rate": 4.5583956571966295e-06, + "loss": 0.2695, + "step": 4390 + }, + { + "epoch": 2.0979630846424944, + "grad_norm": 0.4668862612467058, + "learning_rate": 4.5539777413336916e-06, + "loss": 0.2811, + "step": 4391 + }, + { + "epoch": 2.0984409533480677, + "grad_norm": 0.8250470635919427, + "learning_rate": 4.549561336101263e-06, + "loss": 0.2681, + "step": 4392 + }, + { + "epoch": 2.0989188220536406, + "grad_norm": 0.5474390912842495, + "learning_rate": 4.545146442724371e-06, + "loss": 0.278, + "step": 4393 + }, + { + "epoch": 2.099396690759214, + "grad_norm": 0.45864322699427934, + "learning_rate": 4.540733062427637e-06, + "loss": 0.2572, + "step": 4394 + }, + { + "epoch": 2.099874559464787, + "grad_norm": 0.4642926001377059, + "learning_rate": 4.5363211964352524e-06, + "loss": 0.2754, + "step": 4395 + }, + { + "epoch": 2.10035242817036, + "grad_norm": 1.2708133595604119, + "learning_rate": 4.531910845970986e-06, + "loss": 0.2515, + "step": 4396 + }, + { + "epoch": 2.1008302968759334, + "grad_norm": 0.46107312847431264, + "learning_rate": 4.527502012258201e-06, + "loss": 0.2798, + "step": 4397 + }, + { + "epoch": 2.1013081655815067, + "grad_norm": 0.4654804719208443, + "learning_rate": 4.523094696519822e-06, + "loss": 0.2599, + "step": 4398 + }, + { + "epoch": 2.1017860342870796, + "grad_norm": 0.5035051987740295, + "learning_rate": 4.5186888999783604e-06, + "loss": 0.2796, + "step": 4399 + }, + { + "epoch": 2.102263902992653, + "grad_norm": 0.4639728245767181, + "learning_rate": 4.514284623855915e-06, + "loss": 0.2711, + "step": 4400 + }, + { + "epoch": 2.1027417716982257, + "grad_norm": 0.4628009973817066, + "learning_rate": 4.509881869374146e-06, + "loss": 0.2749, + "step": 4401 + }, + { + "epoch": 2.103219640403799, + "grad_norm": 0.46310930729244, + "learning_rate": 4.5054806377543e-06, + "loss": 0.2766, + "step": 4402 + }, + { + "epoch": 2.1036975091093724, + "grad_norm": 0.4815051304539407, + "learning_rate": 4.501080930217206e-06, + "loss": 0.2686, + "step": 4403 + }, + { + "epoch": 2.1041753778149452, + "grad_norm": 0.452776883071896, + "learning_rate": 4.4966827479832645e-06, + "loss": 0.2815, + "step": 4404 + }, + { + "epoch": 2.1046532465205186, + "grad_norm": 0.5572260774768857, + "learning_rate": 4.4922860922724466e-06, + "loss": 0.2795, + "step": 4405 + }, + { + "epoch": 2.1051311152260914, + "grad_norm": 0.46536041364576214, + "learning_rate": 4.487890964304317e-06, + "loss": 0.2607, + "step": 4406 + }, + { + "epoch": 2.1056089839316647, + "grad_norm": 0.4671149161280602, + "learning_rate": 4.483497365298001e-06, + "loss": 0.2656, + "step": 4407 + }, + { + "epoch": 2.106086852637238, + "grad_norm": 0.47754698203735124, + "learning_rate": 4.479105296472204e-06, + "loss": 0.2636, + "step": 4408 + }, + { + "epoch": 2.106564721342811, + "grad_norm": 0.4691739905835083, + "learning_rate": 4.474714759045213e-06, + "loss": 0.2797, + "step": 4409 + }, + { + "epoch": 2.1070425900483842, + "grad_norm": 0.5429458246931929, + "learning_rate": 4.470325754234881e-06, + "loss": 0.2727, + "step": 4410 + }, + { + "epoch": 2.1075204587539575, + "grad_norm": 0.5405366505121745, + "learning_rate": 4.465938283258643e-06, + "loss": 0.2734, + "step": 4411 + }, + { + "epoch": 2.1079983274595304, + "grad_norm": 0.7314425834233769, + "learning_rate": 4.461552347333509e-06, + "loss": 0.2515, + "step": 4412 + }, + { + "epoch": 2.1084761961651037, + "grad_norm": 0.6100608642443537, + "learning_rate": 4.457167947676058e-06, + "loss": 0.2697, + "step": 4413 + }, + { + "epoch": 2.1089540648706766, + "grad_norm": 0.4837731698853667, + "learning_rate": 4.45278508550244e-06, + "loss": 0.2907, + "step": 4414 + }, + { + "epoch": 2.10943193357625, + "grad_norm": 0.47440995597138375, + "learning_rate": 4.448403762028391e-06, + "loss": 0.2709, + "step": 4415 + }, + { + "epoch": 2.1099098022818232, + "grad_norm": 0.5691216197594825, + "learning_rate": 4.444023978469212e-06, + "loss": 0.2778, + "step": 4416 + }, + { + "epoch": 2.110387670987396, + "grad_norm": 0.4528201720944284, + "learning_rate": 4.4396457360397704e-06, + "loss": 0.2619, + "step": 4417 + }, + { + "epoch": 2.1108655396929694, + "grad_norm": 0.7031077512368775, + "learning_rate": 4.435269035954523e-06, + "loss": 0.2658, + "step": 4418 + }, + { + "epoch": 2.1113434083985423, + "grad_norm": 0.4719207523909015, + "learning_rate": 4.430893879427486e-06, + "loss": 0.2622, + "step": 4419 + }, + { + "epoch": 2.1118212771041156, + "grad_norm": 0.4711222028375983, + "learning_rate": 4.426520267672244e-06, + "loss": 0.2908, + "step": 4420 + }, + { + "epoch": 2.112299145809689, + "grad_norm": 0.5717573571560005, + "learning_rate": 4.422148201901969e-06, + "loss": 0.272, + "step": 4421 + }, + { + "epoch": 2.112777014515262, + "grad_norm": 0.48469162216982337, + "learning_rate": 4.4177776833293915e-06, + "loss": 0.2789, + "step": 4422 + }, + { + "epoch": 2.113254883220835, + "grad_norm": 0.4591193788359184, + "learning_rate": 4.4134087131668135e-06, + "loss": 0.2855, + "step": 4423 + }, + { + "epoch": 2.1137327519264084, + "grad_norm": 0.451459503417534, + "learning_rate": 4.409041292626115e-06, + "loss": 0.2718, + "step": 4424 + }, + { + "epoch": 2.1142106206319813, + "grad_norm": 0.5180358351813875, + "learning_rate": 4.40467542291874e-06, + "loss": 0.2669, + "step": 4425 + }, + { + "epoch": 2.1146884893375546, + "grad_norm": 0.48205075176726636, + "learning_rate": 4.400311105255698e-06, + "loss": 0.2659, + "step": 4426 + }, + { + "epoch": 2.1151663580431275, + "grad_norm": 0.4544529548772736, + "learning_rate": 4.395948340847584e-06, + "loss": 0.2739, + "step": 4427 + }, + { + "epoch": 2.115644226748701, + "grad_norm": 0.44998143432924953, + "learning_rate": 4.391587130904544e-06, + "loss": 0.2568, + "step": 4428 + }, + { + "epoch": 2.116122095454274, + "grad_norm": 0.4857935888324327, + "learning_rate": 4.387227476636301e-06, + "loss": 0.2564, + "step": 4429 + }, + { + "epoch": 2.116599964159847, + "grad_norm": 0.482388126357244, + "learning_rate": 4.382869379252152e-06, + "loss": 0.269, + "step": 4430 + }, + { + "epoch": 2.1170778328654203, + "grad_norm": 0.46675426655921626, + "learning_rate": 4.378512839960953e-06, + "loss": 0.2701, + "step": 4431 + }, + { + "epoch": 2.1175557015709936, + "grad_norm": 0.4567298710073206, + "learning_rate": 4.374157859971127e-06, + "loss": 0.2582, + "step": 4432 + }, + { + "epoch": 2.1180335702765665, + "grad_norm": 1.0653818376694562, + "learning_rate": 4.369804440490676e-06, + "loss": 0.2658, + "step": 4433 + }, + { + "epoch": 2.11851143898214, + "grad_norm": 0.4691784307308584, + "learning_rate": 4.3654525827271576e-06, + "loss": 0.2797, + "step": 4434 + }, + { + "epoch": 2.1189893076877127, + "grad_norm": 1.179538881720667, + "learning_rate": 4.361102287887698e-06, + "loss": 0.2675, + "step": 4435 + }, + { + "epoch": 2.119467176393286, + "grad_norm": 0.470987292017048, + "learning_rate": 4.356753557178999e-06, + "loss": 0.2782, + "step": 4436 + }, + { + "epoch": 2.1199450450988593, + "grad_norm": 0.46682100572447127, + "learning_rate": 4.352406391807318e-06, + "loss": 0.2718, + "step": 4437 + }, + { + "epoch": 2.120422913804432, + "grad_norm": 0.5044197345551845, + "learning_rate": 4.348060792978479e-06, + "loss": 0.2655, + "step": 4438 + }, + { + "epoch": 2.1209007825100055, + "grad_norm": 0.4681899555394538, + "learning_rate": 4.34371676189788e-06, + "loss": 0.2658, + "step": 4439 + }, + { + "epoch": 2.1213786512155783, + "grad_norm": 0.4703827383388374, + "learning_rate": 4.339374299770477e-06, + "loss": 0.2728, + "step": 4440 + }, + { + "epoch": 2.1218565199211517, + "grad_norm": 0.48211199249212094, + "learning_rate": 4.335033407800787e-06, + "loss": 0.2863, + "step": 4441 + }, + { + "epoch": 2.122334388626725, + "grad_norm": 0.48843268326528816, + "learning_rate": 4.330694087192906e-06, + "loss": 0.2813, + "step": 4442 + }, + { + "epoch": 2.122812257332298, + "grad_norm": 0.5164345291433518, + "learning_rate": 4.32635633915048e-06, + "loss": 0.2712, + "step": 4443 + }, + { + "epoch": 2.123290126037871, + "grad_norm": 0.46371527898883846, + "learning_rate": 4.322020164876722e-06, + "loss": 0.2639, + "step": 4444 + }, + { + "epoch": 2.123767994743444, + "grad_norm": 0.48627299210477126, + "learning_rate": 4.31768556557441e-06, + "loss": 0.2661, + "step": 4445 + }, + { + "epoch": 2.1242458634490173, + "grad_norm": 0.5076108512131333, + "learning_rate": 4.313352542445892e-06, + "loss": 0.2763, + "step": 4446 + }, + { + "epoch": 2.1247237321545906, + "grad_norm": 0.4982314792290385, + "learning_rate": 4.309021096693069e-06, + "loss": 0.2623, + "step": 4447 + }, + { + "epoch": 2.1252016008601635, + "grad_norm": 0.5098348084219093, + "learning_rate": 4.3046912295174015e-06, + "loss": 0.2598, + "step": 4448 + }, + { + "epoch": 2.125679469565737, + "grad_norm": 0.4748754501419402, + "learning_rate": 4.300362942119929e-06, + "loss": 0.2707, + "step": 4449 + }, + { + "epoch": 2.12615733827131, + "grad_norm": 0.47655181277542424, + "learning_rate": 4.296036235701235e-06, + "loss": 0.2738, + "step": 4450 + }, + { + "epoch": 2.126635206976883, + "grad_norm": 0.7574458511584494, + "learning_rate": 4.29171111146147e-06, + "loss": 0.2991, + "step": 4451 + }, + { + "epoch": 2.1271130756824563, + "grad_norm": 0.47222019908644236, + "learning_rate": 4.2873875706003535e-06, + "loss": 0.2535, + "step": 4452 + }, + { + "epoch": 2.127590944388029, + "grad_norm": 0.45799953680867805, + "learning_rate": 4.283065614317156e-06, + "loss": 0.2612, + "step": 4453 + }, + { + "epoch": 2.1280688130936025, + "grad_norm": 0.522281047897022, + "learning_rate": 4.278745243810709e-06, + "loss": 0.2844, + "step": 4454 + }, + { + "epoch": 2.128546681799176, + "grad_norm": 0.4660306281172418, + "learning_rate": 4.274426460279412e-06, + "loss": 0.2466, + "step": 4455 + }, + { + "epoch": 2.1290245505047487, + "grad_norm": 0.4785082179915365, + "learning_rate": 4.270109264921221e-06, + "loss": 0.2797, + "step": 4456 + }, + { + "epoch": 2.129502419210322, + "grad_norm": 0.48055449069442646, + "learning_rate": 4.26579365893364e-06, + "loss": 0.2984, + "step": 4457 + }, + { + "epoch": 2.1299802879158953, + "grad_norm": 0.5098722373675344, + "learning_rate": 4.261479643513753e-06, + "loss": 0.26, + "step": 4458 + }, + { + "epoch": 2.130458156621468, + "grad_norm": 0.49399059503460835, + "learning_rate": 4.257167219858187e-06, + "loss": 0.2683, + "step": 4459 + }, + { + "epoch": 2.1309360253270415, + "grad_norm": 0.5080365898455398, + "learning_rate": 4.252856389163128e-06, + "loss": 0.2754, + "step": 4460 + }, + { + "epoch": 2.1314138940326144, + "grad_norm": 0.5554438469805086, + "learning_rate": 4.248547152624334e-06, + "loss": 0.2601, + "step": 4461 + }, + { + "epoch": 2.1318917627381877, + "grad_norm": 0.48601767105637644, + "learning_rate": 4.244239511437105e-06, + "loss": 0.2734, + "step": 4462 + }, + { + "epoch": 2.132369631443761, + "grad_norm": 0.46636918648450326, + "learning_rate": 4.239933466796301e-06, + "loss": 0.2573, + "step": 4463 + }, + { + "epoch": 2.132847500149334, + "grad_norm": 0.4882887183192489, + "learning_rate": 4.235629019896352e-06, + "loss": 0.2736, + "step": 4464 + }, + { + "epoch": 2.133325368854907, + "grad_norm": 0.4776658176673478, + "learning_rate": 4.231326171931231e-06, + "loss": 0.2692, + "step": 4465 + }, + { + "epoch": 2.13380323756048, + "grad_norm": 0.4866846981727982, + "learning_rate": 4.227024924094469e-06, + "loss": 0.2577, + "step": 4466 + }, + { + "epoch": 2.1342811062660534, + "grad_norm": 0.4980447283271286, + "learning_rate": 4.222725277579164e-06, + "loss": 0.2876, + "step": 4467 + }, + { + "epoch": 2.1347589749716267, + "grad_norm": 0.4623292009049184, + "learning_rate": 4.218427233577956e-06, + "loss": 0.2736, + "step": 4468 + }, + { + "epoch": 2.1352368436771996, + "grad_norm": 0.48084141681866016, + "learning_rate": 4.214130793283046e-06, + "loss": 0.2801, + "step": 4469 + }, + { + "epoch": 2.135714712382773, + "grad_norm": 0.4649177248019152, + "learning_rate": 4.209835957886196e-06, + "loss": 0.2729, + "step": 4470 + }, + { + "epoch": 2.1361925810883458, + "grad_norm": 0.47322585157208374, + "learning_rate": 4.205542728578714e-06, + "loss": 0.2757, + "step": 4471 + }, + { + "epoch": 2.136670449793919, + "grad_norm": 0.4583877785505501, + "learning_rate": 4.2012511065514636e-06, + "loss": 0.2689, + "step": 4472 + }, + { + "epoch": 2.1371483184994924, + "grad_norm": 0.4870399136505855, + "learning_rate": 4.196961092994871e-06, + "loss": 0.2805, + "step": 4473 + }, + { + "epoch": 2.1376261872050653, + "grad_norm": 0.4685135597479858, + "learning_rate": 4.192672689098908e-06, + "loss": 0.2648, + "step": 4474 + }, + { + "epoch": 2.1381040559106386, + "grad_norm": 0.47535234788643843, + "learning_rate": 4.188385896053098e-06, + "loss": 0.2791, + "step": 4475 + }, + { + "epoch": 2.138581924616212, + "grad_norm": 0.5302293391229079, + "learning_rate": 4.184100715046529e-06, + "loss": 0.2731, + "step": 4476 + }, + { + "epoch": 2.1390597933217848, + "grad_norm": 0.5240248943881992, + "learning_rate": 4.179817147267829e-06, + "loss": 0.2638, + "step": 4477 + }, + { + "epoch": 2.139537662027358, + "grad_norm": 0.4619458564887166, + "learning_rate": 4.1755351939051845e-06, + "loss": 0.2643, + "step": 4478 + }, + { + "epoch": 2.140015530732931, + "grad_norm": 0.4702047131010919, + "learning_rate": 4.171254856146335e-06, + "loss": 0.2615, + "step": 4479 + }, + { + "epoch": 2.1404933994385043, + "grad_norm": 0.4773661172330248, + "learning_rate": 4.166976135178575e-06, + "loss": 0.2865, + "step": 4480 + }, + { + "epoch": 2.1409712681440776, + "grad_norm": 0.47727793467693286, + "learning_rate": 4.1626990321887425e-06, + "loss": 0.2621, + "step": 4481 + }, + { + "epoch": 2.1414491368496504, + "grad_norm": 0.4871383170567948, + "learning_rate": 4.1584235483632265e-06, + "loss": 0.2651, + "step": 4482 + }, + { + "epoch": 2.1419270055552238, + "grad_norm": 0.4714614885059131, + "learning_rate": 4.154149684887977e-06, + "loss": 0.2794, + "step": 4483 + }, + { + "epoch": 2.142404874260797, + "grad_norm": 0.4969507858614288, + "learning_rate": 4.149877442948486e-06, + "loss": 0.2622, + "step": 4484 + }, + { + "epoch": 2.14288274296637, + "grad_norm": 0.507286683219254, + "learning_rate": 4.1456068237297964e-06, + "loss": 0.2664, + "step": 4485 + }, + { + "epoch": 2.1433606116719432, + "grad_norm": 0.4810756637158575, + "learning_rate": 4.1413378284165065e-06, + "loss": 0.2584, + "step": 4486 + }, + { + "epoch": 2.143838480377516, + "grad_norm": 0.49276173044738886, + "learning_rate": 4.1370704581927575e-06, + "loss": 0.2645, + "step": 4487 + }, + { + "epoch": 2.1443163490830894, + "grad_norm": 0.5062183592646868, + "learning_rate": 4.13280471424224e-06, + "loss": 0.2681, + "step": 4488 + }, + { + "epoch": 2.1447942177886627, + "grad_norm": 0.49099788891531865, + "learning_rate": 4.128540597748203e-06, + "loss": 0.2761, + "step": 4489 + }, + { + "epoch": 2.1452720864942356, + "grad_norm": 0.518176466847488, + "learning_rate": 4.124278109893432e-06, + "loss": 0.2663, + "step": 4490 + }, + { + "epoch": 2.145749955199809, + "grad_norm": 0.49131335284040245, + "learning_rate": 4.120017251860266e-06, + "loss": 0.2871, + "step": 4491 + }, + { + "epoch": 2.146227823905382, + "grad_norm": 0.5072033097189093, + "learning_rate": 4.115758024830595e-06, + "loss": 0.2814, + "step": 4492 + }, + { + "epoch": 2.146705692610955, + "grad_norm": 0.4876564849105318, + "learning_rate": 4.111500429985853e-06, + "loss": 0.2759, + "step": 4493 + }, + { + "epoch": 2.1471835613165284, + "grad_norm": 0.4421916536653529, + "learning_rate": 4.1072444685070155e-06, + "loss": 0.2618, + "step": 4494 + }, + { + "epoch": 2.1476614300221013, + "grad_norm": 0.46331905163767734, + "learning_rate": 4.10299014157462e-06, + "loss": 0.2763, + "step": 4495 + }, + { + "epoch": 2.1481392987276746, + "grad_norm": 0.5158897747210883, + "learning_rate": 4.098737450368738e-06, + "loss": 0.2773, + "step": 4496 + }, + { + "epoch": 2.1486171674332475, + "grad_norm": 0.45891199284426043, + "learning_rate": 4.094486396068987e-06, + "loss": 0.2774, + "step": 4497 + }, + { + "epoch": 2.149095036138821, + "grad_norm": 0.47523435146742227, + "learning_rate": 4.0902369798545426e-06, + "loss": 0.2551, + "step": 4498 + }, + { + "epoch": 2.149572904844394, + "grad_norm": 0.44933432383147964, + "learning_rate": 4.085989202904113e-06, + "loss": 0.2606, + "step": 4499 + }, + { + "epoch": 2.150050773549967, + "grad_norm": 0.47676276634177017, + "learning_rate": 4.0817430663959536e-06, + "loss": 0.2785, + "step": 4500 + }, + { + "epoch": 2.1505286422555403, + "grad_norm": 0.45939776538275545, + "learning_rate": 4.077498571507874e-06, + "loss": 0.2714, + "step": 4501 + }, + { + "epoch": 2.1510065109611136, + "grad_norm": 0.4702734224108815, + "learning_rate": 4.073255719417221e-06, + "loss": 0.2615, + "step": 4502 + }, + { + "epoch": 2.1514843796666865, + "grad_norm": 0.4910412013612873, + "learning_rate": 4.0690145113008815e-06, + "loss": 0.2566, + "step": 4503 + }, + { + "epoch": 2.15196224837226, + "grad_norm": 0.48456487516342267, + "learning_rate": 4.064774948335299e-06, + "loss": 0.2683, + "step": 4504 + }, + { + "epoch": 2.1524401170778327, + "grad_norm": 0.4873630350755374, + "learning_rate": 4.06053703169645e-06, + "loss": 0.2628, + "step": 4505 + }, + { + "epoch": 2.152917985783406, + "grad_norm": 0.512860605368886, + "learning_rate": 4.056300762559855e-06, + "loss": 0.267, + "step": 4506 + }, + { + "epoch": 2.1533958544889793, + "grad_norm": 0.5806471029367954, + "learning_rate": 4.052066142100587e-06, + "loss": 0.2632, + "step": 4507 + }, + { + "epoch": 2.153873723194552, + "grad_norm": 0.4748345500291511, + "learning_rate": 4.047833171493251e-06, + "loss": 0.2567, + "step": 4508 + }, + { + "epoch": 2.1543515919001255, + "grad_norm": 0.4789611805358055, + "learning_rate": 4.043601851911996e-06, + "loss": 0.2696, + "step": 4509 + }, + { + "epoch": 2.154829460605699, + "grad_norm": 0.49413914811960363, + "learning_rate": 4.039372184530521e-06, + "loss": 0.2575, + "step": 4510 + }, + { + "epoch": 2.1553073293112717, + "grad_norm": 0.5070625267068553, + "learning_rate": 4.035144170522055e-06, + "loss": 0.2765, + "step": 4511 + }, + { + "epoch": 2.155785198016845, + "grad_norm": 0.45705266978001025, + "learning_rate": 4.030917811059378e-06, + "loss": 0.2705, + "step": 4512 + }, + { + "epoch": 2.156263066722418, + "grad_norm": 1.5569910667354723, + "learning_rate": 4.02669310731481e-06, + "loss": 0.2803, + "step": 4513 + }, + { + "epoch": 2.156740935427991, + "grad_norm": 0.49066820686495577, + "learning_rate": 4.0224700604602085e-06, + "loss": 0.2786, + "step": 4514 + }, + { + "epoch": 2.1572188041335645, + "grad_norm": 0.47471065389215006, + "learning_rate": 4.0182486716669656e-06, + "loss": 0.2566, + "step": 4515 + }, + { + "epoch": 2.1576966728391374, + "grad_norm": 0.46201303993122544, + "learning_rate": 4.014028942106028e-06, + "loss": 0.2644, + "step": 4516 + }, + { + "epoch": 2.1581745415447107, + "grad_norm": 0.4810938505072536, + "learning_rate": 4.009810872947873e-06, + "loss": 0.2723, + "step": 4517 + }, + { + "epoch": 2.1586524102502835, + "grad_norm": 0.45055790887900427, + "learning_rate": 4.005594465362512e-06, + "loss": 0.2731, + "step": 4518 + }, + { + "epoch": 2.159130278955857, + "grad_norm": 0.5023239987249768, + "learning_rate": 4.00137972051951e-06, + "loss": 0.2721, + "step": 4519 + }, + { + "epoch": 2.15960814766143, + "grad_norm": 0.507580098147782, + "learning_rate": 3.9971666395879605e-06, + "loss": 0.2635, + "step": 4520 + }, + { + "epoch": 2.160086016367003, + "grad_norm": 0.47302338046197384, + "learning_rate": 3.992955223736493e-06, + "loss": 0.2621, + "step": 4521 + }, + { + "epoch": 2.1605638850725764, + "grad_norm": 0.4426581792894162, + "learning_rate": 3.9887454741332874e-06, + "loss": 0.2738, + "step": 4522 + }, + { + "epoch": 2.161041753778149, + "grad_norm": 0.5038924092723644, + "learning_rate": 3.984537391946051e-06, + "loss": 0.2859, + "step": 4523 + }, + { + "epoch": 2.1615196224837225, + "grad_norm": 0.5097699500822241, + "learning_rate": 3.980330978342027e-06, + "loss": 0.2836, + "step": 4524 + }, + { + "epoch": 2.161997491189296, + "grad_norm": 0.4997990150539291, + "learning_rate": 3.9761262344880096e-06, + "loss": 0.2862, + "step": 4525 + }, + { + "epoch": 2.1624753598948687, + "grad_norm": 0.5068009705170106, + "learning_rate": 3.971923161550314e-06, + "loss": 0.2617, + "step": 4526 + }, + { + "epoch": 2.162953228600442, + "grad_norm": 0.5121923973953294, + "learning_rate": 3.967721760694796e-06, + "loss": 0.2677, + "step": 4527 + }, + { + "epoch": 2.1634310973060153, + "grad_norm": 0.4586590918299566, + "learning_rate": 3.963522033086858e-06, + "loss": 0.2812, + "step": 4528 + }, + { + "epoch": 2.163908966011588, + "grad_norm": 0.48386944735308773, + "learning_rate": 3.959323979891427e-06, + "loss": 0.2657, + "step": 4529 + }, + { + "epoch": 2.1643868347171615, + "grad_norm": 0.48660859734621076, + "learning_rate": 3.9551276022729644e-06, + "loss": 0.292, + "step": 4530 + }, + { + "epoch": 2.1648647034227344, + "grad_norm": 0.571916496126486, + "learning_rate": 3.9509329013954775e-06, + "loss": 0.2669, + "step": 4531 + }, + { + "epoch": 2.1653425721283077, + "grad_norm": 0.4731522622117423, + "learning_rate": 3.946739878422502e-06, + "loss": 0.2674, + "step": 4532 + }, + { + "epoch": 2.165820440833881, + "grad_norm": 0.4585592677843948, + "learning_rate": 3.942548534517102e-06, + "loss": 0.2754, + "step": 4533 + }, + { + "epoch": 2.166298309539454, + "grad_norm": 0.45992994288587097, + "learning_rate": 3.938358870841891e-06, + "loss": 0.2842, + "step": 4534 + }, + { + "epoch": 2.166776178245027, + "grad_norm": 0.48157396171772915, + "learning_rate": 3.9341708885590034e-06, + "loss": 0.2721, + "step": 4535 + }, + { + "epoch": 2.1672540469506005, + "grad_norm": 0.46636131218315996, + "learning_rate": 3.9299845888301084e-06, + "loss": 0.2545, + "step": 4536 + }, + { + "epoch": 2.1677319156561734, + "grad_norm": 0.48593748291916344, + "learning_rate": 3.925799972816419e-06, + "loss": 0.2478, + "step": 4537 + }, + { + "epoch": 2.1682097843617467, + "grad_norm": 0.5044885208397369, + "learning_rate": 3.921617041678669e-06, + "loss": 0.2679, + "step": 4538 + }, + { + "epoch": 2.1686876530673196, + "grad_norm": 0.4667942698450788, + "learning_rate": 3.917435796577128e-06, + "loss": 0.2522, + "step": 4539 + }, + { + "epoch": 2.169165521772893, + "grad_norm": 0.4949207913026006, + "learning_rate": 3.913256238671607e-06, + "loss": 0.2694, + "step": 4540 + }, + { + "epoch": 2.169643390478466, + "grad_norm": 0.4922718382704214, + "learning_rate": 3.909078369121435e-06, + "loss": 0.272, + "step": 4541 + }, + { + "epoch": 2.170121259184039, + "grad_norm": 0.5568367688738352, + "learning_rate": 3.904902189085479e-06, + "loss": 0.2744, + "step": 4542 + }, + { + "epoch": 2.1705991278896124, + "grad_norm": 0.4687165591772829, + "learning_rate": 3.900727699722144e-06, + "loss": 0.2758, + "step": 4543 + }, + { + "epoch": 2.1710769965951853, + "grad_norm": 0.45872513093731365, + "learning_rate": 3.896554902189355e-06, + "loss": 0.2723, + "step": 4544 + }, + { + "epoch": 2.1715548653007586, + "grad_norm": 0.4637298698793799, + "learning_rate": 3.89238379764457e-06, + "loss": 0.2448, + "step": 4545 + }, + { + "epoch": 2.172032734006332, + "grad_norm": 0.48363038021481214, + "learning_rate": 3.888214387244783e-06, + "loss": 0.2657, + "step": 4546 + }, + { + "epoch": 2.1725106027119048, + "grad_norm": 0.4550871624285722, + "learning_rate": 3.884046672146518e-06, + "loss": 0.2854, + "step": 4547 + }, + { + "epoch": 2.172988471417478, + "grad_norm": 0.495376118420438, + "learning_rate": 3.879880653505824e-06, + "loss": 0.2633, + "step": 4548 + }, + { + "epoch": 2.1734663401230514, + "grad_norm": 0.46098656670137583, + "learning_rate": 3.875716332478275e-06, + "loss": 0.2657, + "step": 4549 + }, + { + "epoch": 2.1739442088286243, + "grad_norm": 0.4751772768586887, + "learning_rate": 3.871553710218988e-06, + "loss": 0.2837, + "step": 4550 + }, + { + "epoch": 2.1744220775341976, + "grad_norm": 0.47152321412549136, + "learning_rate": 3.867392787882599e-06, + "loss": 0.2638, + "step": 4551 + }, + { + "epoch": 2.1748999462397705, + "grad_norm": 0.44704792893751294, + "learning_rate": 3.8632335666232686e-06, + "loss": 0.2602, + "step": 4552 + }, + { + "epoch": 2.1753778149453438, + "grad_norm": 0.6555651445287016, + "learning_rate": 3.859076047594701e-06, + "loss": 0.2518, + "step": 4553 + }, + { + "epoch": 2.175855683650917, + "grad_norm": 0.4869920295121113, + "learning_rate": 3.854920231950113e-06, + "loss": 0.2821, + "step": 4554 + }, + { + "epoch": 2.17633355235649, + "grad_norm": 0.45671533130726394, + "learning_rate": 3.850766120842252e-06, + "loss": 0.2801, + "step": 4555 + }, + { + "epoch": 2.1768114210620633, + "grad_norm": 0.4608951885566162, + "learning_rate": 3.846613715423402e-06, + "loss": 0.2658, + "step": 4556 + }, + { + "epoch": 2.177289289767636, + "grad_norm": 0.46418435619628584, + "learning_rate": 3.842463016845362e-06, + "loss": 0.2735, + "step": 4557 + }, + { + "epoch": 2.1777671584732095, + "grad_norm": 0.47932679730203265, + "learning_rate": 3.838314026259462e-06, + "loss": 0.25, + "step": 4558 + }, + { + "epoch": 2.1782450271787828, + "grad_norm": 0.47237247496031814, + "learning_rate": 3.8341667448165645e-06, + "loss": 0.262, + "step": 4559 + }, + { + "epoch": 2.1787228958843556, + "grad_norm": 0.5186757703706331, + "learning_rate": 3.830021173667048e-06, + "loss": 0.2884, + "step": 4560 + }, + { + "epoch": 2.179200764589929, + "grad_norm": 0.44583744351159743, + "learning_rate": 3.8258773139608185e-06, + "loss": 0.2725, + "step": 4561 + }, + { + "epoch": 2.1796786332955023, + "grad_norm": 0.4772180156318942, + "learning_rate": 3.821735166847316e-06, + "loss": 0.2641, + "step": 4562 + }, + { + "epoch": 2.180156502001075, + "grad_norm": 0.5094816309115656, + "learning_rate": 3.817594733475494e-06, + "loss": 0.2657, + "step": 4563 + }, + { + "epoch": 2.1806343707066485, + "grad_norm": 0.4456377533344565, + "learning_rate": 3.813456014993835e-06, + "loss": 0.2637, + "step": 4564 + }, + { + "epoch": 2.1811122394122213, + "grad_norm": 0.47505222662407803, + "learning_rate": 3.809319012550352e-06, + "loss": 0.2713, + "step": 4565 + }, + { + "epoch": 2.1815901081177946, + "grad_norm": 0.5173681589868041, + "learning_rate": 3.8051837272925728e-06, + "loss": 0.2813, + "step": 4566 + }, + { + "epoch": 2.182067976823368, + "grad_norm": 0.5015740347460304, + "learning_rate": 3.80105016036755e-06, + "loss": 0.2708, + "step": 4567 + }, + { + "epoch": 2.182545845528941, + "grad_norm": 0.45211083979868544, + "learning_rate": 3.796918312921868e-06, + "loss": 0.2774, + "step": 4568 + }, + { + "epoch": 2.183023714234514, + "grad_norm": 0.4580673412053882, + "learning_rate": 3.792788186101626e-06, + "loss": 0.2786, + "step": 4569 + }, + { + "epoch": 2.183501582940087, + "grad_norm": 0.483142068598242, + "learning_rate": 3.788659781052444e-06, + "loss": 0.2662, + "step": 4570 + }, + { + "epoch": 2.1839794516456603, + "grad_norm": 0.4585071424628377, + "learning_rate": 3.7845330989194762e-06, + "loss": 0.2537, + "step": 4571 + }, + { + "epoch": 2.1844573203512336, + "grad_norm": 0.46364154074230174, + "learning_rate": 3.780408140847387e-06, + "loss": 0.2703, + "step": 4572 + }, + { + "epoch": 2.1849351890568065, + "grad_norm": 0.47912865130428883, + "learning_rate": 3.7762849079803654e-06, + "loss": 0.2599, + "step": 4573 + }, + { + "epoch": 2.18541305776238, + "grad_norm": 0.486225553510062, + "learning_rate": 3.772163401462129e-06, + "loss": 0.2707, + "step": 4574 + }, + { + "epoch": 2.185890926467953, + "grad_norm": 0.5017350180630475, + "learning_rate": 3.7680436224359084e-06, + "loss": 0.278, + "step": 4575 + }, + { + "epoch": 2.186368795173526, + "grad_norm": 0.44633892621886945, + "learning_rate": 3.7639255720444532e-06, + "loss": 0.2564, + "step": 4576 + }, + { + "epoch": 2.1868466638790993, + "grad_norm": 0.47140634322880026, + "learning_rate": 3.7598092514300456e-06, + "loss": 0.2706, + "step": 4577 + }, + { + "epoch": 2.187324532584672, + "grad_norm": 0.47775263124381995, + "learning_rate": 3.7556946617344757e-06, + "loss": 0.2768, + "step": 4578 + }, + { + "epoch": 2.1878024012902455, + "grad_norm": 0.4662918039512788, + "learning_rate": 3.751581804099056e-06, + "loss": 0.2702, + "step": 4579 + }, + { + "epoch": 2.188280269995819, + "grad_norm": 0.4526279489034841, + "learning_rate": 3.747470679664624e-06, + "loss": 0.27, + "step": 4580 + }, + { + "epoch": 2.1887581387013917, + "grad_norm": 0.461833024668242, + "learning_rate": 3.7433612895715356e-06, + "loss": 0.2525, + "step": 4581 + }, + { + "epoch": 2.189236007406965, + "grad_norm": 0.45967111066950117, + "learning_rate": 3.739253634959661e-06, + "loss": 0.2755, + "step": 4582 + }, + { + "epoch": 2.1897138761125383, + "grad_norm": 0.4888680597465754, + "learning_rate": 3.735147716968386e-06, + "loss": 0.2578, + "step": 4583 + }, + { + "epoch": 2.190191744818111, + "grad_norm": 0.4986060150836118, + "learning_rate": 3.731043536736628e-06, + "loss": 0.2694, + "step": 4584 + }, + { + "epoch": 2.1906696135236845, + "grad_norm": 0.4543474936433639, + "learning_rate": 3.7269410954028107e-06, + "loss": 0.2592, + "step": 4585 + }, + { + "epoch": 2.1911474822292574, + "grad_norm": 1.2235953943030329, + "learning_rate": 3.7228403941048753e-06, + "loss": 0.2595, + "step": 4586 + }, + { + "epoch": 2.1916253509348307, + "grad_norm": 0.48594813503110623, + "learning_rate": 3.7187414339802906e-06, + "loss": 0.2774, + "step": 4587 + }, + { + "epoch": 2.192103219640404, + "grad_norm": 0.4911464829074915, + "learning_rate": 3.7146442161660336e-06, + "loss": 0.2636, + "step": 4588 + }, + { + "epoch": 2.192581088345977, + "grad_norm": 0.47380456620012534, + "learning_rate": 3.710548741798594e-06, + "loss": 0.265, + "step": 4589 + }, + { + "epoch": 2.19305895705155, + "grad_norm": 0.4706478201898707, + "learning_rate": 3.706455012013994e-06, + "loss": 0.2758, + "step": 4590 + }, + { + "epoch": 2.193536825757123, + "grad_norm": 0.4635842650188313, + "learning_rate": 3.702363027947757e-06, + "loss": 0.2703, + "step": 4591 + }, + { + "epoch": 2.1940146944626964, + "grad_norm": 0.4655317804125815, + "learning_rate": 3.6982727907349247e-06, + "loss": 0.2718, + "step": 4592 + }, + { + "epoch": 2.1944925631682697, + "grad_norm": 0.4495126115308564, + "learning_rate": 3.694184301510063e-06, + "loss": 0.2737, + "step": 4593 + }, + { + "epoch": 2.1949704318738426, + "grad_norm": 0.4535524871952706, + "learning_rate": 3.6900975614072433e-06, + "loss": 0.2621, + "step": 4594 + }, + { + "epoch": 2.195448300579416, + "grad_norm": 0.44992841504040854, + "learning_rate": 3.6860125715600513e-06, + "loss": 0.2805, + "step": 4595 + }, + { + "epoch": 2.1959261692849887, + "grad_norm": 0.45869083287966966, + "learning_rate": 3.6819293331015993e-06, + "loss": 0.2865, + "step": 4596 + }, + { + "epoch": 2.196404037990562, + "grad_norm": 0.46107098925354134, + "learning_rate": 3.6778478471645008e-06, + "loss": 0.2624, + "step": 4597 + }, + { + "epoch": 2.1968819066961354, + "grad_norm": 0.44462298567559727, + "learning_rate": 3.6737681148808855e-06, + "loss": 0.2602, + "step": 4598 + }, + { + "epoch": 2.1973597754017082, + "grad_norm": 0.4596846501814117, + "learning_rate": 3.6696901373824056e-06, + "loss": 0.2721, + "step": 4599 + }, + { + "epoch": 2.1978376441072816, + "grad_norm": 0.6638891345781536, + "learning_rate": 3.665613915800217e-06, + "loss": 0.261, + "step": 4600 + }, + { + "epoch": 2.198315512812855, + "grad_norm": 0.49610959186848946, + "learning_rate": 3.6615394512649884e-06, + "loss": 0.2774, + "step": 4601 + }, + { + "epoch": 2.1987933815184277, + "grad_norm": 0.5056345462489438, + "learning_rate": 3.65746674490691e-06, + "loss": 0.2835, + "step": 4602 + }, + { + "epoch": 2.199271250224001, + "grad_norm": 0.44612162373312525, + "learning_rate": 3.6533957978556777e-06, + "loss": 0.2798, + "step": 4603 + }, + { + "epoch": 2.199749118929574, + "grad_norm": 0.4722815796388612, + "learning_rate": 3.6493266112404947e-06, + "loss": 0.2571, + "step": 4604 + }, + { + "epoch": 2.2002269876351472, + "grad_norm": 0.4719966235892821, + "learning_rate": 3.6452591861900886e-06, + "loss": 0.2504, + "step": 4605 + }, + { + "epoch": 2.2007048563407205, + "grad_norm": 0.4738688790143464, + "learning_rate": 3.641193523832689e-06, + "loss": 0.2642, + "step": 4606 + }, + { + "epoch": 2.2011827250462934, + "grad_norm": 0.5033486532463676, + "learning_rate": 3.637129625296035e-06, + "loss": 0.2634, + "step": 4607 + }, + { + "epoch": 2.2016605937518667, + "grad_norm": 0.4960105505883145, + "learning_rate": 3.633067491707387e-06, + "loss": 0.2978, + "step": 4608 + }, + { + "epoch": 2.20213846245744, + "grad_norm": 0.47252895928191396, + "learning_rate": 3.6290071241935067e-06, + "loss": 0.2712, + "step": 4609 + }, + { + "epoch": 2.202616331163013, + "grad_norm": 0.46564634687060924, + "learning_rate": 3.6249485238806637e-06, + "loss": 0.2749, + "step": 4610 + }, + { + "epoch": 2.2030941998685862, + "grad_norm": 0.47341686338285643, + "learning_rate": 3.62089169189465e-06, + "loss": 0.2481, + "step": 4611 + }, + { + "epoch": 2.203572068574159, + "grad_norm": 0.4813130606312593, + "learning_rate": 3.6168366293607526e-06, + "loss": 0.2686, + "step": 4612 + }, + { + "epoch": 2.2040499372797324, + "grad_norm": 0.4572672198704176, + "learning_rate": 3.612783337403776e-06, + "loss": 0.2702, + "step": 4613 + }, + { + "epoch": 2.2045278059853057, + "grad_norm": 0.4506090325494477, + "learning_rate": 3.6087318171480368e-06, + "loss": 0.2668, + "step": 4614 + }, + { + "epoch": 2.2050056746908786, + "grad_norm": 0.47410093157277494, + "learning_rate": 3.6046820697173514e-06, + "loss": 0.2699, + "step": 4615 + }, + { + "epoch": 2.205483543396452, + "grad_norm": 0.46061091915894664, + "learning_rate": 3.600634096235046e-06, + "loss": 0.2567, + "step": 4616 + }, + { + "epoch": 2.205961412102025, + "grad_norm": 0.4868564505945191, + "learning_rate": 3.596587897823962e-06, + "loss": 0.2779, + "step": 4617 + }, + { + "epoch": 2.206439280807598, + "grad_norm": 0.4832363834974607, + "learning_rate": 3.59254347560644e-06, + "loss": 0.2638, + "step": 4618 + }, + { + "epoch": 2.2069171495131714, + "grad_norm": 0.5648718684599673, + "learning_rate": 3.58850083070433e-06, + "loss": 0.274, + "step": 4619 + }, + { + "epoch": 2.2073950182187443, + "grad_norm": 0.4940852949638152, + "learning_rate": 3.5844599642389965e-06, + "loss": 0.2675, + "step": 4620 + }, + { + "epoch": 2.2078728869243176, + "grad_norm": 0.4787783856445962, + "learning_rate": 3.5804208773313e-06, + "loss": 0.2678, + "step": 4621 + }, + { + "epoch": 2.2083507556298905, + "grad_norm": 0.4985708934791662, + "learning_rate": 3.576383571101609e-06, + "loss": 0.2607, + "step": 4622 + }, + { + "epoch": 2.208828624335464, + "grad_norm": 0.4467068890356679, + "learning_rate": 3.572348046669809e-06, + "loss": 0.2782, + "step": 4623 + }, + { + "epoch": 2.209306493041037, + "grad_norm": 0.624896596323991, + "learning_rate": 3.5683143051552784e-06, + "loss": 0.2758, + "step": 4624 + }, + { + "epoch": 2.20978436174661, + "grad_norm": 0.49994003409060306, + "learning_rate": 3.564282347676903e-06, + "loss": 0.2788, + "step": 4625 + }, + { + "epoch": 2.2102622304521833, + "grad_norm": 0.47742176535286496, + "learning_rate": 3.560252175353084e-06, + "loss": 0.266, + "step": 4626 + }, + { + "epoch": 2.2107400991577566, + "grad_norm": 0.47656971421522326, + "learning_rate": 3.556223789301716e-06, + "loss": 0.2681, + "step": 4627 + }, + { + "epoch": 2.2112179678633295, + "grad_norm": 0.569244955801473, + "learning_rate": 3.552197190640203e-06, + "loss": 0.2668, + "step": 4628 + }, + { + "epoch": 2.211695836568903, + "grad_norm": 0.4642389811727502, + "learning_rate": 3.5481723804854485e-06, + "loss": 0.2682, + "step": 4629 + }, + { + "epoch": 2.2121737052744757, + "grad_norm": 0.5017601977034839, + "learning_rate": 3.54414935995387e-06, + "loss": 0.257, + "step": 4630 + }, + { + "epoch": 2.212651573980049, + "grad_norm": 0.4986978616999572, + "learning_rate": 3.540128130161381e-06, + "loss": 0.2618, + "step": 4631 + }, + { + "epoch": 2.2131294426856223, + "grad_norm": 0.6552711808635082, + "learning_rate": 3.5361086922233944e-06, + "loss": 0.2771, + "step": 4632 + }, + { + "epoch": 2.213607311391195, + "grad_norm": 0.5079540763737515, + "learning_rate": 3.53209104725484e-06, + "loss": 0.2715, + "step": 4633 + }, + { + "epoch": 2.2140851800967685, + "grad_norm": 0.5478379669178205, + "learning_rate": 3.5280751963701356e-06, + "loss": 0.253, + "step": 4634 + }, + { + "epoch": 2.214563048802342, + "grad_norm": 0.481691741150192, + "learning_rate": 3.524061140683206e-06, + "loss": 0.2822, + "step": 4635 + }, + { + "epoch": 2.2150409175079147, + "grad_norm": 0.46183826966564606, + "learning_rate": 3.520048881307486e-06, + "loss": 0.277, + "step": 4636 + }, + { + "epoch": 2.215518786213488, + "grad_norm": 0.6334146180967442, + "learning_rate": 3.5160384193559017e-06, + "loss": 0.2745, + "step": 4637 + }, + { + "epoch": 2.215996654919061, + "grad_norm": 0.4842285182462256, + "learning_rate": 3.512029755940882e-06, + "loss": 0.2682, + "step": 4638 + }, + { + "epoch": 2.216474523624634, + "grad_norm": 0.47466868378671784, + "learning_rate": 3.5080228921743653e-06, + "loss": 0.2902, + "step": 4639 + }, + { + "epoch": 2.2169523923302075, + "grad_norm": 0.51214418083174, + "learning_rate": 3.5040178291677816e-06, + "loss": 0.2522, + "step": 4640 + }, + { + "epoch": 2.2174302610357803, + "grad_norm": 0.4665166989520757, + "learning_rate": 3.5000145680320617e-06, + "loss": 0.265, + "step": 4641 + }, + { + "epoch": 2.2179081297413537, + "grad_norm": 0.48603526531215135, + "learning_rate": 3.496013109877646e-06, + "loss": 0.2772, + "step": 4642 + }, + { + "epoch": 2.2183859984469265, + "grad_norm": 0.5575762979046887, + "learning_rate": 3.4920134558144645e-06, + "loss": 0.2589, + "step": 4643 + }, + { + "epoch": 2.2188638671525, + "grad_norm": 0.4895518470618773, + "learning_rate": 3.48801560695195e-06, + "loss": 0.2759, + "step": 4644 + }, + { + "epoch": 2.219341735858073, + "grad_norm": 0.6734863253510791, + "learning_rate": 3.4840195643990383e-06, + "loss": 0.2593, + "step": 4645 + }, + { + "epoch": 2.219819604563646, + "grad_norm": 0.48164159307131443, + "learning_rate": 3.4800253292641574e-06, + "loss": 0.2496, + "step": 4646 + }, + { + "epoch": 2.2202974732692193, + "grad_norm": 0.5194894835045092, + "learning_rate": 3.476032902655239e-06, + "loss": 0.2731, + "step": 4647 + }, + { + "epoch": 2.220775341974792, + "grad_norm": 0.5048689424410697, + "learning_rate": 3.4720422856797163e-06, + "loss": 0.272, + "step": 4648 + }, + { + "epoch": 2.2212532106803655, + "grad_norm": 0.4555324657809056, + "learning_rate": 3.468053479444512e-06, + "loss": 0.275, + "step": 4649 + }, + { + "epoch": 2.221731079385939, + "grad_norm": 0.49021751307121847, + "learning_rate": 3.464066485056048e-06, + "loss": 0.2744, + "step": 4650 + }, + { + "epoch": 2.2222089480915117, + "grad_norm": 0.4410281797101959, + "learning_rate": 3.460081303620252e-06, + "loss": 0.2757, + "step": 4651 + }, + { + "epoch": 2.222686816797085, + "grad_norm": 0.5014942315206841, + "learning_rate": 3.4560979362425406e-06, + "loss": 0.2797, + "step": 4652 + }, + { + "epoch": 2.2231646855026583, + "grad_norm": 0.45504543451237, + "learning_rate": 3.452116384027826e-06, + "loss": 0.2573, + "step": 4653 + }, + { + "epoch": 2.223642554208231, + "grad_norm": 0.4803760937380114, + "learning_rate": 3.4481366480805266e-06, + "loss": 0.2483, + "step": 4654 + }, + { + "epoch": 2.2241204229138045, + "grad_norm": 0.457505611780426, + "learning_rate": 3.444158729504549e-06, + "loss": 0.2645, + "step": 4655 + }, + { + "epoch": 2.2245982916193774, + "grad_norm": 0.49598384831058834, + "learning_rate": 3.4401826294032924e-06, + "loss": 0.2738, + "step": 4656 + }, + { + "epoch": 2.2250761603249507, + "grad_norm": 0.47321887795699297, + "learning_rate": 3.436208348879665e-06, + "loss": 0.2852, + "step": 4657 + }, + { + "epoch": 2.225554029030524, + "grad_norm": 0.47386541058260834, + "learning_rate": 3.4322358890360586e-06, + "loss": 0.2809, + "step": 4658 + }, + { + "epoch": 2.226031897736097, + "grad_norm": 0.4403468594295977, + "learning_rate": 3.4282652509743596e-06, + "loss": 0.2586, + "step": 4659 + }, + { + "epoch": 2.22650976644167, + "grad_norm": 0.48721058760605696, + "learning_rate": 3.4242964357959597e-06, + "loss": 0.277, + "step": 4660 + }, + { + "epoch": 2.2269876351472435, + "grad_norm": 0.5068739043457748, + "learning_rate": 3.4203294446017354e-06, + "loss": 0.2519, + "step": 4661 + }, + { + "epoch": 2.2274655038528164, + "grad_norm": 0.4420653696705306, + "learning_rate": 3.416364278492057e-06, + "loss": 0.2831, + "step": 4662 + }, + { + "epoch": 2.2279433725583897, + "grad_norm": 0.486948816480161, + "learning_rate": 3.4124009385667967e-06, + "loss": 0.2734, + "step": 4663 + }, + { + "epoch": 2.2284212412639626, + "grad_norm": 0.49182635386072504, + "learning_rate": 3.408439425925313e-06, + "loss": 0.2615, + "step": 4664 + }, + { + "epoch": 2.228899109969536, + "grad_norm": 0.4713713292607333, + "learning_rate": 3.4044797416664564e-06, + "loss": 0.2789, + "step": 4665 + }, + { + "epoch": 2.229376978675109, + "grad_norm": 0.4553911870292443, + "learning_rate": 3.4005218868885794e-06, + "loss": 0.2865, + "step": 4666 + }, + { + "epoch": 2.229854847380682, + "grad_norm": 0.48295582603522347, + "learning_rate": 3.396565862689518e-06, + "loss": 0.2564, + "step": 4667 + }, + { + "epoch": 2.2303327160862554, + "grad_norm": 0.5092986006165036, + "learning_rate": 3.3926116701666013e-06, + "loss": 0.2805, + "step": 4668 + }, + { + "epoch": 2.2308105847918283, + "grad_norm": 0.45858410229406726, + "learning_rate": 3.3886593104166575e-06, + "loss": 0.2815, + "step": 4669 + }, + { + "epoch": 2.2312884534974016, + "grad_norm": 0.4947175505998691, + "learning_rate": 3.3847087845359996e-06, + "loss": 0.2752, + "step": 4670 + }, + { + "epoch": 2.231766322202975, + "grad_norm": 0.4978122665367534, + "learning_rate": 3.38076009362043e-06, + "loss": 0.2818, + "step": 4671 + }, + { + "epoch": 2.2322441909085478, + "grad_norm": 0.5749495630859802, + "learning_rate": 3.376813238765252e-06, + "loss": 0.2616, + "step": 4672 + }, + { + "epoch": 2.232722059614121, + "grad_norm": 0.46541839278207525, + "learning_rate": 3.3728682210652497e-06, + "loss": 0.2738, + "step": 4673 + }, + { + "epoch": 2.233199928319694, + "grad_norm": 0.4822660477000455, + "learning_rate": 3.3689250416147e-06, + "loss": 0.2846, + "step": 4674 + }, + { + "epoch": 2.2336777970252673, + "grad_norm": 0.4569814026626145, + "learning_rate": 3.364983701507376e-06, + "loss": 0.2602, + "step": 4675 + }, + { + "epoch": 2.2341556657308406, + "grad_norm": 0.4445416419282075, + "learning_rate": 3.361044201836534e-06, + "loss": 0.2695, + "step": 4676 + }, + { + "epoch": 2.2346335344364134, + "grad_norm": 0.45506836119647587, + "learning_rate": 3.357106543694918e-06, + "loss": 0.2778, + "step": 4677 + }, + { + "epoch": 2.2351114031419868, + "grad_norm": 0.4574261555122098, + "learning_rate": 3.3531707281747717e-06, + "loss": 0.2693, + "step": 4678 + }, + { + "epoch": 2.23558927184756, + "grad_norm": 0.5141167452823477, + "learning_rate": 3.3492367563678173e-06, + "loss": 0.2765, + "step": 4679 + }, + { + "epoch": 2.236067140553133, + "grad_norm": 0.6521885973282003, + "learning_rate": 3.3453046293652657e-06, + "loss": 0.2638, + "step": 4680 + }, + { + "epoch": 2.2365450092587063, + "grad_norm": 0.492272086211248, + "learning_rate": 3.3413743482578233e-06, + "loss": 0.2608, + "step": 4681 + }, + { + "epoch": 2.237022877964279, + "grad_norm": 0.4635845305006642, + "learning_rate": 3.337445914135684e-06, + "loss": 0.2617, + "step": 4682 + }, + { + "epoch": 2.2375007466698524, + "grad_norm": 0.5023980255782854, + "learning_rate": 3.3335193280885215e-06, + "loss": 0.2761, + "step": 4683 + }, + { + "epoch": 2.2379786153754258, + "grad_norm": 0.5343058309706397, + "learning_rate": 3.3295945912055006e-06, + "loss": 0.2845, + "step": 4684 + }, + { + "epoch": 2.2384564840809986, + "grad_norm": 0.45734291677162836, + "learning_rate": 3.3256717045752794e-06, + "loss": 0.2629, + "step": 4685 + }, + { + "epoch": 2.238934352786572, + "grad_norm": 0.4585226946873842, + "learning_rate": 3.3217506692859937e-06, + "loss": 0.2684, + "step": 4686 + }, + { + "epoch": 2.2394122214921452, + "grad_norm": 0.48321193724337996, + "learning_rate": 3.317831486425267e-06, + "loss": 0.2785, + "step": 4687 + }, + { + "epoch": 2.239890090197718, + "grad_norm": 0.49773006035607, + "learning_rate": 3.313914157080218e-06, + "loss": 0.2723, + "step": 4688 + }, + { + "epoch": 2.2403679589032914, + "grad_norm": 0.4908921121030998, + "learning_rate": 3.3099986823374407e-06, + "loss": 0.264, + "step": 4689 + }, + { + "epoch": 2.2408458276088643, + "grad_norm": 0.5288018646857777, + "learning_rate": 3.3060850632830167e-06, + "loss": 0.2614, + "step": 4690 + }, + { + "epoch": 2.2413236963144376, + "grad_norm": 0.4422685276222856, + "learning_rate": 3.3021733010025203e-06, + "loss": 0.268, + "step": 4691 + }, + { + "epoch": 2.241801565020011, + "grad_norm": 0.4989573893125483, + "learning_rate": 3.298263396581003e-06, + "loss": 0.266, + "step": 4692 + }, + { + "epoch": 2.242279433725584, + "grad_norm": 0.48611829997172, + "learning_rate": 3.294355351102999e-06, + "loss": 0.2527, + "step": 4693 + }, + { + "epoch": 2.242757302431157, + "grad_norm": 0.4975978815944788, + "learning_rate": 3.2904491656525396e-06, + "loss": 0.2634, + "step": 4694 + }, + { + "epoch": 2.24323517113673, + "grad_norm": 0.47584888029570965, + "learning_rate": 3.286544841313126e-06, + "loss": 0.2589, + "step": 4695 + }, + { + "epoch": 2.2437130398423033, + "grad_norm": 0.4684430376922013, + "learning_rate": 3.2826423791677475e-06, + "loss": 0.2665, + "step": 4696 + }, + { + "epoch": 2.2441909085478766, + "grad_norm": 0.4697921966643509, + "learning_rate": 3.278741780298883e-06, + "loss": 0.2684, + "step": 4697 + }, + { + "epoch": 2.2446687772534495, + "grad_norm": 0.4471854069336229, + "learning_rate": 3.2748430457884883e-06, + "loss": 0.2858, + "step": 4698 + }, + { + "epoch": 2.245146645959023, + "grad_norm": 0.5125951863846561, + "learning_rate": 3.2709461767180007e-06, + "loss": 0.2806, + "step": 4699 + }, + { + "epoch": 2.2456245146645957, + "grad_norm": 0.5015064985997865, + "learning_rate": 3.2670511741683475e-06, + "loss": 0.2919, + "step": 4700 + }, + { + "epoch": 2.246102383370169, + "grad_norm": 0.441527020343577, + "learning_rate": 3.2631580392199316e-06, + "loss": 0.281, + "step": 4701 + }, + { + "epoch": 2.2465802520757423, + "grad_norm": 0.48081974361271224, + "learning_rate": 3.259266772952636e-06, + "loss": 0.2814, + "step": 4702 + }, + { + "epoch": 2.247058120781315, + "grad_norm": 0.5288631768506852, + "learning_rate": 3.2553773764458374e-06, + "loss": 0.2626, + "step": 4703 + }, + { + "epoch": 2.2475359894868885, + "grad_norm": 0.46254313690849835, + "learning_rate": 3.251489850778381e-06, + "loss": 0.2711, + "step": 4704 + }, + { + "epoch": 2.248013858192462, + "grad_norm": 0.4737518642755567, + "learning_rate": 3.2476041970285945e-06, + "loss": 0.2625, + "step": 4705 + }, + { + "epoch": 2.2484917268980347, + "grad_norm": 0.44502937263508435, + "learning_rate": 3.2437204162742975e-06, + "loss": 0.2571, + "step": 4706 + }, + { + "epoch": 2.248969595603608, + "grad_norm": 0.45119405560525677, + "learning_rate": 3.2398385095927775e-06, + "loss": 0.2877, + "step": 4707 + }, + { + "epoch": 2.249447464309181, + "grad_norm": 0.45027149302862923, + "learning_rate": 3.2359584780608055e-06, + "loss": 0.2709, + "step": 4708 + }, + { + "epoch": 2.249925333014754, + "grad_norm": 0.4419939110436344, + "learning_rate": 3.232080322754638e-06, + "loss": 0.2712, + "step": 4709 + }, + { + "epoch": 2.2504032017203275, + "grad_norm": 0.45427067027017565, + "learning_rate": 3.2282040447500063e-06, + "loss": 0.282, + "step": 4710 + }, + { + "epoch": 2.2508810704259004, + "grad_norm": 0.45850633742518837, + "learning_rate": 3.2243296451221164e-06, + "loss": 0.2502, + "step": 4711 + }, + { + "epoch": 2.2513589391314737, + "grad_norm": 0.4575443548721124, + "learning_rate": 3.220457124945665e-06, + "loss": 0.2625, + "step": 4712 + }, + { + "epoch": 2.251836807837047, + "grad_norm": 0.4657370801081544, + "learning_rate": 3.2165864852948147e-06, + "loss": 0.2373, + "step": 4713 + }, + { + "epoch": 2.25231467654262, + "grad_norm": 0.45836793153138744, + "learning_rate": 3.21271772724322e-06, + "loss": 0.2762, + "step": 4714 + }, + { + "epoch": 2.252792545248193, + "grad_norm": 0.4946186941630403, + "learning_rate": 3.208850851863998e-06, + "loss": 0.2888, + "step": 4715 + }, + { + "epoch": 2.253270413953766, + "grad_norm": 0.45248472166716014, + "learning_rate": 3.20498586022976e-06, + "loss": 0.2702, + "step": 4716 + }, + { + "epoch": 2.2537482826593394, + "grad_norm": 0.4594724499710993, + "learning_rate": 3.201122753412582e-06, + "loss": 0.2632, + "step": 4717 + }, + { + "epoch": 2.2542261513649127, + "grad_norm": 0.48222462052144954, + "learning_rate": 3.1972615324840197e-06, + "loss": 0.2553, + "step": 4718 + }, + { + "epoch": 2.2547040200704855, + "grad_norm": 0.4616647404690517, + "learning_rate": 3.193402198515112e-06, + "loss": 0.2704, + "step": 4719 + }, + { + "epoch": 2.255181888776059, + "grad_norm": 0.46141169216599737, + "learning_rate": 3.189544752576369e-06, + "loss": 0.2547, + "step": 4720 + }, + { + "epoch": 2.2556597574816317, + "grad_norm": 0.46118690700708775, + "learning_rate": 3.1856891957377735e-06, + "loss": 0.2678, + "step": 4721 + }, + { + "epoch": 2.256137626187205, + "grad_norm": 0.45651170157809823, + "learning_rate": 3.1818355290687962e-06, + "loss": 0.2782, + "step": 4722 + }, + { + "epoch": 2.2566154948927784, + "grad_norm": 0.49019149614604685, + "learning_rate": 3.177983753638373e-06, + "loss": 0.2498, + "step": 4723 + }, + { + "epoch": 2.257093363598351, + "grad_norm": 0.47354007893571354, + "learning_rate": 3.174133870514914e-06, + "loss": 0.2607, + "step": 4724 + }, + { + "epoch": 2.2575712323039245, + "grad_norm": 0.4752144096947728, + "learning_rate": 3.1702858807663175e-06, + "loss": 0.2769, + "step": 4725 + }, + { + "epoch": 2.2580491010094974, + "grad_norm": 0.48451667423176453, + "learning_rate": 3.166439785459943e-06, + "loss": 0.2708, + "step": 4726 + }, + { + "epoch": 2.2585269697150707, + "grad_norm": 0.4570766080552371, + "learning_rate": 3.1625955856626267e-06, + "loss": 0.2646, + "step": 4727 + }, + { + "epoch": 2.259004838420644, + "grad_norm": 0.4445274144500832, + "learning_rate": 3.1587532824406887e-06, + "loss": 0.2777, + "step": 4728 + }, + { + "epoch": 2.259482707126217, + "grad_norm": 0.4703985338457953, + "learning_rate": 3.1549128768599123e-06, + "loss": 0.2724, + "step": 4729 + }, + { + "epoch": 2.25996057583179, + "grad_norm": 0.46555737939351904, + "learning_rate": 3.151074369985556e-06, + "loss": 0.2716, + "step": 4730 + }, + { + "epoch": 2.2604384445373635, + "grad_norm": 0.45652113924751747, + "learning_rate": 3.147237762882359e-06, + "loss": 0.2695, + "step": 4731 + }, + { + "epoch": 2.2609163132429364, + "grad_norm": 0.4598715452430841, + "learning_rate": 3.143403056614527e-06, + "loss": 0.2631, + "step": 4732 + }, + { + "epoch": 2.2613941819485097, + "grad_norm": 0.5304735402801358, + "learning_rate": 3.139570252245734e-06, + "loss": 0.2731, + "step": 4733 + }, + { + "epoch": 2.261872050654083, + "grad_norm": 0.47206865977241064, + "learning_rate": 3.135739350839141e-06, + "loss": 0.2752, + "step": 4734 + }, + { + "epoch": 2.262349919359656, + "grad_norm": 0.4527194547413837, + "learning_rate": 3.131910353457369e-06, + "loss": 0.2663, + "step": 4735 + }, + { + "epoch": 2.262827788065229, + "grad_norm": 0.4742204418152278, + "learning_rate": 3.1280832611625112e-06, + "loss": 0.2727, + "step": 4736 + }, + { + "epoch": 2.263305656770802, + "grad_norm": 0.4698999007147011, + "learning_rate": 3.12425807501614e-06, + "loss": 0.2672, + "step": 4737 + }, + { + "epoch": 2.2637835254763754, + "grad_norm": 0.4507660759859787, + "learning_rate": 3.1204347960792935e-06, + "loss": 0.3011, + "step": 4738 + }, + { + "epoch": 2.2642613941819487, + "grad_norm": 0.47815020250201823, + "learning_rate": 3.116613425412478e-06, + "loss": 0.2707, + "step": 4739 + }, + { + "epoch": 2.2647392628875216, + "grad_norm": 0.466778767224418, + "learning_rate": 3.112793964075681e-06, + "loss": 0.2742, + "step": 4740 + }, + { + "epoch": 2.265217131593095, + "grad_norm": 0.46764545840736804, + "learning_rate": 3.1089764131283497e-06, + "loss": 0.2618, + "step": 4741 + }, + { + "epoch": 2.2656950002986678, + "grad_norm": 0.4598354811835029, + "learning_rate": 3.105160773629402e-06, + "loss": 0.2727, + "step": 4742 + }, + { + "epoch": 2.266172869004241, + "grad_norm": 0.45939402218354186, + "learning_rate": 3.1013470466372373e-06, + "loss": 0.2617, + "step": 4743 + }, + { + "epoch": 2.2666507377098144, + "grad_norm": 0.689169049474016, + "learning_rate": 3.0975352332097107e-06, + "loss": 0.2719, + "step": 4744 + }, + { + "epoch": 2.2671286064153873, + "grad_norm": 0.4457399095001578, + "learning_rate": 3.0937253344041507e-06, + "loss": 0.2641, + "step": 4745 + }, + { + "epoch": 2.2676064751209606, + "grad_norm": 0.5931896862224231, + "learning_rate": 3.0899173512773607e-06, + "loss": 0.271, + "step": 4746 + }, + { + "epoch": 2.2680843438265335, + "grad_norm": 0.44667772647454984, + "learning_rate": 3.0861112848856024e-06, + "loss": 0.2756, + "step": 4747 + }, + { + "epoch": 2.2685622125321068, + "grad_norm": 0.4870112961699444, + "learning_rate": 3.082307136284616e-06, + "loss": 0.2586, + "step": 4748 + }, + { + "epoch": 2.26904008123768, + "grad_norm": 1.0214438039442433, + "learning_rate": 3.0785049065296057e-06, + "loss": 0.2663, + "step": 4749 + }, + { + "epoch": 2.269517949943253, + "grad_norm": 0.5664811753038376, + "learning_rate": 3.074704596675242e-06, + "loss": 0.2616, + "step": 4750 + }, + { + "epoch": 2.2699958186488263, + "grad_norm": 0.4869676974104964, + "learning_rate": 3.07090620777566e-06, + "loss": 0.2865, + "step": 4751 + }, + { + "epoch": 2.270473687354399, + "grad_norm": 0.5088607622709265, + "learning_rate": 3.067109740884472e-06, + "loss": 0.268, + "step": 4752 + }, + { + "epoch": 2.2709515560599725, + "grad_norm": 0.4469599957069902, + "learning_rate": 3.063315197054747e-06, + "loss": 0.2721, + "step": 4753 + }, + { + "epoch": 2.2714294247655458, + "grad_norm": 0.4524405576091524, + "learning_rate": 3.0595225773390225e-06, + "loss": 0.2709, + "step": 4754 + }, + { + "epoch": 2.2719072934711186, + "grad_norm": 0.5863151040314684, + "learning_rate": 3.055731882789311e-06, + "loss": 0.2697, + "step": 4755 + }, + { + "epoch": 2.272385162176692, + "grad_norm": 0.4626038538364683, + "learning_rate": 3.05194311445708e-06, + "loss": 0.2539, + "step": 4756 + }, + { + "epoch": 2.2728630308822653, + "grad_norm": 0.45285520691551345, + "learning_rate": 3.0481562733932647e-06, + "loss": 0.2605, + "step": 4757 + }, + { + "epoch": 2.273340899587838, + "grad_norm": 0.48766103070408395, + "learning_rate": 3.0443713606482727e-06, + "loss": 0.2584, + "step": 4758 + }, + { + "epoch": 2.2738187682934115, + "grad_norm": 0.49630506462100404, + "learning_rate": 3.0405883772719715e-06, + "loss": 0.2783, + "step": 4759 + }, + { + "epoch": 2.2742966369989848, + "grad_norm": 0.4608594610607203, + "learning_rate": 3.0368073243136874e-06, + "loss": 0.2561, + "step": 4760 + }, + { + "epoch": 2.2747745057045576, + "grad_norm": 0.4530972395926978, + "learning_rate": 3.033028202822228e-06, + "loss": 0.2554, + "step": 4761 + }, + { + "epoch": 2.275252374410131, + "grad_norm": 0.45511832040091293, + "learning_rate": 3.029251013845849e-06, + "loss": 0.2577, + "step": 4762 + }, + { + "epoch": 2.275730243115704, + "grad_norm": 0.9922747694397978, + "learning_rate": 3.0254757584322736e-06, + "loss": 0.2623, + "step": 4763 + }, + { + "epoch": 2.276208111821277, + "grad_norm": 0.48096765381573897, + "learning_rate": 3.0217024376286984e-06, + "loss": 0.2493, + "step": 4764 + }, + { + "epoch": 2.2766859805268505, + "grad_norm": 0.47667943487788295, + "learning_rate": 3.0179310524817707e-06, + "loss": 0.2859, + "step": 4765 + }, + { + "epoch": 2.2771638492324233, + "grad_norm": 0.43920123160920244, + "learning_rate": 3.0141616040376052e-06, + "loss": 0.2761, + "step": 4766 + }, + { + "epoch": 2.2776417179379966, + "grad_norm": 0.485107810636819, + "learning_rate": 3.010394093341785e-06, + "loss": 0.2865, + "step": 4767 + }, + { + "epoch": 2.2781195866435695, + "grad_norm": 0.4621289185973684, + "learning_rate": 3.00662852143935e-06, + "loss": 0.2788, + "step": 4768 + }, + { + "epoch": 2.278597455349143, + "grad_norm": 0.4850819067216184, + "learning_rate": 3.002864889374798e-06, + "loss": 0.2655, + "step": 4769 + }, + { + "epoch": 2.279075324054716, + "grad_norm": 0.47125925955991355, + "learning_rate": 2.9991031981921026e-06, + "loss": 0.2511, + "step": 4770 + }, + { + "epoch": 2.279553192760289, + "grad_norm": 1.1105889455924962, + "learning_rate": 2.9953434489346856e-06, + "loss": 0.2551, + "step": 4771 + }, + { + "epoch": 2.2800310614658623, + "grad_norm": 0.4773544991225756, + "learning_rate": 2.9915856426454324e-06, + "loss": 0.2583, + "step": 4772 + }, + { + "epoch": 2.280508930171435, + "grad_norm": 0.49707858432136753, + "learning_rate": 2.987829780366699e-06, + "loss": 0.2675, + "step": 4773 + }, + { + "epoch": 2.2809867988770085, + "grad_norm": 0.47711687874889647, + "learning_rate": 2.984075863140292e-06, + "loss": 0.2658, + "step": 4774 + }, + { + "epoch": 2.281464667582582, + "grad_norm": 0.585688509831411, + "learning_rate": 2.9803238920074784e-06, + "loss": 0.2547, + "step": 4775 + }, + { + "epoch": 2.2819425362881547, + "grad_norm": 0.46857304420027696, + "learning_rate": 2.976573868008995e-06, + "loss": 0.2658, + "step": 4776 + }, + { + "epoch": 2.282420404993728, + "grad_norm": 0.4558766138524307, + "learning_rate": 2.9728257921850302e-06, + "loss": 0.2586, + "step": 4777 + }, + { + "epoch": 2.282898273699301, + "grad_norm": 0.45756901243901504, + "learning_rate": 2.9690796655752306e-06, + "loss": 0.2689, + "step": 4778 + }, + { + "epoch": 2.283376142404874, + "grad_norm": 0.47832703138164223, + "learning_rate": 2.965335489218711e-06, + "loss": 0.2786, + "step": 4779 + }, + { + "epoch": 2.2838540111104475, + "grad_norm": 0.45406741476852475, + "learning_rate": 2.961593264154038e-06, + "loss": 0.2628, + "step": 4780 + }, + { + "epoch": 2.2843318798160204, + "grad_norm": 0.47436505180344063, + "learning_rate": 2.9578529914192342e-06, + "loss": 0.2603, + "step": 4781 + }, + { + "epoch": 2.2848097485215937, + "grad_norm": 0.45974865789697694, + "learning_rate": 2.954114672051789e-06, + "loss": 0.2602, + "step": 4782 + }, + { + "epoch": 2.285287617227167, + "grad_norm": 0.5087537120467954, + "learning_rate": 2.9503783070886504e-06, + "loss": 0.2638, + "step": 4783 + }, + { + "epoch": 2.28576548593274, + "grad_norm": 0.4593223370985407, + "learning_rate": 2.946643897566216e-06, + "loss": 0.26, + "step": 4784 + }, + { + "epoch": 2.286243354638313, + "grad_norm": 0.4739581626972536, + "learning_rate": 2.9429114445203423e-06, + "loss": 0.2834, + "step": 4785 + }, + { + "epoch": 2.2867212233438865, + "grad_norm": 0.4473066092125534, + "learning_rate": 2.939180948986352e-06, + "loss": 0.258, + "step": 4786 + }, + { + "epoch": 2.2871990920494594, + "grad_norm": 0.7178714404426283, + "learning_rate": 2.9354524119990156e-06, + "loss": 0.264, + "step": 4787 + }, + { + "epoch": 2.2876769607550327, + "grad_norm": 0.4518483998022226, + "learning_rate": 2.9317258345925603e-06, + "loss": 0.2552, + "step": 4788 + }, + { + "epoch": 2.2881548294606056, + "grad_norm": 0.4592657516604581, + "learning_rate": 2.92800121780068e-06, + "loss": 0.2758, + "step": 4789 + }, + { + "epoch": 2.288632698166179, + "grad_norm": 0.703821050907462, + "learning_rate": 2.924278562656514e-06, + "loss": 0.2712, + "step": 4790 + }, + { + "epoch": 2.289110566871752, + "grad_norm": 0.5327889383144262, + "learning_rate": 2.9205578701926575e-06, + "loss": 0.2548, + "step": 4791 + }, + { + "epoch": 2.289588435577325, + "grad_norm": 0.4542697805884633, + "learning_rate": 2.916839141441172e-06, + "loss": 0.2648, + "step": 4792 + }, + { + "epoch": 2.2900663042828984, + "grad_norm": 0.4471368664844579, + "learning_rate": 2.913122377433564e-06, + "loss": 0.2701, + "step": 4793 + }, + { + "epoch": 2.2905441729884712, + "grad_norm": 0.49515537031904927, + "learning_rate": 2.9094075792007948e-06, + "loss": 0.2612, + "step": 4794 + }, + { + "epoch": 2.2910220416940446, + "grad_norm": 0.5023092878736718, + "learning_rate": 2.90569474777329e-06, + "loss": 0.2731, + "step": 4795 + }, + { + "epoch": 2.291499910399618, + "grad_norm": 0.5203729973321249, + "learning_rate": 2.901983884180921e-06, + "loss": 0.2693, + "step": 4796 + }, + { + "epoch": 2.2919777791051907, + "grad_norm": 0.458762113548676, + "learning_rate": 2.8982749894530128e-06, + "loss": 0.2528, + "step": 4797 + }, + { + "epoch": 2.292455647810764, + "grad_norm": 0.4454688102007361, + "learning_rate": 2.8945680646183527e-06, + "loss": 0.2642, + "step": 4798 + }, + { + "epoch": 2.292933516516337, + "grad_norm": 0.47272093877831456, + "learning_rate": 2.8908631107051743e-06, + "loss": 0.2845, + "step": 4799 + }, + { + "epoch": 2.2934113852219102, + "grad_norm": 0.44051662539793884, + "learning_rate": 2.8871601287411634e-06, + "loss": 0.2814, + "step": 4800 + }, + { + "epoch": 2.2938892539274836, + "grad_norm": 0.44919987478542955, + "learning_rate": 2.8834591197534668e-06, + "loss": 0.2616, + "step": 4801 + }, + { + "epoch": 2.2943671226330564, + "grad_norm": 0.4655075873793121, + "learning_rate": 2.879760084768677e-06, + "loss": 0.2598, + "step": 4802 + }, + { + "epoch": 2.2948449913386297, + "grad_norm": 0.44695659849319586, + "learning_rate": 2.8760630248128374e-06, + "loss": 0.2809, + "step": 4803 + }, + { + "epoch": 2.2953228600442026, + "grad_norm": 0.46112658531258754, + "learning_rate": 2.8723679409114536e-06, + "loss": 0.2625, + "step": 4804 + }, + { + "epoch": 2.295800728749776, + "grad_norm": 0.4656636436014579, + "learning_rate": 2.8686748340894744e-06, + "loss": 0.2731, + "step": 4805 + }, + { + "epoch": 2.2962785974553492, + "grad_norm": 0.5844196537588953, + "learning_rate": 2.864983705371298e-06, + "loss": 0.2663, + "step": 4806 + }, + { + "epoch": 2.296756466160922, + "grad_norm": 0.46998718606495676, + "learning_rate": 2.861294555780786e-06, + "loss": 0.2574, + "step": 4807 + }, + { + "epoch": 2.2972343348664954, + "grad_norm": 0.4823897817406597, + "learning_rate": 2.8576073863412402e-06, + "loss": 0.2823, + "step": 4808 + }, + { + "epoch": 2.2977122035720687, + "grad_norm": 0.4531753739973854, + "learning_rate": 2.8539221980754115e-06, + "loss": 0.2541, + "step": 4809 + }, + { + "epoch": 2.2981900722776416, + "grad_norm": 0.4579423043984502, + "learning_rate": 2.850238992005514e-06, + "loss": 0.2737, + "step": 4810 + }, + { + "epoch": 2.298667940983215, + "grad_norm": 0.44211975158230343, + "learning_rate": 2.8465577691532e-06, + "loss": 0.2716, + "step": 4811 + }, + { + "epoch": 2.2991458096887882, + "grad_norm": 0.4480696060954269, + "learning_rate": 2.8428785305395733e-06, + "loss": 0.261, + "step": 4812 + }, + { + "epoch": 2.299623678394361, + "grad_norm": 0.4605681190004609, + "learning_rate": 2.8392012771851963e-06, + "loss": 0.2693, + "step": 4813 + }, + { + "epoch": 2.3001015470999344, + "grad_norm": 0.6621094950095469, + "learning_rate": 2.83552601011007e-06, + "loss": 0.2694, + "step": 4814 + }, + { + "epoch": 2.3005794158055073, + "grad_norm": 0.4783897161449604, + "learning_rate": 2.8318527303336465e-06, + "loss": 0.2756, + "step": 4815 + }, + { + "epoch": 2.3010572845110806, + "grad_norm": 0.4512383810963222, + "learning_rate": 2.828181438874832e-06, + "loss": 0.2712, + "step": 4816 + }, + { + "epoch": 2.301535153216654, + "grad_norm": 0.4712970827373998, + "learning_rate": 2.8245121367519812e-06, + "loss": 0.2603, + "step": 4817 + }, + { + "epoch": 2.302013021922227, + "grad_norm": 0.47038727743638786, + "learning_rate": 2.820844824982889e-06, + "loss": 0.2601, + "step": 4818 + }, + { + "epoch": 2.3024908906278, + "grad_norm": 0.4593763213754035, + "learning_rate": 2.817179504584802e-06, + "loss": 0.2699, + "step": 4819 + }, + { + "epoch": 2.302968759333373, + "grad_norm": 0.4753946259917545, + "learning_rate": 2.81351617657442e-06, + "loss": 0.2676, + "step": 4820 + }, + { + "epoch": 2.3034466280389463, + "grad_norm": 0.46869051732758144, + "learning_rate": 2.8098548419678838e-06, + "loss": 0.2838, + "step": 4821 + }, + { + "epoch": 2.3039244967445196, + "grad_norm": 0.45543892367550015, + "learning_rate": 2.8061955017807797e-06, + "loss": 0.2707, + "step": 4822 + }, + { + "epoch": 2.3044023654500925, + "grad_norm": 0.48105929755291854, + "learning_rate": 2.8025381570281495e-06, + "loss": 0.2735, + "step": 4823 + }, + { + "epoch": 2.304880234155666, + "grad_norm": 0.4562110398780488, + "learning_rate": 2.7988828087244735e-06, + "loss": 0.2584, + "step": 4824 + }, + { + "epoch": 2.3053581028612387, + "grad_norm": 0.4708490596777413, + "learning_rate": 2.795229457883678e-06, + "loss": 0.2521, + "step": 4825 + }, + { + "epoch": 2.305835971566812, + "grad_norm": 0.47823032227536855, + "learning_rate": 2.7915781055191437e-06, + "loss": 0.2446, + "step": 4826 + }, + { + "epoch": 2.3063138402723853, + "grad_norm": 0.4583497787749354, + "learning_rate": 2.7879287526436884e-06, + "loss": 0.2682, + "step": 4827 + }, + { + "epoch": 2.306791708977958, + "grad_norm": 0.5195107891663974, + "learning_rate": 2.784281400269575e-06, + "loss": 0.2639, + "step": 4828 + }, + { + "epoch": 2.3072695776835315, + "grad_norm": 0.44700898622956803, + "learning_rate": 2.7806360494085218e-06, + "loss": 0.2704, + "step": 4829 + }, + { + "epoch": 2.307747446389105, + "grad_norm": 0.45593935022847376, + "learning_rate": 2.7769927010716814e-06, + "loss": 0.2561, + "step": 4830 + }, + { + "epoch": 2.3082253150946777, + "grad_norm": 0.4779945808794087, + "learning_rate": 2.77335135626965e-06, + "loss": 0.2612, + "step": 4831 + }, + { + "epoch": 2.308703183800251, + "grad_norm": 0.4650534484982984, + "learning_rate": 2.76971201601248e-06, + "loss": 0.2704, + "step": 4832 + }, + { + "epoch": 2.309181052505824, + "grad_norm": 0.45968865169832307, + "learning_rate": 2.7660746813096575e-06, + "loss": 0.2788, + "step": 4833 + }, + { + "epoch": 2.309658921211397, + "grad_norm": 0.5586814813353537, + "learning_rate": 2.76243935317011e-06, + "loss": 0.2586, + "step": 4834 + }, + { + "epoch": 2.3101367899169705, + "grad_norm": 0.46957361443859275, + "learning_rate": 2.7588060326022205e-06, + "loss": 0.2636, + "step": 4835 + }, + { + "epoch": 2.3106146586225433, + "grad_norm": 0.5799118470767503, + "learning_rate": 2.755174720613806e-06, + "loss": 0.2903, + "step": 4836 + }, + { + "epoch": 2.3110925273281167, + "grad_norm": 0.48771811958787314, + "learning_rate": 2.7515454182121238e-06, + "loss": 0.2648, + "step": 4837 + }, + { + "epoch": 2.31157039603369, + "grad_norm": 0.4454160846341557, + "learning_rate": 2.7479181264038847e-06, + "loss": 0.2673, + "step": 4838 + }, + { + "epoch": 2.312048264739263, + "grad_norm": 0.48485972776665004, + "learning_rate": 2.7442928461952333e-06, + "loss": 0.2792, + "step": 4839 + }, + { + "epoch": 2.312526133444836, + "grad_norm": 0.44389446972461244, + "learning_rate": 2.740669578591755e-06, + "loss": 0.2575, + "step": 4840 + }, + { + "epoch": 2.313004002150409, + "grad_norm": 0.5119507485701839, + "learning_rate": 2.7370483245984857e-06, + "loss": 0.2579, + "step": 4841 + }, + { + "epoch": 2.3134818708559823, + "grad_norm": 0.479630718784202, + "learning_rate": 2.733429085219895e-06, + "loss": 0.2533, + "step": 4842 + }, + { + "epoch": 2.3139597395615557, + "grad_norm": 0.45539893096166484, + "learning_rate": 2.7298118614598934e-06, + "loss": 0.2874, + "step": 4843 + }, + { + "epoch": 2.3144376082671285, + "grad_norm": 0.4597008048466559, + "learning_rate": 2.726196654321841e-06, + "loss": 0.2775, + "step": 4844 + }, + { + "epoch": 2.314915476972702, + "grad_norm": 0.4884323297495012, + "learning_rate": 2.7225834648085282e-06, + "loss": 0.2512, + "step": 4845 + }, + { + "epoch": 2.3153933456782747, + "grad_norm": 0.4765028408676871, + "learning_rate": 2.7189722939221875e-06, + "loss": 0.2589, + "step": 4846 + }, + { + "epoch": 2.315871214383848, + "grad_norm": 0.4660406532791925, + "learning_rate": 2.715363142664501e-06, + "loss": 0.2648, + "step": 4847 + }, + { + "epoch": 2.3163490830894213, + "grad_norm": 0.4747962483625691, + "learning_rate": 2.711756012036577e-06, + "loss": 0.2778, + "step": 4848 + }, + { + "epoch": 2.316826951794994, + "grad_norm": 0.4632488608800758, + "learning_rate": 2.708150903038972e-06, + "loss": 0.275, + "step": 4849 + }, + { + "epoch": 2.3173048205005675, + "grad_norm": 0.5453668545195846, + "learning_rate": 2.7045478166716843e-06, + "loss": 0.2755, + "step": 4850 + }, + { + "epoch": 2.3177826892061404, + "grad_norm": 0.5001628937077411, + "learning_rate": 2.7009467539341426e-06, + "loss": 0.2691, + "step": 4851 + }, + { + "epoch": 2.3182605579117137, + "grad_norm": 0.4553954046591676, + "learning_rate": 2.6973477158252146e-06, + "loss": 0.2655, + "step": 4852 + }, + { + "epoch": 2.318738426617287, + "grad_norm": 0.4767807917673003, + "learning_rate": 2.6937507033432177e-06, + "loss": 0.2806, + "step": 4853 + }, + { + "epoch": 2.31921629532286, + "grad_norm": 0.45179451190892383, + "learning_rate": 2.690155717485895e-06, + "loss": 0.2696, + "step": 4854 + }, + { + "epoch": 2.319694164028433, + "grad_norm": 0.45905304275528375, + "learning_rate": 2.6865627592504295e-06, + "loss": 0.2672, + "step": 4855 + }, + { + "epoch": 2.3201720327340065, + "grad_norm": 0.4375085321749962, + "learning_rate": 2.6829718296334516e-06, + "loss": 0.2562, + "step": 4856 + }, + { + "epoch": 2.3206499014395794, + "grad_norm": 0.45149139878521694, + "learning_rate": 2.6793829296310183e-06, + "loss": 0.2903, + "step": 4857 + }, + { + "epoch": 2.3211277701451527, + "grad_norm": 0.4575764223416466, + "learning_rate": 2.6757960602386223e-06, + "loss": 0.2534, + "step": 4858 + }, + { + "epoch": 2.3216056388507256, + "grad_norm": 0.4537222729437179, + "learning_rate": 2.6722112224512063e-06, + "loss": 0.2518, + "step": 4859 + }, + { + "epoch": 2.322083507556299, + "grad_norm": 0.45191436383379496, + "learning_rate": 2.668628417263137e-06, + "loss": 0.2737, + "step": 4860 + }, + { + "epoch": 2.322561376261872, + "grad_norm": 0.4670433808850828, + "learning_rate": 2.6650476456682195e-06, + "loss": 0.2645, + "step": 4861 + }, + { + "epoch": 2.323039244967445, + "grad_norm": 0.46184013428909193, + "learning_rate": 2.661468908659701e-06, + "loss": 0.2703, + "step": 4862 + }, + { + "epoch": 2.3235171136730184, + "grad_norm": 0.46573908878240283, + "learning_rate": 2.6578922072302572e-06, + "loss": 0.2681, + "step": 4863 + }, + { + "epoch": 2.3239949823785917, + "grad_norm": 0.4495053674176749, + "learning_rate": 2.6543175423720004e-06, + "loss": 0.2613, + "step": 4864 + }, + { + "epoch": 2.3244728510841646, + "grad_norm": 0.48067303236066894, + "learning_rate": 2.6507449150764852e-06, + "loss": 0.2743, + "step": 4865 + }, + { + "epoch": 2.324950719789738, + "grad_norm": 0.450687819523822, + "learning_rate": 2.6471743263346903e-06, + "loss": 0.2762, + "step": 4866 + }, + { + "epoch": 2.3254285884953108, + "grad_norm": 0.5034337207294447, + "learning_rate": 2.643605777137034e-06, + "loss": 0.2749, + "step": 4867 + }, + { + "epoch": 2.325906457200884, + "grad_norm": 0.5112858649436341, + "learning_rate": 2.6400392684733735e-06, + "loss": 0.2726, + "step": 4868 + }, + { + "epoch": 2.3263843259064574, + "grad_norm": 0.447531179096655, + "learning_rate": 2.636474801332992e-06, + "loss": 0.2798, + "step": 4869 + }, + { + "epoch": 2.3268621946120303, + "grad_norm": 0.47053088394428416, + "learning_rate": 2.632912376704607e-06, + "loss": 0.2772, + "step": 4870 + }, + { + "epoch": 2.3273400633176036, + "grad_norm": 0.4892476562482173, + "learning_rate": 2.629351995576379e-06, + "loss": 0.2728, + "step": 4871 + }, + { + "epoch": 2.3278179320231764, + "grad_norm": 0.4496411171519612, + "learning_rate": 2.6257936589358914e-06, + "loss": 0.2715, + "step": 4872 + }, + { + "epoch": 2.3282958007287498, + "grad_norm": 0.4706505012171985, + "learning_rate": 2.6222373677701607e-06, + "loss": 0.2611, + "step": 4873 + }, + { + "epoch": 2.328773669434323, + "grad_norm": 0.46352164932756873, + "learning_rate": 2.618683123065646e-06, + "loss": 0.2802, + "step": 4874 + }, + { + "epoch": 2.329251538139896, + "grad_norm": 0.4668367012503617, + "learning_rate": 2.615130925808228e-06, + "loss": 0.255, + "step": 4875 + }, + { + "epoch": 2.3297294068454693, + "grad_norm": 0.4526742078440504, + "learning_rate": 2.6115807769832226e-06, + "loss": 0.2543, + "step": 4876 + }, + { + "epoch": 2.330207275551042, + "grad_norm": 0.48303918462763046, + "learning_rate": 2.6080326775753816e-06, + "loss": 0.2816, + "step": 4877 + }, + { + "epoch": 2.3306851442566154, + "grad_norm": 0.5333148870731564, + "learning_rate": 2.604486628568885e-06, + "loss": 0.274, + "step": 4878 + }, + { + "epoch": 2.3311630129621888, + "grad_norm": 0.4581104638202767, + "learning_rate": 2.6009426309473397e-06, + "loss": 0.2565, + "step": 4879 + }, + { + "epoch": 2.3316408816677616, + "grad_norm": 0.4665861539340606, + "learning_rate": 2.597400685693795e-06, + "loss": 0.2801, + "step": 4880 + }, + { + "epoch": 2.332118750373335, + "grad_norm": 0.5767159313567876, + "learning_rate": 2.59386079379072e-06, + "loss": 0.2744, + "step": 4881 + }, + { + "epoch": 2.3325966190789083, + "grad_norm": 0.6636916277687075, + "learning_rate": 2.590322956220015e-06, + "loss": 0.2714, + "step": 4882 + }, + { + "epoch": 2.333074487784481, + "grad_norm": 0.4707596869750743, + "learning_rate": 2.586787173963019e-06, + "loss": 0.274, + "step": 4883 + }, + { + "epoch": 2.3335523564900544, + "grad_norm": 0.5135378298995066, + "learning_rate": 2.5832534480004955e-06, + "loss": 0.2703, + "step": 4884 + }, + { + "epoch": 2.3340302251956278, + "grad_norm": 0.4520964774142493, + "learning_rate": 2.5797217793126373e-06, + "loss": 0.2639, + "step": 4885 + }, + { + "epoch": 2.3345080939012006, + "grad_norm": 0.5034516755934882, + "learning_rate": 2.5761921688790635e-06, + "loss": 0.2665, + "step": 4886 + }, + { + "epoch": 2.334985962606774, + "grad_norm": 0.44517562515923137, + "learning_rate": 2.5726646176788307e-06, + "loss": 0.2736, + "step": 4887 + }, + { + "epoch": 2.335463831312347, + "grad_norm": 0.5542169246110865, + "learning_rate": 2.5691391266904165e-06, + "loss": 0.2665, + "step": 4888 + }, + { + "epoch": 2.33594170001792, + "grad_norm": 0.4735315168725527, + "learning_rate": 2.5656156968917277e-06, + "loss": 0.257, + "step": 4889 + }, + { + "epoch": 2.3364195687234934, + "grad_norm": 0.48423296811822936, + "learning_rate": 2.5620943292601074e-06, + "loss": 0.2784, + "step": 4890 + }, + { + "epoch": 2.3368974374290663, + "grad_norm": 0.4515981436184163, + "learning_rate": 2.5585750247723183e-06, + "loss": 0.2705, + "step": 4891 + }, + { + "epoch": 2.3373753061346396, + "grad_norm": 0.4733626892725105, + "learning_rate": 2.5550577844045498e-06, + "loss": 0.2558, + "step": 4892 + }, + { + "epoch": 2.3378531748402125, + "grad_norm": 0.5043562480358438, + "learning_rate": 2.551542609132428e-06, + "loss": 0.2436, + "step": 4893 + }, + { + "epoch": 2.338331043545786, + "grad_norm": 0.5000834206474044, + "learning_rate": 2.548029499930997e-06, + "loss": 0.2566, + "step": 4894 + }, + { + "epoch": 2.338808912251359, + "grad_norm": 0.4478341988025251, + "learning_rate": 2.5445184577747305e-06, + "loss": 0.2586, + "step": 4895 + }, + { + "epoch": 2.339286780956932, + "grad_norm": 0.5257945834666825, + "learning_rate": 2.5410094836375343e-06, + "loss": 0.2644, + "step": 4896 + }, + { + "epoch": 2.3397646496625053, + "grad_norm": 0.4531122219133908, + "learning_rate": 2.537502578492733e-06, + "loss": 0.2785, + "step": 4897 + }, + { + "epoch": 2.340242518368078, + "grad_norm": 0.4863425466939351, + "learning_rate": 2.533997743313077e-06, + "loss": 0.2512, + "step": 4898 + }, + { + "epoch": 2.3407203870736515, + "grad_norm": 0.47019660570137406, + "learning_rate": 2.5304949790707512e-06, + "loss": 0.257, + "step": 4899 + }, + { + "epoch": 2.341198255779225, + "grad_norm": 0.46196723454827476, + "learning_rate": 2.52699428673736e-06, + "loss": 0.2603, + "step": 4900 + }, + { + "epoch": 2.3416761244847977, + "grad_norm": 0.5103312090564901, + "learning_rate": 2.5234956672839273e-06, + "loss": 0.2668, + "step": 4901 + }, + { + "epoch": 2.342153993190371, + "grad_norm": 0.442174404107808, + "learning_rate": 2.519999121680917e-06, + "loss": 0.2504, + "step": 4902 + }, + { + "epoch": 2.342631861895944, + "grad_norm": 0.46898344435430483, + "learning_rate": 2.516504650898206e-06, + "loss": 0.269, + "step": 4903 + }, + { + "epoch": 2.343109730601517, + "grad_norm": 0.5615098737605004, + "learning_rate": 2.513012255905095e-06, + "loss": 0.2708, + "step": 4904 + }, + { + "epoch": 2.3435875993070905, + "grad_norm": 0.4488077245674206, + "learning_rate": 2.5095219376703183e-06, + "loss": 0.2668, + "step": 4905 + }, + { + "epoch": 2.3440654680126634, + "grad_norm": 0.4667442735851483, + "learning_rate": 2.5060336971620268e-06, + "loss": 0.2774, + "step": 4906 + }, + { + "epoch": 2.3445433367182367, + "grad_norm": 0.45038553396302394, + "learning_rate": 2.5025475353477933e-06, + "loss": 0.2595, + "step": 4907 + }, + { + "epoch": 2.34502120542381, + "grad_norm": 0.45570886034637664, + "learning_rate": 2.4990634531946247e-06, + "loss": 0.2662, + "step": 4908 + }, + { + "epoch": 2.345499074129383, + "grad_norm": 0.4532263527919922, + "learning_rate": 2.495581451668938e-06, + "loss": 0.2712, + "step": 4909 + }, + { + "epoch": 2.345976942834956, + "grad_norm": 0.5644370739027057, + "learning_rate": 2.4921015317365794e-06, + "loss": 0.2745, + "step": 4910 + }, + { + "epoch": 2.3464548115405295, + "grad_norm": 0.45617320912995984, + "learning_rate": 2.488623694362822e-06, + "loss": 0.254, + "step": 4911 + }, + { + "epoch": 2.3469326802461024, + "grad_norm": 0.743987369148091, + "learning_rate": 2.4851479405123524e-06, + "loss": 0.2797, + "step": 4912 + }, + { + "epoch": 2.3474105489516757, + "grad_norm": 0.4687580573243133, + "learning_rate": 2.4816742711492813e-06, + "loss": 0.2646, + "step": 4913 + }, + { + "epoch": 2.3478884176572485, + "grad_norm": 0.4689346718000839, + "learning_rate": 2.47820268723715e-06, + "loss": 0.2733, + "step": 4914 + }, + { + "epoch": 2.348366286362822, + "grad_norm": 0.4490192177022896, + "learning_rate": 2.4747331897389103e-06, + "loss": 0.2665, + "step": 4915 + }, + { + "epoch": 2.348844155068395, + "grad_norm": 0.49311803151688366, + "learning_rate": 2.471265779616938e-06, + "loss": 0.2461, + "step": 4916 + }, + { + "epoch": 2.349322023773968, + "grad_norm": 0.467178488370915, + "learning_rate": 2.467800457833034e-06, + "loss": 0.2606, + "step": 4917 + }, + { + "epoch": 2.3497998924795414, + "grad_norm": 0.4596377441591877, + "learning_rate": 2.46433722534842e-06, + "loss": 0.2535, + "step": 4918 + }, + { + "epoch": 2.3502777611851142, + "grad_norm": 0.4694260851272706, + "learning_rate": 2.460876083123733e-06, + "loss": 0.289, + "step": 4919 + }, + { + "epoch": 2.3507556298906875, + "grad_norm": 0.45269147527252934, + "learning_rate": 2.4574170321190305e-06, + "loss": 0.2706, + "step": 4920 + }, + { + "epoch": 2.351233498596261, + "grad_norm": 0.4770873996963581, + "learning_rate": 2.4539600732937964e-06, + "loss": 0.2492, + "step": 4921 + }, + { + "epoch": 2.3517113673018337, + "grad_norm": 0.4601878264182159, + "learning_rate": 2.450505207606928e-06, + "loss": 0.2758, + "step": 4922 + }, + { + "epoch": 2.352189236007407, + "grad_norm": 0.4716682705387417, + "learning_rate": 2.4470524360167413e-06, + "loss": 0.2731, + "step": 4923 + }, + { + "epoch": 2.35266710471298, + "grad_norm": 0.4939845519900784, + "learning_rate": 2.4436017594809804e-06, + "loss": 0.2924, + "step": 4924 + }, + { + "epoch": 2.353144973418553, + "grad_norm": 0.4919791373245082, + "learning_rate": 2.440153178956798e-06, + "loss": 0.2826, + "step": 4925 + }, + { + "epoch": 2.3536228421241265, + "grad_norm": 0.48187056236329523, + "learning_rate": 2.436706695400769e-06, + "loss": 0.2796, + "step": 4926 + }, + { + "epoch": 2.3541007108296994, + "grad_norm": 0.4641311091931838, + "learning_rate": 2.43326230976889e-06, + "loss": 0.2629, + "step": 4927 + }, + { + "epoch": 2.3545785795352727, + "grad_norm": 0.4670532060941091, + "learning_rate": 2.4298200230165713e-06, + "loss": 0.2706, + "step": 4928 + }, + { + "epoch": 2.3550564482408456, + "grad_norm": 0.4754697383640508, + "learning_rate": 2.4263798360986403e-06, + "loss": 0.258, + "step": 4929 + }, + { + "epoch": 2.355534316946419, + "grad_norm": 0.4958072214830247, + "learning_rate": 2.42294174996935e-06, + "loss": 0.2731, + "step": 4930 + }, + { + "epoch": 2.356012185651992, + "grad_norm": 0.4688304594840339, + "learning_rate": 2.4195057655823596e-06, + "loss": 0.2762, + "step": 4931 + }, + { + "epoch": 2.356490054357565, + "grad_norm": 0.45489501298416457, + "learning_rate": 2.4160718838907502e-06, + "loss": 0.2682, + "step": 4932 + }, + { + "epoch": 2.3569679230631384, + "grad_norm": 0.5108041983410803, + "learning_rate": 2.412640105847025e-06, + "loss": 0.2641, + "step": 4933 + }, + { + "epoch": 2.3574457917687117, + "grad_norm": 0.48773893586445766, + "learning_rate": 2.4092104324030952e-06, + "loss": 0.2573, + "step": 4934 + }, + { + "epoch": 2.3579236604742846, + "grad_norm": 0.48133666696803284, + "learning_rate": 2.40578286451029e-06, + "loss": 0.2949, + "step": 4935 + }, + { + "epoch": 2.358401529179858, + "grad_norm": 0.4912715666573927, + "learning_rate": 2.4023574031193607e-06, + "loss": 0.2742, + "step": 4936 + }, + { + "epoch": 2.358879397885431, + "grad_norm": 0.4842008813394927, + "learning_rate": 2.398934049180468e-06, + "loss": 0.2797, + "step": 4937 + }, + { + "epoch": 2.359357266591004, + "grad_norm": 0.48439100068631125, + "learning_rate": 2.395512803643186e-06, + "loss": 0.265, + "step": 4938 + }, + { + "epoch": 2.3598351352965774, + "grad_norm": 0.47578624590536817, + "learning_rate": 2.3920936674565155e-06, + "loss": 0.2881, + "step": 4939 + }, + { + "epoch": 2.3603130040021503, + "grad_norm": 0.4437343615163969, + "learning_rate": 2.38867664156886e-06, + "loss": 0.2797, + "step": 4940 + }, + { + "epoch": 2.3607908727077236, + "grad_norm": 0.46566862296582356, + "learning_rate": 2.38526172692804e-06, + "loss": 0.2719, + "step": 4941 + }, + { + "epoch": 2.361268741413297, + "grad_norm": 0.5051001842897216, + "learning_rate": 2.381848924481297e-06, + "loss": 0.2664, + "step": 4942 + }, + { + "epoch": 2.3617466101188698, + "grad_norm": 0.4806571977298347, + "learning_rate": 2.378438235175281e-06, + "loss": 0.2777, + "step": 4943 + }, + { + "epoch": 2.362224478824443, + "grad_norm": 0.4537705945452327, + "learning_rate": 2.375029659956054e-06, + "loss": 0.2692, + "step": 4944 + }, + { + "epoch": 2.362702347530016, + "grad_norm": 0.4538521904099215, + "learning_rate": 2.3716231997691007e-06, + "loss": 0.2653, + "step": 4945 + }, + { + "epoch": 2.3631802162355893, + "grad_norm": 0.44790702234683916, + "learning_rate": 2.368218855559309e-06, + "loss": 0.2547, + "step": 4946 + }, + { + "epoch": 2.3636580849411626, + "grad_norm": 0.4411839227865902, + "learning_rate": 2.3648166282709806e-06, + "loss": 0.2611, + "step": 4947 + }, + { + "epoch": 2.3641359536467355, + "grad_norm": 0.4512659481124571, + "learning_rate": 2.361416518847841e-06, + "loss": 0.2614, + "step": 4948 + }, + { + "epoch": 2.3646138223523088, + "grad_norm": 0.45377164683979093, + "learning_rate": 2.3580185282330137e-06, + "loss": 0.2686, + "step": 4949 + }, + { + "epoch": 2.3650916910578816, + "grad_norm": 0.4512279362886925, + "learning_rate": 2.3546226573690444e-06, + "loss": 0.2661, + "step": 4950 + }, + { + "epoch": 2.365569559763455, + "grad_norm": 0.4907151137339124, + "learning_rate": 2.351228907197891e-06, + "loss": 0.2725, + "step": 4951 + }, + { + "epoch": 2.3660474284690283, + "grad_norm": 0.461594964603084, + "learning_rate": 2.347837278660917e-06, + "loss": 0.274, + "step": 4952 + }, + { + "epoch": 2.366525297174601, + "grad_norm": 0.49276481493447555, + "learning_rate": 2.3444477726988966e-06, + "loss": 0.256, + "step": 4953 + }, + { + "epoch": 2.3670031658801745, + "grad_norm": 0.5271634745570248, + "learning_rate": 2.3410603902520245e-06, + "loss": 0.2734, + "step": 4954 + }, + { + "epoch": 2.3674810345857473, + "grad_norm": 0.5046047617256642, + "learning_rate": 2.3376751322599e-06, + "loss": 0.2665, + "step": 4955 + }, + { + "epoch": 2.3679589032913206, + "grad_norm": 0.43935104834974015, + "learning_rate": 2.33429199966153e-06, + "loss": 0.2542, + "step": 4956 + }, + { + "epoch": 2.368436771996894, + "grad_norm": 0.47827957926642184, + "learning_rate": 2.330910993395341e-06, + "loss": 0.2558, + "step": 4957 + }, + { + "epoch": 2.368914640702467, + "grad_norm": 0.46077109730693455, + "learning_rate": 2.3275321143991613e-06, + "loss": 0.2665, + "step": 4958 + }, + { + "epoch": 2.36939250940804, + "grad_norm": 0.4620740363224271, + "learning_rate": 2.324155363610231e-06, + "loss": 0.281, + "step": 4959 + }, + { + "epoch": 2.3698703781136135, + "grad_norm": 0.4743650364517856, + "learning_rate": 2.320780741965206e-06, + "loss": 0.2514, + "step": 4960 + }, + { + "epoch": 2.3703482468191863, + "grad_norm": 0.48881821904437056, + "learning_rate": 2.317408250400144e-06, + "loss": 0.2916, + "step": 4961 + }, + { + "epoch": 2.3708261155247596, + "grad_norm": 0.44779357419192, + "learning_rate": 2.3140378898505125e-06, + "loss": 0.2567, + "step": 4962 + }, + { + "epoch": 2.371303984230333, + "grad_norm": 0.4571272492508665, + "learning_rate": 2.3106696612511937e-06, + "loss": 0.2689, + "step": 4963 + }, + { + "epoch": 2.371781852935906, + "grad_norm": 0.4680515782108428, + "learning_rate": 2.307303565536474e-06, + "loss": 0.2632, + "step": 4964 + }, + { + "epoch": 2.372259721641479, + "grad_norm": 0.50999540376331, + "learning_rate": 2.3039396036400463e-06, + "loss": 0.2699, + "step": 4965 + }, + { + "epoch": 2.372737590347052, + "grad_norm": 0.45711754864038767, + "learning_rate": 2.3005777764950187e-06, + "loss": 0.2672, + "step": 4966 + }, + { + "epoch": 2.3732154590526253, + "grad_norm": 0.46541233157527173, + "learning_rate": 2.2972180850339e-06, + "loss": 0.2772, + "step": 4967 + }, + { + "epoch": 2.3736933277581986, + "grad_norm": 0.45372335105255907, + "learning_rate": 2.2938605301886075e-06, + "loss": 0.245, + "step": 4968 + }, + { + "epoch": 2.3741711964637715, + "grad_norm": 0.49762996831577216, + "learning_rate": 2.2905051128904733e-06, + "loss": 0.2846, + "step": 4969 + }, + { + "epoch": 2.374649065169345, + "grad_norm": 0.45309610239893056, + "learning_rate": 2.287151834070226e-06, + "loss": 0.2453, + "step": 4970 + }, + { + "epoch": 2.3751269338749177, + "grad_norm": 0.45646843129089465, + "learning_rate": 2.283800694658006e-06, + "loss": 0.2523, + "step": 4971 + }, + { + "epoch": 2.375604802580491, + "grad_norm": 0.45497585076759617, + "learning_rate": 2.2804516955833645e-06, + "loss": 0.2636, + "step": 4972 + }, + { + "epoch": 2.3760826712860643, + "grad_norm": 0.4698293678470528, + "learning_rate": 2.2771048377752527e-06, + "loss": 0.2792, + "step": 4973 + }, + { + "epoch": 2.376560539991637, + "grad_norm": 0.4481308379841722, + "learning_rate": 2.2737601221620252e-06, + "loss": 0.2734, + "step": 4974 + }, + { + "epoch": 2.3770384086972105, + "grad_norm": 0.47464422001892004, + "learning_rate": 2.2704175496714552e-06, + "loss": 0.2653, + "step": 4975 + }, + { + "epoch": 2.3775162774027834, + "grad_norm": 0.48442075929966344, + "learning_rate": 2.2670771212307087e-06, + "loss": 0.2664, + "step": 4976 + }, + { + "epoch": 2.3779941461083567, + "grad_norm": 0.46567581192533997, + "learning_rate": 2.2637388377663605e-06, + "loss": 0.2569, + "step": 4977 + }, + { + "epoch": 2.37847201481393, + "grad_norm": 0.4640997153073419, + "learning_rate": 2.260402700204395e-06, + "loss": 0.2691, + "step": 4978 + }, + { + "epoch": 2.378949883519503, + "grad_norm": 0.4635972785067223, + "learning_rate": 2.257068709470197e-06, + "loss": 0.2618, + "step": 4979 + }, + { + "epoch": 2.379427752225076, + "grad_norm": 0.4773585828087478, + "learning_rate": 2.2537368664885527e-06, + "loss": 0.2754, + "step": 4980 + }, + { + "epoch": 2.379905620930649, + "grad_norm": 0.439472956742948, + "learning_rate": 2.250407172183664e-06, + "loss": 0.2664, + "step": 4981 + }, + { + "epoch": 2.3803834896362224, + "grad_norm": 0.4558716330221375, + "learning_rate": 2.247079627479124e-06, + "loss": 0.2683, + "step": 4982 + }, + { + "epoch": 2.3808613583417957, + "grad_norm": 0.4608097977312484, + "learning_rate": 2.2437542332979336e-06, + "loss": 0.2835, + "step": 4983 + }, + { + "epoch": 2.3813392270473686, + "grad_norm": 0.4380401331798129, + "learning_rate": 2.240430990562501e-06, + "loss": 0.2919, + "step": 4984 + }, + { + "epoch": 2.381817095752942, + "grad_norm": 0.5895395029494848, + "learning_rate": 2.2371099001946385e-06, + "loss": 0.2624, + "step": 4985 + }, + { + "epoch": 2.382294964458515, + "grad_norm": 0.46508326131778577, + "learning_rate": 2.233790963115554e-06, + "loss": 0.2504, + "step": 4986 + }, + { + "epoch": 2.382772833164088, + "grad_norm": 0.4477530552158064, + "learning_rate": 2.2304741802458606e-06, + "loss": 0.2661, + "step": 4987 + }, + { + "epoch": 2.3832507018696614, + "grad_norm": 0.45202080179265264, + "learning_rate": 2.22715955250558e-06, + "loss": 0.2801, + "step": 4988 + }, + { + "epoch": 2.3837285705752347, + "grad_norm": 0.46022019801647834, + "learning_rate": 2.223847080814129e-06, + "loss": 0.2522, + "step": 4989 + }, + { + "epoch": 2.3842064392808076, + "grad_norm": 0.46337347162678094, + "learning_rate": 2.2205367660903267e-06, + "loss": 0.2622, + "step": 4990 + }, + { + "epoch": 2.384684307986381, + "grad_norm": 0.532373751607701, + "learning_rate": 2.2172286092523998e-06, + "loss": 0.253, + "step": 4991 + }, + { + "epoch": 2.3851621766919537, + "grad_norm": 0.44622554326728614, + "learning_rate": 2.2139226112179713e-06, + "loss": 0.2566, + "step": 4992 + }, + { + "epoch": 2.385640045397527, + "grad_norm": 0.4538870200700637, + "learning_rate": 2.210618772904064e-06, + "loss": 0.2646, + "step": 4993 + }, + { + "epoch": 2.3861179141031004, + "grad_norm": 0.4585327419151689, + "learning_rate": 2.2073170952271085e-06, + "loss": 0.2623, + "step": 4994 + }, + { + "epoch": 2.3865957828086732, + "grad_norm": 0.46815712443376617, + "learning_rate": 2.2040175791029305e-06, + "loss": 0.2818, + "step": 4995 + }, + { + "epoch": 2.3870736515142466, + "grad_norm": 0.5231070665972924, + "learning_rate": 2.200720225446755e-06, + "loss": 0.2722, + "step": 4996 + }, + { + "epoch": 2.3875515202198194, + "grad_norm": 0.654443625332581, + "learning_rate": 2.197425035173215e-06, + "loss": 0.2563, + "step": 4997 + }, + { + "epoch": 2.3880293889253927, + "grad_norm": 0.45803873330264716, + "learning_rate": 2.194132009196335e-06, + "loss": 0.2952, + "step": 4998 + }, + { + "epoch": 2.388507257630966, + "grad_norm": 0.4521097635127778, + "learning_rate": 2.19084114842954e-06, + "loss": 0.265, + "step": 4999 + }, + { + "epoch": 2.388985126336539, + "grad_norm": 0.4544732994444673, + "learning_rate": 2.187552453785662e-06, + "loss": 0.2745, + "step": 5000 + }, + { + "epoch": 2.3894629950421122, + "grad_norm": 0.4793264716313783, + "learning_rate": 2.1842659261769226e-06, + "loss": 0.262, + "step": 5001 + }, + { + "epoch": 2.389940863747685, + "grad_norm": 0.448019751252878, + "learning_rate": 2.180981566514947e-06, + "loss": 0.2688, + "step": 5002 + }, + { + "epoch": 2.3904187324532584, + "grad_norm": 0.48536454285303543, + "learning_rate": 2.177699375710762e-06, + "loss": 0.2623, + "step": 5003 + }, + { + "epoch": 2.3908966011588317, + "grad_norm": 0.6915355830232118, + "learning_rate": 2.174419354674787e-06, + "loss": 0.2697, + "step": 5004 + }, + { + "epoch": 2.3913744698644046, + "grad_norm": 0.47423927727053033, + "learning_rate": 2.1711415043168395e-06, + "loss": 0.2806, + "step": 5005 + }, + { + "epoch": 2.391852338569978, + "grad_norm": 0.4699229189597673, + "learning_rate": 2.1678658255461427e-06, + "loss": 0.2764, + "step": 5006 + }, + { + "epoch": 2.3923302072755512, + "grad_norm": 0.4366728988564249, + "learning_rate": 2.164592319271309e-06, + "loss": 0.2626, + "step": 5007 + }, + { + "epoch": 2.392808075981124, + "grad_norm": 0.4680895354927544, + "learning_rate": 2.16132098640035e-06, + "loss": 0.2856, + "step": 5008 + }, + { + "epoch": 2.3932859446866974, + "grad_norm": 0.4831506206251398, + "learning_rate": 2.1580518278406793e-06, + "loss": 0.2588, + "step": 5009 + }, + { + "epoch": 2.3937638133922703, + "grad_norm": 0.4699558577692185, + "learning_rate": 2.1547848444991025e-06, + "loss": 0.2701, + "step": 5010 + }, + { + "epoch": 2.3942416820978436, + "grad_norm": 0.4489222989978539, + "learning_rate": 2.15152003728182e-06, + "loss": 0.2834, + "step": 5011 + }, + { + "epoch": 2.394719550803417, + "grad_norm": 0.47665521913868575, + "learning_rate": 2.148257407094436e-06, + "loss": 0.2583, + "step": 5012 + }, + { + "epoch": 2.39519741950899, + "grad_norm": 0.45743260638815375, + "learning_rate": 2.1449969548419456e-06, + "loss": 0.2677, + "step": 5013 + }, + { + "epoch": 2.395675288214563, + "grad_norm": 0.45342448226517174, + "learning_rate": 2.141738681428738e-06, + "loss": 0.2653, + "step": 5014 + }, + { + "epoch": 2.3961531569201364, + "grad_norm": 0.44238225246261725, + "learning_rate": 2.138482587758605e-06, + "loss": 0.2823, + "step": 5015 + }, + { + "epoch": 2.3966310256257093, + "grad_norm": 0.4457574279651664, + "learning_rate": 2.1352286747347273e-06, + "loss": 0.257, + "step": 5016 + }, + { + "epoch": 2.3971088943312826, + "grad_norm": 0.4702199309191923, + "learning_rate": 2.1319769432596804e-06, + "loss": 0.276, + "step": 5017 + }, + { + "epoch": 2.3975867630368555, + "grad_norm": 0.48532027343768513, + "learning_rate": 2.1287273942354393e-06, + "loss": 0.2615, + "step": 5018 + }, + { + "epoch": 2.398064631742429, + "grad_norm": 0.4532406315784221, + "learning_rate": 2.125480028563376e-06, + "loss": 0.2611, + "step": 5019 + }, + { + "epoch": 2.398542500448002, + "grad_norm": 0.43656492650267525, + "learning_rate": 2.1222348471442477e-06, + "loss": 0.2607, + "step": 5020 + }, + { + "epoch": 2.399020369153575, + "grad_norm": 0.4437139617252678, + "learning_rate": 2.118991850878209e-06, + "loss": 0.2634, + "step": 5021 + }, + { + "epoch": 2.3994982378591483, + "grad_norm": 0.4607343235133938, + "learning_rate": 2.115751040664815e-06, + "loss": 0.2617, + "step": 5022 + }, + { + "epoch": 2.399976106564721, + "grad_norm": 0.4509401894862981, + "learning_rate": 2.1125124174030066e-06, + "loss": 0.2649, + "step": 5023 + }, + { + "epoch": 2.4004539752702945, + "grad_norm": 0.4780514985721637, + "learning_rate": 2.1092759819911178e-06, + "loss": 0.2587, + "step": 5024 + }, + { + "epoch": 2.400931843975868, + "grad_norm": 0.47293976576430685, + "learning_rate": 2.1060417353268845e-06, + "loss": 0.2605, + "step": 5025 + }, + { + "epoch": 2.4014097126814407, + "grad_norm": 0.4632747242085837, + "learning_rate": 2.102809678307427e-06, + "loss": 0.2701, + "step": 5026 + }, + { + "epoch": 2.401887581387014, + "grad_norm": 0.4519610118896511, + "learning_rate": 2.0995798118292574e-06, + "loss": 0.274, + "step": 5027 + }, + { + "epoch": 2.402365450092587, + "grad_norm": 0.49007996568127915, + "learning_rate": 2.09635213678829e-06, + "loss": 0.275, + "step": 5028 + }, + { + "epoch": 2.40284331879816, + "grad_norm": 0.4482075654858903, + "learning_rate": 2.093126654079822e-06, + "loss": 0.2724, + "step": 5029 + }, + { + "epoch": 2.4033211875037335, + "grad_norm": 0.46175751639190643, + "learning_rate": 2.0899033645985423e-06, + "loss": 0.2734, + "step": 5030 + }, + { + "epoch": 2.4037990562093063, + "grad_norm": 0.4526383312411215, + "learning_rate": 2.0866822692385404e-06, + "loss": 0.2845, + "step": 5031 + }, + { + "epoch": 2.4042769249148797, + "grad_norm": 0.4713547161425226, + "learning_rate": 2.083463368893289e-06, + "loss": 0.2838, + "step": 5032 + }, + { + "epoch": 2.404754793620453, + "grad_norm": 0.5388592684225536, + "learning_rate": 2.0802466644556507e-06, + "loss": 0.2581, + "step": 5033 + }, + { + "epoch": 2.405232662326026, + "grad_norm": 0.44781970240862895, + "learning_rate": 2.0770321568178873e-06, + "loss": 0.2552, + "step": 5034 + }, + { + "epoch": 2.405710531031599, + "grad_norm": 0.45727753815499, + "learning_rate": 2.073819846871646e-06, + "loss": 0.2784, + "step": 5035 + }, + { + "epoch": 2.406188399737172, + "grad_norm": 0.4523294487636907, + "learning_rate": 2.0706097355079614e-06, + "loss": 0.2548, + "step": 5036 + }, + { + "epoch": 2.4066662684427453, + "grad_norm": 0.4979869120677124, + "learning_rate": 2.0674018236172654e-06, + "loss": 0.262, + "step": 5037 + }, + { + "epoch": 2.4071441371483187, + "grad_norm": 0.4670045351404005, + "learning_rate": 2.064196112089376e-06, + "loss": 0.2836, + "step": 5038 + }, + { + "epoch": 2.4076220058538915, + "grad_norm": 0.44302766764113355, + "learning_rate": 2.0609926018134972e-06, + "loss": 0.2544, + "step": 5039 + }, + { + "epoch": 2.408099874559465, + "grad_norm": 0.4466796302647838, + "learning_rate": 2.0577912936782317e-06, + "loss": 0.2531, + "step": 5040 + }, + { + "epoch": 2.408577743265038, + "grad_norm": 0.4662441229505657, + "learning_rate": 2.0545921885715624e-06, + "loss": 0.2634, + "step": 5041 + }, + { + "epoch": 2.409055611970611, + "grad_norm": 0.4621440744087701, + "learning_rate": 2.051395287380864e-06, + "loss": 0.2711, + "step": 5042 + }, + { + "epoch": 2.4095334806761843, + "grad_norm": 0.4910784013605513, + "learning_rate": 2.048200590992904e-06, + "loss": 0.2564, + "step": 5043 + }, + { + "epoch": 2.410011349381757, + "grad_norm": 0.4768608798682245, + "learning_rate": 2.0450081002938316e-06, + "loss": 0.2643, + "step": 5044 + }, + { + "epoch": 2.4104892180873305, + "grad_norm": 0.45467660225646006, + "learning_rate": 2.041817816169187e-06, + "loss": 0.2567, + "step": 5045 + }, + { + "epoch": 2.410967086792904, + "grad_norm": 0.4570097506050878, + "learning_rate": 2.0386297395039023e-06, + "loss": 0.2681, + "step": 5046 + }, + { + "epoch": 2.4114449554984767, + "grad_norm": 0.4718064608424706, + "learning_rate": 2.035443871182292e-06, + "loss": 0.2523, + "step": 5047 + }, + { + "epoch": 2.41192282420405, + "grad_norm": 0.510560238857416, + "learning_rate": 2.0322602120880576e-06, + "loss": 0.2596, + "step": 5048 + }, + { + "epoch": 2.412400692909623, + "grad_norm": 0.45874298526457225, + "learning_rate": 2.0290787631042942e-06, + "loss": 0.2654, + "step": 5049 + }, + { + "epoch": 2.412878561615196, + "grad_norm": 0.4702898124625645, + "learning_rate": 2.025899525113474e-06, + "loss": 0.2607, + "step": 5050 + }, + { + "epoch": 2.4133564303207695, + "grad_norm": 0.4419385668349205, + "learning_rate": 2.022722498997465e-06, + "loss": 0.28, + "step": 5051 + }, + { + "epoch": 2.4138342990263424, + "grad_norm": 0.49832278835296895, + "learning_rate": 2.0195476856375206e-06, + "loss": 0.2722, + "step": 5052 + }, + { + "epoch": 2.4143121677319157, + "grad_norm": 0.4453378413752286, + "learning_rate": 2.016375085914275e-06, + "loss": 0.2631, + "step": 5053 + }, + { + "epoch": 2.4147900364374886, + "grad_norm": 0.4464873807438464, + "learning_rate": 2.0132047007077504e-06, + "loss": 0.2617, + "step": 5054 + }, + { + "epoch": 2.415267905143062, + "grad_norm": 0.43769077224069114, + "learning_rate": 2.010036530897359e-06, + "loss": 0.2523, + "step": 5055 + }, + { + "epoch": 2.415745773848635, + "grad_norm": 0.44478509964099755, + "learning_rate": 2.0068705773618937e-06, + "loss": 0.2494, + "step": 5056 + }, + { + "epoch": 2.416223642554208, + "grad_norm": 0.4558953495461828, + "learning_rate": 2.003706840979531e-06, + "loss": 0.2716, + "step": 5057 + }, + { + "epoch": 2.4167015112597814, + "grad_norm": 0.7657464247162521, + "learning_rate": 2.0005453226278403e-06, + "loss": 0.2683, + "step": 5058 + }, + { + "epoch": 2.4171793799653547, + "grad_norm": 0.4719546599335128, + "learning_rate": 1.9973860231837705e-06, + "loss": 0.256, + "step": 5059 + }, + { + "epoch": 2.4176572486709276, + "grad_norm": 0.46006978419118905, + "learning_rate": 1.9942289435236506e-06, + "loss": 0.2772, + "step": 5060 + }, + { + "epoch": 2.418135117376501, + "grad_norm": 0.5094940178312064, + "learning_rate": 1.9910740845232058e-06, + "loss": 0.2713, + "step": 5061 + }, + { + "epoch": 2.4186129860820738, + "grad_norm": 0.46698646284000606, + "learning_rate": 1.9879214470575347e-06, + "loss": 0.274, + "step": 5062 + }, + { + "epoch": 2.419090854787647, + "grad_norm": 0.4589782752776786, + "learning_rate": 1.9847710320011206e-06, + "loss": 0.2499, + "step": 5063 + }, + { + "epoch": 2.4195687234932204, + "grad_norm": 0.469003199954616, + "learning_rate": 1.9816228402278392e-06, + "loss": 0.2681, + "step": 5064 + }, + { + "epoch": 2.4200465921987933, + "grad_norm": 0.4481176604472585, + "learning_rate": 1.978476872610939e-06, + "loss": 0.2577, + "step": 5065 + }, + { + "epoch": 2.4205244609043666, + "grad_norm": 0.4659618384134064, + "learning_rate": 1.975333130023056e-06, + "loss": 0.2647, + "step": 5066 + }, + { + "epoch": 2.42100232960994, + "grad_norm": 0.4828500059187709, + "learning_rate": 1.972191613336212e-06, + "loss": 0.2622, + "step": 5067 + }, + { + "epoch": 2.4214801983155128, + "grad_norm": 0.4551712653015207, + "learning_rate": 1.969052323421806e-06, + "loss": 0.2661, + "step": 5068 + }, + { + "epoch": 2.421958067021086, + "grad_norm": 0.46637572776776237, + "learning_rate": 1.9659152611506193e-06, + "loss": 0.2722, + "step": 5069 + }, + { + "epoch": 2.422435935726659, + "grad_norm": 0.4486269710544213, + "learning_rate": 1.962780427392823e-06, + "loss": 0.2742, + "step": 5070 + }, + { + "epoch": 2.4229138044322323, + "grad_norm": 0.4481995215441604, + "learning_rate": 1.959647823017963e-06, + "loss": 0.2709, + "step": 5071 + }, + { + "epoch": 2.4233916731378056, + "grad_norm": 0.4637020847420594, + "learning_rate": 1.9565174488949636e-06, + "loss": 0.268, + "step": 5072 + }, + { + "epoch": 2.4238695418433784, + "grad_norm": 0.4522661575040067, + "learning_rate": 1.953389305892143e-06, + "loss": 0.2714, + "step": 5073 + }, + { + "epoch": 2.4243474105489518, + "grad_norm": 0.5188133426193942, + "learning_rate": 1.9502633948771888e-06, + "loss": 0.2682, + "step": 5074 + }, + { + "epoch": 2.4248252792545246, + "grad_norm": 0.4570248324569066, + "learning_rate": 1.9471397167171714e-06, + "loss": 0.2652, + "step": 5075 + }, + { + "epoch": 2.425303147960098, + "grad_norm": 0.459560178829993, + "learning_rate": 1.944018272278548e-06, + "loss": 0.2511, + "step": 5076 + }, + { + "epoch": 2.4257810166656713, + "grad_norm": 0.466238298627355, + "learning_rate": 1.9408990624271516e-06, + "loss": 0.2687, + "step": 5077 + }, + { + "epoch": 2.426258885371244, + "grad_norm": 0.49223929370166464, + "learning_rate": 1.9377820880281928e-06, + "loss": 0.2739, + "step": 5078 + }, + { + "epoch": 2.4267367540768174, + "grad_norm": 0.46613653698623686, + "learning_rate": 1.934667349946271e-06, + "loss": 0.2676, + "step": 5079 + }, + { + "epoch": 2.4272146227823903, + "grad_norm": 0.44196513176630936, + "learning_rate": 1.931554849045355e-06, + "loss": 0.2662, + "step": 5080 + }, + { + "epoch": 2.4276924914879636, + "grad_norm": 0.45591063669478454, + "learning_rate": 1.9284445861887966e-06, + "loss": 0.2799, + "step": 5081 + }, + { + "epoch": 2.428170360193537, + "grad_norm": 0.47960546577577334, + "learning_rate": 1.9253365622393337e-06, + "loss": 0.2612, + "step": 5082 + }, + { + "epoch": 2.42864822889911, + "grad_norm": 0.49874783389566263, + "learning_rate": 1.9222307780590734e-06, + "loss": 0.259, + "step": 5083 + }, + { + "epoch": 2.429126097604683, + "grad_norm": 0.48649273799607196, + "learning_rate": 1.9191272345095025e-06, + "loss": 0.2626, + "step": 5084 + }, + { + "epoch": 2.4296039663102564, + "grad_norm": 0.46265212095706404, + "learning_rate": 1.916025932451493e-06, + "loss": 0.2673, + "step": 5085 + }, + { + "epoch": 2.4300818350158293, + "grad_norm": 0.44902330400044727, + "learning_rate": 1.912926872745294e-06, + "loss": 0.2641, + "step": 5086 + }, + { + "epoch": 2.4305597037214026, + "grad_norm": 0.48237243869527174, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.2677, + "step": 5087 + }, + { + "epoch": 2.431037572426976, + "grad_norm": 0.47922262239457364, + "learning_rate": 1.9067354838261908e-06, + "loss": 0.239, + "step": 5088 + }, + { + "epoch": 2.431515441132549, + "grad_norm": 0.4484227748244973, + "learning_rate": 1.9036431563306723e-06, + "loss": 0.2676, + "step": 5089 + }, + { + "epoch": 2.431993309838122, + "grad_norm": 0.4481486490497434, + "learning_rate": 1.9005530746217238e-06, + "loss": 0.2612, + "step": 5090 + }, + { + "epoch": 2.432471178543695, + "grad_norm": 0.4751868893945265, + "learning_rate": 1.8974652395564785e-06, + "loss": 0.2658, + "step": 5091 + }, + { + "epoch": 2.4329490472492683, + "grad_norm": 0.4910217856403449, + "learning_rate": 1.8943796519914525e-06, + "loss": 0.2632, + "step": 5092 + }, + { + "epoch": 2.4334269159548416, + "grad_norm": 0.4671224266607131, + "learning_rate": 1.89129631278253e-06, + "loss": 0.2804, + "step": 5093 + }, + { + "epoch": 2.4339047846604145, + "grad_norm": 0.5193462465411075, + "learning_rate": 1.8882152227849727e-06, + "loss": 0.2845, + "step": 5094 + }, + { + "epoch": 2.434382653365988, + "grad_norm": 0.5153634268527411, + "learning_rate": 1.8851363828534253e-06, + "loss": 0.263, + "step": 5095 + }, + { + "epoch": 2.4348605220715607, + "grad_norm": 0.45007658573232884, + "learning_rate": 1.8820597938419028e-06, + "loss": 0.2653, + "step": 5096 + }, + { + "epoch": 2.435338390777134, + "grad_norm": 0.4708101712892127, + "learning_rate": 1.8789854566037912e-06, + "loss": 0.2681, + "step": 5097 + }, + { + "epoch": 2.4358162594827073, + "grad_norm": 0.466359009946716, + "learning_rate": 1.8759133719918654e-06, + "loss": 0.2683, + "step": 5098 + }, + { + "epoch": 2.43629412818828, + "grad_norm": 0.5260130493552598, + "learning_rate": 1.8728435408582634e-06, + "loss": 0.2664, + "step": 5099 + }, + { + "epoch": 2.4367719968938535, + "grad_norm": 0.4471972729113941, + "learning_rate": 1.869775964054501e-06, + "loss": 0.2667, + "step": 5100 + }, + { + "epoch": 2.4372498655994264, + "grad_norm": 0.4605895897583309, + "learning_rate": 1.866710642431473e-06, + "loss": 0.2794, + "step": 5101 + }, + { + "epoch": 2.4377277343049997, + "grad_norm": 0.49147254180946887, + "learning_rate": 1.8636475768394446e-06, + "loss": 0.2812, + "step": 5102 + }, + { + "epoch": 2.438205603010573, + "grad_norm": 0.44497897456741325, + "learning_rate": 1.860586768128052e-06, + "loss": 0.2679, + "step": 5103 + }, + { + "epoch": 2.438683471716146, + "grad_norm": 0.45664419543781765, + "learning_rate": 1.857528217146317e-06, + "loss": 0.2631, + "step": 5104 + }, + { + "epoch": 2.439161340421719, + "grad_norm": 0.463599939436042, + "learning_rate": 1.8544719247426224e-06, + "loss": 0.2664, + "step": 5105 + }, + { + "epoch": 2.439639209127292, + "grad_norm": 0.5020393530396908, + "learning_rate": 1.8514178917647297e-06, + "loss": 0.2751, + "step": 5106 + }, + { + "epoch": 2.4401170778328654, + "grad_norm": 0.6334592929322761, + "learning_rate": 1.8483661190597778e-06, + "loss": 0.273, + "step": 5107 + }, + { + "epoch": 2.4405949465384387, + "grad_norm": 0.44949715968087817, + "learning_rate": 1.8453166074742723e-06, + "loss": 0.2626, + "step": 5108 + }, + { + "epoch": 2.4410728152440115, + "grad_norm": 0.4601875206689804, + "learning_rate": 1.8422693578540907e-06, + "loss": 0.2524, + "step": 5109 + }, + { + "epoch": 2.441550683949585, + "grad_norm": 0.4432035611183387, + "learning_rate": 1.8392243710444911e-06, + "loss": 0.2568, + "step": 5110 + }, + { + "epoch": 2.442028552655158, + "grad_norm": 0.4420828252613538, + "learning_rate": 1.8361816478900986e-06, + "loss": 0.2687, + "step": 5111 + }, + { + "epoch": 2.442506421360731, + "grad_norm": 0.4698539634957639, + "learning_rate": 1.833141189234907e-06, + "loss": 0.2556, + "step": 5112 + }, + { + "epoch": 2.4429842900663044, + "grad_norm": 0.4454077363137819, + "learning_rate": 1.8301029959222916e-06, + "loss": 0.2644, + "step": 5113 + }, + { + "epoch": 2.4434621587718777, + "grad_norm": 0.4694638175516713, + "learning_rate": 1.8270670687949898e-06, + "loss": 0.2725, + "step": 5114 + }, + { + "epoch": 2.4439400274774505, + "grad_norm": 0.4608350257915752, + "learning_rate": 1.8240334086951117e-06, + "loss": 0.2696, + "step": 5115 + }, + { + "epoch": 2.444417896183024, + "grad_norm": 0.44781409506792363, + "learning_rate": 1.8210020164641483e-06, + "loss": 0.2716, + "step": 5116 + }, + { + "epoch": 2.4448957648885967, + "grad_norm": 0.4969053175588786, + "learning_rate": 1.8179728929429507e-06, + "loss": 0.2643, + "step": 5117 + }, + { + "epoch": 2.44537363359417, + "grad_norm": 0.4559222255934472, + "learning_rate": 1.814946038971741e-06, + "loss": 0.2713, + "step": 5118 + }, + { + "epoch": 2.4458515022997434, + "grad_norm": 0.44970009381193676, + "learning_rate": 1.8119214553901177e-06, + "loss": 0.2622, + "step": 5119 + }, + { + "epoch": 2.4463293710053162, + "grad_norm": 0.4559369781065068, + "learning_rate": 1.8088991430370506e-06, + "loss": 0.2666, + "step": 5120 + }, + { + "epoch": 2.4468072397108895, + "grad_norm": 0.45234671729475956, + "learning_rate": 1.8058791027508726e-06, + "loss": 0.2686, + "step": 5121 + }, + { + "epoch": 2.4472851084164624, + "grad_norm": 0.44137882355941305, + "learning_rate": 1.802861335369287e-06, + "loss": 0.2779, + "step": 5122 + }, + { + "epoch": 2.4477629771220357, + "grad_norm": 0.48440328157222995, + "learning_rate": 1.799845841729375e-06, + "loss": 0.283, + "step": 5123 + }, + { + "epoch": 2.448240845827609, + "grad_norm": 0.494486208129345, + "learning_rate": 1.796832622667578e-06, + "loss": 0.2656, + "step": 5124 + }, + { + "epoch": 2.448718714533182, + "grad_norm": 0.47590323638368665, + "learning_rate": 1.7938216790197071e-06, + "loss": 0.2497, + "step": 5125 + }, + { + "epoch": 2.449196583238755, + "grad_norm": 0.449129199222899, + "learning_rate": 1.790813011620951e-06, + "loss": 0.2654, + "step": 5126 + }, + { + "epoch": 2.449674451944328, + "grad_norm": 0.4626004740804751, + "learning_rate": 1.7878066213058575e-06, + "loss": 0.2708, + "step": 5127 + }, + { + "epoch": 2.4501523206499014, + "grad_norm": 0.4702626534514859, + "learning_rate": 1.7848025089083442e-06, + "loss": 0.2782, + "step": 5128 + }, + { + "epoch": 2.4506301893554747, + "grad_norm": 0.44737255344314847, + "learning_rate": 1.7818006752617034e-06, + "loss": 0.2791, + "step": 5129 + }, + { + "epoch": 2.4511080580610476, + "grad_norm": 0.454394369273916, + "learning_rate": 1.77880112119859e-06, + "loss": 0.2579, + "step": 5130 + }, + { + "epoch": 2.451585926766621, + "grad_norm": 0.4418070881214252, + "learning_rate": 1.7758038475510232e-06, + "loss": 0.2703, + "step": 5131 + }, + { + "epoch": 2.4520637954721938, + "grad_norm": 0.4336894058264998, + "learning_rate": 1.7728088551503986e-06, + "loss": 0.2688, + "step": 5132 + }, + { + "epoch": 2.452541664177767, + "grad_norm": 0.46001029127787707, + "learning_rate": 1.769816144827472e-06, + "loss": 0.2532, + "step": 5133 + }, + { + "epoch": 2.4530195328833404, + "grad_norm": 0.4507732662463414, + "learning_rate": 1.7668257174123672e-06, + "loss": 0.2725, + "step": 5134 + }, + { + "epoch": 2.4534974015889133, + "grad_norm": 0.5054107742226862, + "learning_rate": 1.7638375737345804e-06, + "loss": 0.2612, + "step": 5135 + }, + { + "epoch": 2.4539752702944866, + "grad_norm": 0.5719525646703848, + "learning_rate": 1.7608517146229677e-06, + "loss": 0.2622, + "step": 5136 + }, + { + "epoch": 2.45445313900006, + "grad_norm": 0.45263029420038425, + "learning_rate": 1.7578681409057497e-06, + "loss": 0.2571, + "step": 5137 + }, + { + "epoch": 2.4549310077056328, + "grad_norm": 0.4469493461520194, + "learning_rate": 1.7548868534105234e-06, + "loss": 0.2795, + "step": 5138 + }, + { + "epoch": 2.455408876411206, + "grad_norm": 0.47460938094773264, + "learning_rate": 1.751907852964243e-06, + "loss": 0.2535, + "step": 5139 + }, + { + "epoch": 2.4558867451167794, + "grad_norm": 0.43572000742600314, + "learning_rate": 1.7489311403932274e-06, + "loss": 0.2616, + "step": 5140 + }, + { + "epoch": 2.4563646138223523, + "grad_norm": 0.49154307546187714, + "learning_rate": 1.7459567165231695e-06, + "loss": 0.2679, + "step": 5141 + }, + { + "epoch": 2.4568424825279256, + "grad_norm": 0.43369075353473246, + "learning_rate": 1.7429845821791202e-06, + "loss": 0.2641, + "step": 5142 + }, + { + "epoch": 2.4573203512334985, + "grad_norm": 0.44569959028444417, + "learning_rate": 1.7400147381854936e-06, + "loss": 0.2556, + "step": 5143 + }, + { + "epoch": 2.4577982199390718, + "grad_norm": 0.44301724734443204, + "learning_rate": 1.7370471853660775e-06, + "loss": 0.2594, + "step": 5144 + }, + { + "epoch": 2.458276088644645, + "grad_norm": 0.4331333804341633, + "learning_rate": 1.7340819245440166e-06, + "loss": 0.2698, + "step": 5145 + }, + { + "epoch": 2.458753957350218, + "grad_norm": 0.44930485499724493, + "learning_rate": 1.7311189565418186e-06, + "loss": 0.2694, + "step": 5146 + }, + { + "epoch": 2.4592318260557913, + "grad_norm": 0.4431936634915009, + "learning_rate": 1.7281582821813647e-06, + "loss": 0.2539, + "step": 5147 + }, + { + "epoch": 2.459709694761364, + "grad_norm": 0.4801540861517632, + "learning_rate": 1.7251999022838895e-06, + "loss": 0.26, + "step": 5148 + }, + { + "epoch": 2.4601875634669375, + "grad_norm": 0.44046656799275724, + "learning_rate": 1.722243817669994e-06, + "loss": 0.2602, + "step": 5149 + }, + { + "epoch": 2.4606654321725108, + "grad_norm": 0.4569712842143844, + "learning_rate": 1.7192900291596493e-06, + "loss": 0.2677, + "step": 5150 + }, + { + "epoch": 2.4611433008780836, + "grad_norm": 0.5414744354026383, + "learning_rate": 1.7163385375721819e-06, + "loss": 0.2657, + "step": 5151 + }, + { + "epoch": 2.461621169583657, + "grad_norm": 0.5030301613387165, + "learning_rate": 1.7133893437262771e-06, + "loss": 0.2694, + "step": 5152 + }, + { + "epoch": 2.46209903828923, + "grad_norm": 0.46022742529581706, + "learning_rate": 1.7104424484400006e-06, + "loss": 0.2657, + "step": 5153 + }, + { + "epoch": 2.462576906994803, + "grad_norm": 0.4480168925969945, + "learning_rate": 1.707497852530763e-06, + "loss": 0.2678, + "step": 5154 + }, + { + "epoch": 2.4630547757003765, + "grad_norm": 0.45217430934005776, + "learning_rate": 1.7045555568153415e-06, + "loss": 0.2573, + "step": 5155 + }, + { + "epoch": 2.4635326444059493, + "grad_norm": 0.4771262260653937, + "learning_rate": 1.7016155621098818e-06, + "loss": 0.2663, + "step": 5156 + }, + { + "epoch": 2.4640105131115226, + "grad_norm": 0.43652614153589214, + "learning_rate": 1.6986778692298843e-06, + "loss": 0.257, + "step": 5157 + }, + { + "epoch": 2.4644883818170955, + "grad_norm": 0.45508857688246773, + "learning_rate": 1.695742478990211e-06, + "loss": 0.2625, + "step": 5158 + }, + { + "epoch": 2.464966250522669, + "grad_norm": 0.44398221371433727, + "learning_rate": 1.6928093922050913e-06, + "loss": 0.259, + "step": 5159 + }, + { + "epoch": 2.465444119228242, + "grad_norm": 0.46877325097045486, + "learning_rate": 1.6898786096881104e-06, + "loss": 0.2515, + "step": 5160 + }, + { + "epoch": 2.465921987933815, + "grad_norm": 0.47517976840848003, + "learning_rate": 1.686950132252213e-06, + "loss": 0.2668, + "step": 5161 + }, + { + "epoch": 2.4663998566393883, + "grad_norm": 0.47013178542557804, + "learning_rate": 1.6840239607097109e-06, + "loss": 0.273, + "step": 5162 + }, + { + "epoch": 2.4668777253449616, + "grad_norm": 0.6731321265086605, + "learning_rate": 1.6811000958722713e-06, + "loss": 0.2653, + "step": 5163 + }, + { + "epoch": 2.4673555940505345, + "grad_norm": 0.500612444710717, + "learning_rate": 1.6781785385509197e-06, + "loss": 0.2553, + "step": 5164 + }, + { + "epoch": 2.467833462756108, + "grad_norm": 0.44968808398968174, + "learning_rate": 1.6752592895560493e-06, + "loss": 0.2651, + "step": 5165 + }, + { + "epoch": 2.468311331461681, + "grad_norm": 0.4625732004203681, + "learning_rate": 1.6723423496974057e-06, + "loss": 0.2808, + "step": 5166 + }, + { + "epoch": 2.468789200167254, + "grad_norm": 0.4489275940956645, + "learning_rate": 1.6694277197840947e-06, + "loss": 0.2628, + "step": 5167 + }, + { + "epoch": 2.4692670688728273, + "grad_norm": 0.535652789404287, + "learning_rate": 1.6665154006245888e-06, + "loss": 0.272, + "step": 5168 + }, + { + "epoch": 2.4697449375784, + "grad_norm": 0.47206952227811455, + "learning_rate": 1.6636053930267093e-06, + "loss": 0.2618, + "step": 5169 + }, + { + "epoch": 2.4702228062839735, + "grad_norm": 0.4583292667257452, + "learning_rate": 1.6606976977976408e-06, + "loss": 0.2575, + "step": 5170 + }, + { + "epoch": 2.470700674989547, + "grad_norm": 0.4784423605188315, + "learning_rate": 1.6577923157439302e-06, + "loss": 0.2642, + "step": 5171 + }, + { + "epoch": 2.4711785436951197, + "grad_norm": 0.4666671362911929, + "learning_rate": 1.6548892476714772e-06, + "loss": 0.2663, + "step": 5172 + }, + { + "epoch": 2.471656412400693, + "grad_norm": 0.4821043795677321, + "learning_rate": 1.6519884943855391e-06, + "loss": 0.2718, + "step": 5173 + }, + { + "epoch": 2.472134281106266, + "grad_norm": 0.43488458866311225, + "learning_rate": 1.6490900566907396e-06, + "loss": 0.2441, + "step": 5174 + }, + { + "epoch": 2.472612149811839, + "grad_norm": 0.4670451983441338, + "learning_rate": 1.6461939353910494e-06, + "loss": 0.2845, + "step": 5175 + }, + { + "epoch": 2.4730900185174125, + "grad_norm": 0.45536226914155875, + "learning_rate": 1.643300131289801e-06, + "loss": 0.2494, + "step": 5176 + }, + { + "epoch": 2.4735678872229854, + "grad_norm": 0.4517133543283046, + "learning_rate": 1.6404086451896896e-06, + "loss": 0.2609, + "step": 5177 + }, + { + "epoch": 2.4740457559285587, + "grad_norm": 0.453282470130923, + "learning_rate": 1.6375194778927594e-06, + "loss": 0.274, + "step": 5178 + }, + { + "epoch": 2.4745236246341316, + "grad_norm": 0.4758223874514832, + "learning_rate": 1.6346326302004111e-06, + "loss": 0.2682, + "step": 5179 + }, + { + "epoch": 2.475001493339705, + "grad_norm": 0.4522472979675584, + "learning_rate": 1.631748102913412e-06, + "loss": 0.2838, + "step": 5180 + }, + { + "epoch": 2.475479362045278, + "grad_norm": 0.4389102055107512, + "learning_rate": 1.6288658968318748e-06, + "loss": 0.2756, + "step": 5181 + }, + { + "epoch": 2.475957230750851, + "grad_norm": 0.4561620443865254, + "learning_rate": 1.6259860127552718e-06, + "loss": 0.2704, + "step": 5182 + }, + { + "epoch": 2.4764350994564244, + "grad_norm": 0.46070242911962683, + "learning_rate": 1.6231084514824357e-06, + "loss": 0.248, + "step": 5183 + }, + { + "epoch": 2.4769129681619972, + "grad_norm": 0.4500952574278607, + "learning_rate": 1.6202332138115495e-06, + "loss": 0.2714, + "step": 5184 + }, + { + "epoch": 2.4773908368675706, + "grad_norm": 0.44423358778364996, + "learning_rate": 1.6173603005401505e-06, + "loss": 0.2694, + "step": 5185 + }, + { + "epoch": 2.477868705573144, + "grad_norm": 0.4463179849081536, + "learning_rate": 1.614489712465137e-06, + "loss": 0.27, + "step": 5186 + }, + { + "epoch": 2.4783465742787167, + "grad_norm": 0.4983501684463973, + "learning_rate": 1.6116214503827632e-06, + "loss": 0.2621, + "step": 5187 + }, + { + "epoch": 2.47882444298429, + "grad_norm": 0.443262010521369, + "learning_rate": 1.6087555150886291e-06, + "loss": 0.2722, + "step": 5188 + }, + { + "epoch": 2.4793023116898634, + "grad_norm": 0.4558884591228606, + "learning_rate": 1.6058919073776936e-06, + "loss": 0.2629, + "step": 5189 + }, + { + "epoch": 2.4797801803954362, + "grad_norm": 0.4495529234567257, + "learning_rate": 1.6030306280442764e-06, + "loss": 0.2516, + "step": 5190 + }, + { + "epoch": 2.4802580491010096, + "grad_norm": 0.4572305669305225, + "learning_rate": 1.6001716778820432e-06, + "loss": 0.2662, + "step": 5191 + }, + { + "epoch": 2.480735917806583, + "grad_norm": 0.4459974662375841, + "learning_rate": 1.5973150576840134e-06, + "loss": 0.2657, + "step": 5192 + }, + { + "epoch": 2.4812137865121557, + "grad_norm": 0.5110956161774068, + "learning_rate": 1.5944607682425684e-06, + "loss": 0.2771, + "step": 5193 + }, + { + "epoch": 2.481691655217729, + "grad_norm": 0.5018127615583312, + "learning_rate": 1.5916088103494353e-06, + "loss": 0.2706, + "step": 5194 + }, + { + "epoch": 2.482169523923302, + "grad_norm": 0.46765637570821916, + "learning_rate": 1.588759184795694e-06, + "loss": 0.2681, + "step": 5195 + }, + { + "epoch": 2.4826473926288752, + "grad_norm": 0.4455104728164743, + "learning_rate": 1.5859118923717853e-06, + "loss": 0.2543, + "step": 5196 + }, + { + "epoch": 2.4831252613344486, + "grad_norm": 0.4943599831926817, + "learning_rate": 1.5830669338674953e-06, + "loss": 0.2632, + "step": 5197 + }, + { + "epoch": 2.4836031300400214, + "grad_norm": 0.45460304300287496, + "learning_rate": 1.580224310071964e-06, + "loss": 0.24, + "step": 5198 + }, + { + "epoch": 2.4840809987455947, + "grad_norm": 0.4470853999975232, + "learning_rate": 1.577384021773689e-06, + "loss": 0.2559, + "step": 5199 + }, + { + "epoch": 2.4845588674511676, + "grad_norm": 0.4435833286013539, + "learning_rate": 1.574546069760514e-06, + "loss": 0.2511, + "step": 5200 + }, + { + "epoch": 2.485036736156741, + "grad_norm": 0.4342162740665016, + "learning_rate": 1.571710454819635e-06, + "loss": 0.2632, + "step": 5201 + }, + { + "epoch": 2.4855146048623142, + "grad_norm": 0.47609127386587247, + "learning_rate": 1.5688771777376044e-06, + "loss": 0.2887, + "step": 5202 + }, + { + "epoch": 2.485992473567887, + "grad_norm": 0.4763885822985041, + "learning_rate": 1.5660462393003228e-06, + "loss": 0.283, + "step": 5203 + }, + { + "epoch": 2.4864703422734604, + "grad_norm": 0.45641941404129543, + "learning_rate": 1.56321764029304e-06, + "loss": 0.2764, + "step": 5204 + }, + { + "epoch": 2.4869482109790333, + "grad_norm": 0.45134971688590414, + "learning_rate": 1.5603913815003634e-06, + "loss": 0.2781, + "step": 5205 + }, + { + "epoch": 2.4874260796846066, + "grad_norm": 0.48230919161187935, + "learning_rate": 1.5575674637062465e-06, + "loss": 0.2715, + "step": 5206 + }, + { + "epoch": 2.48790394839018, + "grad_norm": 0.4498323903981208, + "learning_rate": 1.5547458876939902e-06, + "loss": 0.2675, + "step": 5207 + }, + { + "epoch": 2.488381817095753, + "grad_norm": 0.5874324453501406, + "learning_rate": 1.5519266542462552e-06, + "loss": 0.2554, + "step": 5208 + }, + { + "epoch": 2.488859685801326, + "grad_norm": 0.4541224538296049, + "learning_rate": 1.5491097641450448e-06, + "loss": 0.2506, + "step": 5209 + }, + { + "epoch": 2.4893375545068994, + "grad_norm": 0.4646704460437421, + "learning_rate": 1.5462952181717117e-06, + "loss": 0.267, + "step": 5210 + }, + { + "epoch": 2.4898154232124723, + "grad_norm": 0.462874473285813, + "learning_rate": 1.543483017106967e-06, + "loss": 0.2682, + "step": 5211 + }, + { + "epoch": 2.4902932919180456, + "grad_norm": 0.4602131270975226, + "learning_rate": 1.5406731617308635e-06, + "loss": 0.2705, + "step": 5212 + }, + { + "epoch": 2.4907711606236185, + "grad_norm": 0.48190334430500326, + "learning_rate": 1.5378656528228032e-06, + "loss": 0.2515, + "step": 5213 + }, + { + "epoch": 2.491249029329192, + "grad_norm": 0.4591216279627935, + "learning_rate": 1.535060491161542e-06, + "loss": 0.2675, + "step": 5214 + }, + { + "epoch": 2.491726898034765, + "grad_norm": 0.4619766580382836, + "learning_rate": 1.532257677525183e-06, + "loss": 0.265, + "step": 5215 + }, + { + "epoch": 2.492204766740338, + "grad_norm": 0.46916882001283083, + "learning_rate": 1.5294572126911723e-06, + "loss": 0.2526, + "step": 5216 + }, + { + "epoch": 2.4926826354459113, + "grad_norm": 0.4622413318042114, + "learning_rate": 1.526659097436316e-06, + "loss": 0.2688, + "step": 5217 + }, + { + "epoch": 2.4931605041514846, + "grad_norm": 0.4593352758045286, + "learning_rate": 1.5238633325367592e-06, + "loss": 0.2721, + "step": 5218 + }, + { + "epoch": 2.4936383728570575, + "grad_norm": 0.4453905535036544, + "learning_rate": 1.5210699187679945e-06, + "loss": 0.2707, + "step": 5219 + }, + { + "epoch": 2.494116241562631, + "grad_norm": 0.4360154512554449, + "learning_rate": 1.5182788569048689e-06, + "loss": 0.2547, + "step": 5220 + }, + { + "epoch": 2.4945941102682037, + "grad_norm": 0.4673303368639725, + "learning_rate": 1.5154901477215756e-06, + "loss": 0.2697, + "step": 5221 + }, + { + "epoch": 2.495071978973777, + "grad_norm": 0.9738069702656994, + "learning_rate": 1.512703791991651e-06, + "loss": 0.2795, + "step": 5222 + }, + { + "epoch": 2.4955498476793503, + "grad_norm": 0.47649155832990264, + "learning_rate": 1.5099197904879792e-06, + "loss": 0.2777, + "step": 5223 + }, + { + "epoch": 2.496027716384923, + "grad_norm": 0.454990764062367, + "learning_rate": 1.507138143982797e-06, + "loss": 0.252, + "step": 5224 + }, + { + "epoch": 2.4965055850904965, + "grad_norm": 0.4352195318005461, + "learning_rate": 1.5043588532476827e-06, + "loss": 0.2558, + "step": 5225 + }, + { + "epoch": 2.4969834537960693, + "grad_norm": 0.4444748687649701, + "learning_rate": 1.5015819190535586e-06, + "loss": 0.2738, + "step": 5226 + }, + { + "epoch": 2.4974613225016427, + "grad_norm": 0.45483690731958265, + "learning_rate": 1.498807342170704e-06, + "loss": 0.2872, + "step": 5227 + }, + { + "epoch": 2.497939191207216, + "grad_norm": 0.4561626305870393, + "learning_rate": 1.4960351233687342e-06, + "loss": 0.2676, + "step": 5228 + }, + { + "epoch": 2.498417059912789, + "grad_norm": 0.5047010790116747, + "learning_rate": 1.493265263416611e-06, + "loss": 0.2802, + "step": 5229 + }, + { + "epoch": 2.498894928618362, + "grad_norm": 0.43638203904528666, + "learning_rate": 1.49049776308265e-06, + "loss": 0.264, + "step": 5230 + }, + { + "epoch": 2.499372797323935, + "grad_norm": 0.4343364990309924, + "learning_rate": 1.4877326231345046e-06, + "loss": 0.2681, + "step": 5231 + }, + { + "epoch": 2.4998506660295083, + "grad_norm": 0.8690286716299779, + "learning_rate": 1.4849698443391724e-06, + "loss": 0.2644, + "step": 5232 + }, + { + "epoch": 2.5003285347350817, + "grad_norm": 0.47147128875554356, + "learning_rate": 1.4822094274630062e-06, + "loss": 0.2457, + "step": 5233 + }, + { + "epoch": 2.5008064034406545, + "grad_norm": 0.4673984226297333, + "learning_rate": 1.479451373271693e-06, + "loss": 0.2765, + "step": 5234 + }, + { + "epoch": 2.501284272146228, + "grad_norm": 0.4542815587395272, + "learning_rate": 1.476695682530268e-06, + "loss": 0.2895, + "step": 5235 + }, + { + "epoch": 2.5017621408518007, + "grad_norm": 0.4454228012735007, + "learning_rate": 1.473942356003113e-06, + "loss": 0.2511, + "step": 5236 + }, + { + "epoch": 2.502240009557374, + "grad_norm": 0.47387095366988674, + "learning_rate": 1.4711913944539524e-06, + "loss": 0.2591, + "step": 5237 + }, + { + "epoch": 2.5027178782629473, + "grad_norm": 0.44223677795487265, + "learning_rate": 1.4684427986458506e-06, + "loss": 0.2741, + "step": 5238 + }, + { + "epoch": 2.5031957469685207, + "grad_norm": 0.4744918967505802, + "learning_rate": 1.465696569341224e-06, + "loss": 0.2664, + "step": 5239 + }, + { + "epoch": 2.5036736156740935, + "grad_norm": 0.4452348551981471, + "learning_rate": 1.4629527073018267e-06, + "loss": 0.2449, + "step": 5240 + }, + { + "epoch": 2.504151484379667, + "grad_norm": 0.449269605401987, + "learning_rate": 1.4602112132887558e-06, + "loss": 0.2602, + "step": 5241 + }, + { + "epoch": 2.5046293530852397, + "grad_norm": 0.4633053489713711, + "learning_rate": 1.457472088062457e-06, + "loss": 0.2636, + "step": 5242 + }, + { + "epoch": 2.505107221790813, + "grad_norm": 0.4873950617138543, + "learning_rate": 1.4547353323827141e-06, + "loss": 0.2725, + "step": 5243 + }, + { + "epoch": 2.5055850904963863, + "grad_norm": 0.47479818038514116, + "learning_rate": 1.4520009470086505e-06, + "loss": 0.2502, + "step": 5244 + }, + { + "epoch": 2.506062959201959, + "grad_norm": 0.4685977267735002, + "learning_rate": 1.449268932698743e-06, + "loss": 0.2608, + "step": 5245 + }, + { + "epoch": 2.5065408279075325, + "grad_norm": 0.4536044909575744, + "learning_rate": 1.4465392902108011e-06, + "loss": 0.2439, + "step": 5246 + }, + { + "epoch": 2.5070186966131054, + "grad_norm": 0.4361436758103319, + "learning_rate": 1.4438120203019779e-06, + "loss": 0.26, + "step": 5247 + }, + { + "epoch": 2.5074965653186787, + "grad_norm": 0.47056425580013644, + "learning_rate": 1.4410871237287738e-06, + "loss": 0.2575, + "step": 5248 + }, + { + "epoch": 2.507974434024252, + "grad_norm": 0.48001156924264604, + "learning_rate": 1.4383646012470254e-06, + "loss": 0.2782, + "step": 5249 + }, + { + "epoch": 2.508452302729825, + "grad_norm": 0.4657177084103505, + "learning_rate": 1.4356444536119085e-06, + "loss": 0.2592, + "step": 5250 + }, + { + "epoch": 2.508930171435398, + "grad_norm": 0.4329970043724182, + "learning_rate": 1.4329266815779507e-06, + "loss": 0.2594, + "step": 5251 + }, + { + "epoch": 2.509408040140971, + "grad_norm": 0.4624450792688428, + "learning_rate": 1.4302112858990103e-06, + "loss": 0.2633, + "step": 5252 + }, + { + "epoch": 2.5098859088465444, + "grad_norm": 0.45995574276765083, + "learning_rate": 1.4274982673282867e-06, + "loss": 0.2872, + "step": 5253 + }, + { + "epoch": 2.5103637775521177, + "grad_norm": 0.5483488802392914, + "learning_rate": 1.4247876266183314e-06, + "loss": 0.2601, + "step": 5254 + }, + { + "epoch": 2.5108416462576906, + "grad_norm": 0.5288243133437358, + "learning_rate": 1.422079364521024e-06, + "loss": 0.2625, + "step": 5255 + }, + { + "epoch": 2.511319514963264, + "grad_norm": 0.44122821307838805, + "learning_rate": 1.419373481787587e-06, + "loss": 0.2498, + "step": 5256 + }, + { + "epoch": 2.5117973836688368, + "grad_norm": 0.4805211349693034, + "learning_rate": 1.41666997916859e-06, + "loss": 0.2691, + "step": 5257 + }, + { + "epoch": 2.51227525237441, + "grad_norm": 0.4828655990618, + "learning_rate": 1.413968857413932e-06, + "loss": 0.2589, + "step": 5258 + }, + { + "epoch": 2.5127531210799834, + "grad_norm": 0.45262455510116995, + "learning_rate": 1.411270117272856e-06, + "loss": 0.268, + "step": 5259 + }, + { + "epoch": 2.5132309897855563, + "grad_norm": 0.4517397475555426, + "learning_rate": 1.4085737594939497e-06, + "loss": 0.2474, + "step": 5260 + }, + { + "epoch": 2.5137088584911296, + "grad_norm": 0.4581435924810488, + "learning_rate": 1.4058797848251315e-06, + "loss": 0.2687, + "step": 5261 + }, + { + "epoch": 2.5141867271967024, + "grad_norm": 0.48238520728771184, + "learning_rate": 1.4031881940136615e-06, + "loss": 0.275, + "step": 5262 + }, + { + "epoch": 2.5146645959022758, + "grad_norm": 0.4780046524911139, + "learning_rate": 1.4004989878061437e-06, + "loss": 0.2711, + "step": 5263 + }, + { + "epoch": 2.515142464607849, + "grad_norm": 0.45615713033317606, + "learning_rate": 1.3978121669485135e-06, + "loss": 0.279, + "step": 5264 + }, + { + "epoch": 2.5156203333134224, + "grad_norm": 0.48685067829542655, + "learning_rate": 1.3951277321860468e-06, + "loss": 0.2517, + "step": 5265 + }, + { + "epoch": 2.5160982020189953, + "grad_norm": 0.44894317063915157, + "learning_rate": 1.3924456842633615e-06, + "loss": 0.267, + "step": 5266 + }, + { + "epoch": 2.5165760707245686, + "grad_norm": 0.8685504993927438, + "learning_rate": 1.3897660239244093e-06, + "loss": 0.266, + "step": 5267 + }, + { + "epoch": 2.5170539394301414, + "grad_norm": 0.4609315916656919, + "learning_rate": 1.3870887519124777e-06, + "loss": 0.2784, + "step": 5268 + }, + { + "epoch": 2.5175318081357148, + "grad_norm": 0.4691311012646939, + "learning_rate": 1.384413868970199e-06, + "loss": 0.2626, + "step": 5269 + }, + { + "epoch": 2.518009676841288, + "grad_norm": 0.47785266314222014, + "learning_rate": 1.381741375839537e-06, + "loss": 0.2656, + "step": 5270 + }, + { + "epoch": 2.518487545546861, + "grad_norm": 0.45350612752353825, + "learning_rate": 1.3790712732617918e-06, + "loss": 0.2717, + "step": 5271 + }, + { + "epoch": 2.5189654142524343, + "grad_norm": 0.4465246048292593, + "learning_rate": 1.3764035619776062e-06, + "loss": 0.2549, + "step": 5272 + }, + { + "epoch": 2.519443282958007, + "grad_norm": 0.4413523350209254, + "learning_rate": 1.3737382427269551e-06, + "loss": 0.2622, + "step": 5273 + }, + { + "epoch": 2.5199211516635804, + "grad_norm": 0.4402416126165139, + "learning_rate": 1.3710753162491498e-06, + "loss": 0.2425, + "step": 5274 + }, + { + "epoch": 2.5203990203691538, + "grad_norm": 0.4986151441542075, + "learning_rate": 1.3684147832828409e-06, + "loss": 0.2519, + "step": 5275 + }, + { + "epoch": 2.5208768890747266, + "grad_norm": 0.5457742238724771, + "learning_rate": 1.365756644566013e-06, + "loss": 0.2611, + "step": 5276 + }, + { + "epoch": 2.5213547577803, + "grad_norm": 0.4435897317652216, + "learning_rate": 1.3631009008359874e-06, + "loss": 0.2667, + "step": 5277 + }, + { + "epoch": 2.521832626485873, + "grad_norm": 0.45681886545784145, + "learning_rate": 1.360447552829417e-06, + "loss": 0.2539, + "step": 5278 + }, + { + "epoch": 2.522310495191446, + "grad_norm": 0.45114564080703556, + "learning_rate": 1.3577966012822974e-06, + "loss": 0.2526, + "step": 5279 + }, + { + "epoch": 2.5227883638970194, + "grad_norm": 0.5748577158549483, + "learning_rate": 1.355148046929956e-06, + "loss": 0.2846, + "step": 5280 + }, + { + "epoch": 2.5232662326025923, + "grad_norm": 0.5478505157579419, + "learning_rate": 1.352501890507051e-06, + "loss": 0.2546, + "step": 5281 + }, + { + "epoch": 2.5237441013081656, + "grad_norm": 1.090704314845147, + "learning_rate": 1.3498581327475847e-06, + "loss": 0.2617, + "step": 5282 + }, + { + "epoch": 2.5242219700137385, + "grad_norm": 0.44651134484658483, + "learning_rate": 1.3472167743848863e-06, + "loss": 0.2797, + "step": 5283 + }, + { + "epoch": 2.524699838719312, + "grad_norm": 0.4332379133919312, + "learning_rate": 1.344577816151621e-06, + "loss": 0.2693, + "step": 5284 + }, + { + "epoch": 2.525177707424885, + "grad_norm": 0.4454197457889718, + "learning_rate": 1.3419412587797908e-06, + "loss": 0.2913, + "step": 5285 + }, + { + "epoch": 2.525655576130458, + "grad_norm": 0.5539819066751364, + "learning_rate": 1.3393071030007298e-06, + "loss": 0.2734, + "step": 5286 + }, + { + "epoch": 2.5261334448360313, + "grad_norm": 0.4907491307225207, + "learning_rate": 1.3366753495451046e-06, + "loss": 0.2559, + "step": 5287 + }, + { + "epoch": 2.526611313541604, + "grad_norm": 0.44486691210900325, + "learning_rate": 1.3340459991429221e-06, + "loss": 0.2717, + "step": 5288 + }, + { + "epoch": 2.5270891822471775, + "grad_norm": 0.524341426871989, + "learning_rate": 1.3314190525235148e-06, + "loss": 0.2785, + "step": 5289 + }, + { + "epoch": 2.527567050952751, + "grad_norm": 0.4507961520371896, + "learning_rate": 1.3287945104155487e-06, + "loss": 0.2692, + "step": 5290 + }, + { + "epoch": 2.528044919658324, + "grad_norm": 0.4931269433170495, + "learning_rate": 1.32617237354703e-06, + "loss": 0.2577, + "step": 5291 + }, + { + "epoch": 2.528522788363897, + "grad_norm": 0.44578474408366947, + "learning_rate": 1.3235526426452916e-06, + "loss": 0.2853, + "step": 5292 + }, + { + "epoch": 2.5290006570694703, + "grad_norm": 0.4549319341008648, + "learning_rate": 1.3209353184369978e-06, + "loss": 0.2599, + "step": 5293 + }, + { + "epoch": 2.529478525775043, + "grad_norm": 0.44026422411476585, + "learning_rate": 1.318320401648152e-06, + "loss": 0.274, + "step": 5294 + }, + { + "epoch": 2.5299563944806165, + "grad_norm": 0.44736941554716725, + "learning_rate": 1.3157078930040856e-06, + "loss": 0.2668, + "step": 5295 + }, + { + "epoch": 2.53043426318619, + "grad_norm": 0.4735831103712228, + "learning_rate": 1.3130977932294597e-06, + "loss": 0.2688, + "step": 5296 + }, + { + "epoch": 2.5309121318917627, + "grad_norm": 0.4685212925748311, + "learning_rate": 1.310490103048273e-06, + "loss": 0.2482, + "step": 5297 + }, + { + "epoch": 2.531390000597336, + "grad_norm": 0.4593947890008595, + "learning_rate": 1.3078848231838514e-06, + "loss": 0.2581, + "step": 5298 + }, + { + "epoch": 2.531867869302909, + "grad_norm": 0.4311892224206543, + "learning_rate": 1.3052819543588512e-06, + "loss": 0.2465, + "step": 5299 + }, + { + "epoch": 2.532345738008482, + "grad_norm": 0.46314465945990146, + "learning_rate": 1.3026814972952674e-06, + "loss": 0.2684, + "step": 5300 + }, + { + "epoch": 2.5328236067140555, + "grad_norm": 0.46309469275162923, + "learning_rate": 1.300083452714418e-06, + "loss": 0.2615, + "step": 5301 + }, + { + "epoch": 2.5333014754196284, + "grad_norm": 0.4437569382154475, + "learning_rate": 1.2974878213369523e-06, + "loss": 0.2639, + "step": 5302 + }, + { + "epoch": 2.5337793441252017, + "grad_norm": 0.5664502089285215, + "learning_rate": 1.294894603882858e-06, + "loss": 0.2615, + "step": 5303 + }, + { + "epoch": 2.5342572128307745, + "grad_norm": 0.4592275859545169, + "learning_rate": 1.2923038010714451e-06, + "loss": 0.2601, + "step": 5304 + }, + { + "epoch": 2.534735081536348, + "grad_norm": 0.4309337745483858, + "learning_rate": 1.2897154136213542e-06, + "loss": 0.2823, + "step": 5305 + }, + { + "epoch": 2.535212950241921, + "grad_norm": 0.48862862670470414, + "learning_rate": 1.287129442250562e-06, + "loss": 0.2745, + "step": 5306 + }, + { + "epoch": 2.535690818947494, + "grad_norm": 0.83841385727078, + "learning_rate": 1.2845458876763718e-06, + "loss": 0.2782, + "step": 5307 + }, + { + "epoch": 2.5361686876530674, + "grad_norm": 0.43581783536876467, + "learning_rate": 1.281964750615412e-06, + "loss": 0.2651, + "step": 5308 + }, + { + "epoch": 2.5366465563586402, + "grad_norm": 0.4411137035318371, + "learning_rate": 1.2793860317836482e-06, + "loss": 0.2884, + "step": 5309 + }, + { + "epoch": 2.5371244250642135, + "grad_norm": 0.46287211410540297, + "learning_rate": 1.2768097318963701e-06, + "loss": 0.2376, + "step": 5310 + }, + { + "epoch": 2.537602293769787, + "grad_norm": 0.44652943411212714, + "learning_rate": 1.2742358516681963e-06, + "loss": 0.2763, + "step": 5311 + }, + { + "epoch": 2.53808016247536, + "grad_norm": 0.45278372197354316, + "learning_rate": 1.27166439181308e-06, + "loss": 0.2682, + "step": 5312 + }, + { + "epoch": 2.538558031180933, + "grad_norm": 0.44885245556664793, + "learning_rate": 1.2690953530442963e-06, + "loss": 0.2417, + "step": 5313 + }, + { + "epoch": 2.539035899886506, + "grad_norm": 0.43904932734636665, + "learning_rate": 1.2665287360744482e-06, + "loss": 0.2647, + "step": 5314 + }, + { + "epoch": 2.5395137685920792, + "grad_norm": 0.42657730106507596, + "learning_rate": 1.2639645416154744e-06, + "loss": 0.2546, + "step": 5315 + }, + { + "epoch": 2.5399916372976525, + "grad_norm": 0.5240326634712088, + "learning_rate": 1.2614027703786369e-06, + "loss": 0.249, + "step": 5316 + }, + { + "epoch": 2.540469506003226, + "grad_norm": 0.44736006043482424, + "learning_rate": 1.2588434230745228e-06, + "loss": 0.2483, + "step": 5317 + }, + { + "epoch": 2.5409473747087987, + "grad_norm": 0.4431785975651459, + "learning_rate": 1.2562865004130532e-06, + "loss": 0.2623, + "step": 5318 + }, + { + "epoch": 2.541425243414372, + "grad_norm": 0.4599390657644793, + "learning_rate": 1.2537320031034717e-06, + "loss": 0.2704, + "step": 5319 + }, + { + "epoch": 2.541903112119945, + "grad_norm": 0.4588576873276534, + "learning_rate": 1.2511799318543493e-06, + "loss": 0.2862, + "step": 5320 + }, + { + "epoch": 2.542380980825518, + "grad_norm": 0.4409900007452561, + "learning_rate": 1.2486302873735878e-06, + "loss": 0.2639, + "step": 5321 + }, + { + "epoch": 2.5428588495310915, + "grad_norm": 0.4463947225335007, + "learning_rate": 1.2460830703684147e-06, + "loss": 0.2563, + "step": 5322 + }, + { + "epoch": 2.5433367182366644, + "grad_norm": 0.44554797570869803, + "learning_rate": 1.243538281545381e-06, + "loss": 0.2618, + "step": 5323 + }, + { + "epoch": 2.5438145869422377, + "grad_norm": 0.4366283293412151, + "learning_rate": 1.2409959216103651e-06, + "loss": 0.2727, + "step": 5324 + }, + { + "epoch": 2.5442924556478106, + "grad_norm": 0.6447095546778173, + "learning_rate": 1.2384559912685768e-06, + "loss": 0.2596, + "step": 5325 + }, + { + "epoch": 2.544770324353384, + "grad_norm": 0.46130620935874256, + "learning_rate": 1.2359184912245448e-06, + "loss": 0.2674, + "step": 5326 + }, + { + "epoch": 2.545248193058957, + "grad_norm": 0.45602539579070933, + "learning_rate": 1.2333834221821262e-06, + "loss": 0.2573, + "step": 5327 + }, + { + "epoch": 2.54572606176453, + "grad_norm": 0.4549646150650348, + "learning_rate": 1.2308507848445072e-06, + "loss": 0.2579, + "step": 5328 + }, + { + "epoch": 2.5462039304701034, + "grad_norm": 0.4415752044414283, + "learning_rate": 1.228320579914195e-06, + "loss": 0.2794, + "step": 5329 + }, + { + "epoch": 2.5466817991756763, + "grad_norm": 0.47703375879035975, + "learning_rate": 1.2257928080930236e-06, + "loss": 0.2729, + "step": 5330 + }, + { + "epoch": 2.5471596678812496, + "grad_norm": 0.6782750426427807, + "learning_rate": 1.2232674700821535e-06, + "loss": 0.2681, + "step": 5331 + }, + { + "epoch": 2.547637536586823, + "grad_norm": 0.44620672912194137, + "learning_rate": 1.2207445665820695e-06, + "loss": 0.2655, + "step": 5332 + }, + { + "epoch": 2.5481154052923958, + "grad_norm": 0.49483270413584723, + "learning_rate": 1.2182240982925764e-06, + "loss": 0.255, + "step": 5333 + }, + { + "epoch": 2.548593273997969, + "grad_norm": 0.4760394124784553, + "learning_rate": 1.2157060659128128e-06, + "loss": 0.261, + "step": 5334 + }, + { + "epoch": 2.549071142703542, + "grad_norm": 0.43536619499616924, + "learning_rate": 1.2131904701412345e-06, + "loss": 0.2699, + "step": 5335 + }, + { + "epoch": 2.5495490114091153, + "grad_norm": 0.4567672339365635, + "learning_rate": 1.2106773116756198e-06, + "loss": 0.2887, + "step": 5336 + }, + { + "epoch": 2.5500268801146886, + "grad_norm": 0.44424439879811406, + "learning_rate": 1.2081665912130813e-06, + "loss": 0.2715, + "step": 5337 + }, + { + "epoch": 2.550504748820262, + "grad_norm": 0.47455117509378236, + "learning_rate": 1.2056583094500451e-06, + "loss": 0.2692, + "step": 5338 + }, + { + "epoch": 2.5509826175258348, + "grad_norm": 0.4577068157459303, + "learning_rate": 1.2031524670822613e-06, + "loss": 0.2527, + "step": 5339 + }, + { + "epoch": 2.551460486231408, + "grad_norm": 0.4635419551668722, + "learning_rate": 1.2006490648048118e-06, + "loss": 0.2616, + "step": 5340 + }, + { + "epoch": 2.551938354936981, + "grad_norm": 0.4872478703098568, + "learning_rate": 1.1981481033120945e-06, + "loss": 0.2779, + "step": 5341 + }, + { + "epoch": 2.5524162236425543, + "grad_norm": 0.43900058178403417, + "learning_rate": 1.1956495832978289e-06, + "loss": 0.2635, + "step": 5342 + }, + { + "epoch": 2.5528940923481276, + "grad_norm": 0.48199098433370163, + "learning_rate": 1.1931535054550647e-06, + "loss": 0.2598, + "step": 5343 + }, + { + "epoch": 2.5533719610537005, + "grad_norm": 0.476582985861211, + "learning_rate": 1.1906598704761685e-06, + "loss": 0.2681, + "step": 5344 + }, + { + "epoch": 2.5538498297592738, + "grad_norm": 0.43490218370942263, + "learning_rate": 1.1881686790528279e-06, + "loss": 0.2522, + "step": 5345 + }, + { + "epoch": 2.5543276984648466, + "grad_norm": 0.435352449223695, + "learning_rate": 1.1856799318760592e-06, + "loss": 0.2582, + "step": 5346 + }, + { + "epoch": 2.55480556717042, + "grad_norm": 0.48299189717460383, + "learning_rate": 1.1831936296361957e-06, + "loss": 0.2582, + "step": 5347 + }, + { + "epoch": 2.5552834358759933, + "grad_norm": 0.45700996353152185, + "learning_rate": 1.1807097730228912e-06, + "loss": 0.2652, + "step": 5348 + }, + { + "epoch": 2.555761304581566, + "grad_norm": 0.45128915246269297, + "learning_rate": 1.178228362725129e-06, + "loss": 0.2566, + "step": 5349 + }, + { + "epoch": 2.5562391732871395, + "grad_norm": 0.49437093690101924, + "learning_rate": 1.1757493994312052e-06, + "loss": 0.2716, + "step": 5350 + }, + { + "epoch": 2.5567170419927123, + "grad_norm": 0.5541061982972699, + "learning_rate": 1.1732728838287388e-06, + "loss": 0.2563, + "step": 5351 + }, + { + "epoch": 2.5571949106982856, + "grad_norm": 0.46593343771916595, + "learning_rate": 1.1707988166046757e-06, + "loss": 0.2553, + "step": 5352 + }, + { + "epoch": 2.557672779403859, + "grad_norm": 0.48119739643546555, + "learning_rate": 1.168327198445276e-06, + "loss": 0.2692, + "step": 5353 + }, + { + "epoch": 2.558150648109432, + "grad_norm": 0.44694892042251333, + "learning_rate": 1.1658580300361223e-06, + "loss": 0.2623, + "step": 5354 + }, + { + "epoch": 2.558628516815005, + "grad_norm": 0.7466740830810888, + "learning_rate": 1.1633913120621188e-06, + "loss": 0.275, + "step": 5355 + }, + { + "epoch": 2.559106385520578, + "grad_norm": 0.4510071850128832, + "learning_rate": 1.1609270452074917e-06, + "loss": 0.2514, + "step": 5356 + }, + { + "epoch": 2.5595842542261513, + "grad_norm": 0.4667471409690589, + "learning_rate": 1.158465230155784e-06, + "loss": 0.2806, + "step": 5357 + }, + { + "epoch": 2.5600621229317246, + "grad_norm": 0.4836523364080265, + "learning_rate": 1.1560058675898577e-06, + "loss": 0.277, + "step": 5358 + }, + { + "epoch": 2.5605399916372975, + "grad_norm": 0.4864707078033067, + "learning_rate": 1.1535489581919012e-06, + "loss": 0.2656, + "step": 5359 + }, + { + "epoch": 2.561017860342871, + "grad_norm": 0.43787020641937124, + "learning_rate": 1.151094502643414e-06, + "loss": 0.2451, + "step": 5360 + }, + { + "epoch": 2.5614957290484437, + "grad_norm": 0.456591257908703, + "learning_rate": 1.148642501625218e-06, + "loss": 0.2525, + "step": 5361 + }, + { + "epoch": 2.561973597754017, + "grad_norm": 0.45414715242812, + "learning_rate": 1.1461929558174589e-06, + "loss": 0.2576, + "step": 5362 + }, + { + "epoch": 2.5624514664595903, + "grad_norm": 0.4297472089254829, + "learning_rate": 1.1437458658995947e-06, + "loss": 0.2565, + "step": 5363 + }, + { + "epoch": 2.5629293351651636, + "grad_norm": 0.6242800594293806, + "learning_rate": 1.1413012325504048e-06, + "loss": 0.252, + "step": 5364 + }, + { + "epoch": 2.5634072038707365, + "grad_norm": 0.44685469395106875, + "learning_rate": 1.1388590564479895e-06, + "loss": 0.2716, + "step": 5365 + }, + { + "epoch": 2.56388507257631, + "grad_norm": 0.43936844793345164, + "learning_rate": 1.1364193382697642e-06, + "loss": 0.2564, + "step": 5366 + }, + { + "epoch": 2.5643629412818827, + "grad_norm": 0.4655861680277552, + "learning_rate": 1.1339820786924616e-06, + "loss": 0.2878, + "step": 5367 + }, + { + "epoch": 2.564840809987456, + "grad_norm": 0.46292732997498603, + "learning_rate": 1.1315472783921378e-06, + "loss": 0.2688, + "step": 5368 + }, + { + "epoch": 2.5653186786930293, + "grad_norm": 0.4598254943620399, + "learning_rate": 1.1291149380441636e-06, + "loss": 0.2673, + "step": 5369 + }, + { + "epoch": 2.565796547398602, + "grad_norm": 0.454472063477016, + "learning_rate": 1.1266850583232224e-06, + "loss": 0.2478, + "step": 5370 + }, + { + "epoch": 2.5662744161041755, + "grad_norm": 0.4325026111247085, + "learning_rate": 1.1242576399033267e-06, + "loss": 0.2562, + "step": 5371 + }, + { + "epoch": 2.5667522848097484, + "grad_norm": 0.4453897324065896, + "learning_rate": 1.1218326834577953e-06, + "loss": 0.2518, + "step": 5372 + }, + { + "epoch": 2.5672301535153217, + "grad_norm": 0.4650279923070856, + "learning_rate": 1.119410189659268e-06, + "loss": 0.2888, + "step": 5373 + }, + { + "epoch": 2.567708022220895, + "grad_norm": 0.4475210944515468, + "learning_rate": 1.116990159179705e-06, + "loss": 0.2607, + "step": 5374 + }, + { + "epoch": 2.568185890926468, + "grad_norm": 0.4867286499909107, + "learning_rate": 1.1145725926903772e-06, + "loss": 0.2759, + "step": 5375 + }, + { + "epoch": 2.568663759632041, + "grad_norm": 0.5000116826228374, + "learning_rate": 1.112157490861875e-06, + "loss": 0.2527, + "step": 5376 + }, + { + "epoch": 2.569141628337614, + "grad_norm": 0.4353063417332031, + "learning_rate": 1.1097448543641077e-06, + "loss": 0.257, + "step": 5377 + }, + { + "epoch": 2.5696194970431874, + "grad_norm": 0.5646763109551065, + "learning_rate": 1.107334683866297e-06, + "loss": 0.2804, + "step": 5378 + }, + { + "epoch": 2.5700973657487607, + "grad_norm": 0.4829620257030537, + "learning_rate": 1.1049269800369787e-06, + "loss": 0.2713, + "step": 5379 + }, + { + "epoch": 2.5705752344543336, + "grad_norm": 0.44373746829302196, + "learning_rate": 1.1025217435440116e-06, + "loss": 0.2708, + "step": 5380 + }, + { + "epoch": 2.571053103159907, + "grad_norm": 0.45314693225438013, + "learning_rate": 1.1001189750545637e-06, + "loss": 0.2661, + "step": 5381 + }, + { + "epoch": 2.5715309718654797, + "grad_norm": 0.4396802093044451, + "learning_rate": 1.0977186752351187e-06, + "loss": 0.2518, + "step": 5382 + }, + { + "epoch": 2.572008840571053, + "grad_norm": 0.43552638814311895, + "learning_rate": 1.095320844751483e-06, + "loss": 0.2547, + "step": 5383 + }, + { + "epoch": 2.5724867092766264, + "grad_norm": 0.46000133848749897, + "learning_rate": 1.0929254842687676e-06, + "loss": 0.2746, + "step": 5384 + }, + { + "epoch": 2.5729645779821992, + "grad_norm": 0.514481355649078, + "learning_rate": 1.0905325944514034e-06, + "loss": 0.2714, + "step": 5385 + }, + { + "epoch": 2.5734424466877726, + "grad_norm": 0.44321694655322663, + "learning_rate": 1.0881421759631394e-06, + "loss": 0.2642, + "step": 5386 + }, + { + "epoch": 2.5739203153933454, + "grad_norm": 0.4519160941249428, + "learning_rate": 1.085754229467032e-06, + "loss": 0.2406, + "step": 5387 + }, + { + "epoch": 2.5743981840989187, + "grad_norm": 0.45129680242728143, + "learning_rate": 1.0833687556254558e-06, + "loss": 0.2617, + "step": 5388 + }, + { + "epoch": 2.574876052804492, + "grad_norm": 0.47689305663408194, + "learning_rate": 1.0809857551001013e-06, + "loss": 0.2532, + "step": 5389 + }, + { + "epoch": 2.5753539215100654, + "grad_norm": 0.46337528524382954, + "learning_rate": 1.078605228551971e-06, + "loss": 0.2718, + "step": 5390 + }, + { + "epoch": 2.5758317902156382, + "grad_norm": 0.4470813207728647, + "learning_rate": 1.0762271766413768e-06, + "loss": 0.243, + "step": 5391 + }, + { + "epoch": 2.5763096589212116, + "grad_norm": 0.44289569253985883, + "learning_rate": 1.0738516000279542e-06, + "loss": 0.2492, + "step": 5392 + }, + { + "epoch": 2.5767875276267844, + "grad_norm": 0.4728394742707393, + "learning_rate": 1.0714784993706418e-06, + "loss": 0.2685, + "step": 5393 + }, + { + "epoch": 2.5772653963323577, + "grad_norm": 0.4549229472146639, + "learning_rate": 1.0691078753276962e-06, + "loss": 0.2512, + "step": 5394 + }, + { + "epoch": 2.577743265037931, + "grad_norm": 0.44764612742438686, + "learning_rate": 1.0667397285566893e-06, + "loss": 0.2549, + "step": 5395 + }, + { + "epoch": 2.578221133743504, + "grad_norm": 0.4822527109733988, + "learning_rate": 1.0643740597145025e-06, + "loss": 0.2631, + "step": 5396 + }, + { + "epoch": 2.5786990024490772, + "grad_norm": 0.4572397580448997, + "learning_rate": 1.0620108694573272e-06, + "loss": 0.2842, + "step": 5397 + }, + { + "epoch": 2.57917687115465, + "grad_norm": 0.42473552272997284, + "learning_rate": 1.0596501584406749e-06, + "loss": 0.2633, + "step": 5398 + }, + { + "epoch": 2.5796547398602234, + "grad_norm": 0.4314598541362525, + "learning_rate": 1.0572919273193639e-06, + "loss": 0.2598, + "step": 5399 + }, + { + "epoch": 2.5801326085657967, + "grad_norm": 0.43609204699648757, + "learning_rate": 1.0549361767475241e-06, + "loss": 0.2537, + "step": 5400 + }, + { + "epoch": 2.5806104772713696, + "grad_norm": 0.448934958642449, + "learning_rate": 1.052582907378602e-06, + "loss": 0.2655, + "step": 5401 + }, + { + "epoch": 2.581088345976943, + "grad_norm": 0.44431074903974976, + "learning_rate": 1.050232119865352e-06, + "loss": 0.266, + "step": 5402 + }, + { + "epoch": 2.581566214682516, + "grad_norm": 0.4752805202026781, + "learning_rate": 1.047883814859838e-06, + "loss": 0.2891, + "step": 5403 + }, + { + "epoch": 2.582044083388089, + "grad_norm": 0.4475172447028092, + "learning_rate": 1.0455379930134435e-06, + "loss": 0.2694, + "step": 5404 + }, + { + "epoch": 2.5825219520936624, + "grad_norm": 0.5428975699380926, + "learning_rate": 1.0431946549768567e-06, + "loss": 0.2692, + "step": 5405 + }, + { + "epoch": 2.5829998207992353, + "grad_norm": 0.46361910025632774, + "learning_rate": 1.0408538014000747e-06, + "loss": 0.2635, + "step": 5406 + }, + { + "epoch": 2.5834776895048086, + "grad_norm": 0.4420791110581533, + "learning_rate": 1.0385154329324132e-06, + "loss": 0.2624, + "step": 5407 + }, + { + "epoch": 2.5839555582103815, + "grad_norm": 0.4595527525010734, + "learning_rate": 1.0361795502224925e-06, + "loss": 0.2608, + "step": 5408 + }, + { + "epoch": 2.584433426915955, + "grad_norm": 0.45101202999040235, + "learning_rate": 1.0338461539182443e-06, + "loss": 0.2618, + "step": 5409 + }, + { + "epoch": 2.584911295621528, + "grad_norm": 0.4599073300017213, + "learning_rate": 1.0315152446669142e-06, + "loss": 0.2703, + "step": 5410 + }, + { + "epoch": 2.585389164327101, + "grad_norm": 0.5079598888681588, + "learning_rate": 1.0291868231150537e-06, + "loss": 0.2681, + "step": 5411 + }, + { + "epoch": 2.5858670330326743, + "grad_norm": 0.48499751233901656, + "learning_rate": 1.0268608899085241e-06, + "loss": 0.2548, + "step": 5412 + }, + { + "epoch": 2.586344901738247, + "grad_norm": 0.4422616538627771, + "learning_rate": 1.0245374456925029e-06, + "loss": 0.2696, + "step": 5413 + }, + { + "epoch": 2.5868227704438205, + "grad_norm": 0.4443272586161862, + "learning_rate": 1.0222164911114697e-06, + "loss": 0.2582, + "step": 5414 + }, + { + "epoch": 2.587300639149394, + "grad_norm": 0.44296846825131514, + "learning_rate": 1.019898026809214e-06, + "loss": 0.2803, + "step": 5415 + }, + { + "epoch": 2.587778507854967, + "grad_norm": 0.4517679987221167, + "learning_rate": 1.0175820534288416e-06, + "loss": 0.2806, + "step": 5416 + }, + { + "epoch": 2.58825637656054, + "grad_norm": 0.48611487954550253, + "learning_rate": 1.0152685716127598e-06, + "loss": 0.2602, + "step": 5417 + }, + { + "epoch": 2.5887342452661133, + "grad_norm": 0.4628864103901881, + "learning_rate": 1.0129575820026872e-06, + "loss": 0.272, + "step": 5418 + }, + { + "epoch": 2.589212113971686, + "grad_norm": 0.4649352347539501, + "learning_rate": 1.0106490852396544e-06, + "loss": 0.2539, + "step": 5419 + }, + { + "epoch": 2.5896899826772595, + "grad_norm": 0.43939173718242563, + "learning_rate": 1.0083430819639962e-06, + "loss": 0.2475, + "step": 5420 + }, + { + "epoch": 2.590167851382833, + "grad_norm": 0.44410831432228415, + "learning_rate": 1.0060395728153539e-06, + "loss": 0.2722, + "step": 5421 + }, + { + "epoch": 2.5906457200884057, + "grad_norm": 0.5288794827685146, + "learning_rate": 1.0037385584326843e-06, + "loss": 0.2566, + "step": 5422 + }, + { + "epoch": 2.591123588793979, + "grad_norm": 0.43733087257688036, + "learning_rate": 1.0014400394542489e-06, + "loss": 0.2572, + "step": 5423 + }, + { + "epoch": 2.591601457499552, + "grad_norm": 0.4783741515835818, + "learning_rate": 9.991440165176147e-07, + "loss": 0.251, + "step": 5424 + }, + { + "epoch": 2.592079326205125, + "grad_norm": 0.45766909636052455, + "learning_rate": 9.968504902596566e-07, + "loss": 0.2809, + "step": 5425 + }, + { + "epoch": 2.5925571949106985, + "grad_norm": 0.47258848724591235, + "learning_rate": 9.94559461316561e-07, + "loss": 0.262, + "step": 5426 + }, + { + "epoch": 2.5930350636162713, + "grad_norm": 0.4790702365770836, + "learning_rate": 9.922709303238175e-07, + "loss": 0.262, + "step": 5427 + }, + { + "epoch": 2.5935129323218447, + "grad_norm": 0.676497020285866, + "learning_rate": 9.899848979162218e-07, + "loss": 0.2711, + "step": 5428 + }, + { + "epoch": 2.5939908010274175, + "grad_norm": 0.44747328409752335, + "learning_rate": 9.877013647278844e-07, + "loss": 0.2647, + "step": 5429 + }, + { + "epoch": 2.594468669732991, + "grad_norm": 0.4606135261704647, + "learning_rate": 9.85420331392214e-07, + "loss": 0.2532, + "step": 5430 + }, + { + "epoch": 2.594946538438564, + "grad_norm": 0.43604837240539995, + "learning_rate": 9.831417985419278e-07, + "loss": 0.2645, + "step": 5431 + }, + { + "epoch": 2.595424407144137, + "grad_norm": 0.4373132967875298, + "learning_rate": 9.808657668090527e-07, + "loss": 0.2579, + "step": 5432 + }, + { + "epoch": 2.5959022758497103, + "grad_norm": 0.4573400005148769, + "learning_rate": 9.785922368249201e-07, + "loss": 0.2649, + "step": 5433 + }, + { + "epoch": 2.596380144555283, + "grad_norm": 0.4286328321592479, + "learning_rate": 9.763212092201634e-07, + "loss": 0.2771, + "step": 5434 + }, + { + "epoch": 2.5968580132608565, + "grad_norm": 0.46487186359736915, + "learning_rate": 9.74052684624731e-07, + "loss": 0.2605, + "step": 5435 + }, + { + "epoch": 2.59733588196643, + "grad_norm": 0.45134027488832124, + "learning_rate": 9.717866636678685e-07, + "loss": 0.2696, + "step": 5436 + }, + { + "epoch": 2.5978137506720027, + "grad_norm": 0.4315621650790117, + "learning_rate": 9.695231469781285e-07, + "loss": 0.2595, + "step": 5437 + }, + { + "epoch": 2.598291619377576, + "grad_norm": 0.43086059451199493, + "learning_rate": 9.672621351833754e-07, + "loss": 0.2563, + "step": 5438 + }, + { + "epoch": 2.598769488083149, + "grad_norm": 0.4510589878533883, + "learning_rate": 9.6500362891077e-07, + "loss": 0.2753, + "step": 5439 + }, + { + "epoch": 2.599247356788722, + "grad_norm": 0.47484782844891105, + "learning_rate": 9.62747628786782e-07, + "loss": 0.2804, + "step": 5440 + }, + { + "epoch": 2.5997252254942955, + "grad_norm": 0.44734361221136104, + "learning_rate": 9.604941354371899e-07, + "loss": 0.2565, + "step": 5441 + }, + { + "epoch": 2.600203094199869, + "grad_norm": 0.4523591857593359, + "learning_rate": 9.582431494870693e-07, + "loss": 0.2537, + "step": 5442 + }, + { + "epoch": 2.6006809629054417, + "grad_norm": 0.45376389340573675, + "learning_rate": 9.559946715608037e-07, + "loss": 0.2473, + "step": 5443 + }, + { + "epoch": 2.601158831611015, + "grad_norm": 0.5018534323943927, + "learning_rate": 9.537487022820846e-07, + "loss": 0.2692, + "step": 5444 + }, + { + "epoch": 2.601636700316588, + "grad_norm": 0.571660106743449, + "learning_rate": 9.515052422739035e-07, + "loss": 0.2762, + "step": 5445 + }, + { + "epoch": 2.602114569022161, + "grad_norm": 0.4478057549739928, + "learning_rate": 9.492642921585526e-07, + "loss": 0.2676, + "step": 5446 + }, + { + "epoch": 2.6025924377277345, + "grad_norm": 0.4695545143748728, + "learning_rate": 9.47025852557636e-07, + "loss": 0.2771, + "step": 5447 + }, + { + "epoch": 2.6030703064333074, + "grad_norm": 0.4452493287419494, + "learning_rate": 9.447899240920566e-07, + "loss": 0.2578, + "step": 5448 + }, + { + "epoch": 2.6035481751388807, + "grad_norm": 0.440730727424376, + "learning_rate": 9.425565073820198e-07, + "loss": 0.3075, + "step": 5449 + }, + { + "epoch": 2.6040260438444536, + "grad_norm": 0.44510497848773967, + "learning_rate": 9.403256030470386e-07, + "loss": 0.2695, + "step": 5450 + }, + { + "epoch": 2.604503912550027, + "grad_norm": 0.4519704788457889, + "learning_rate": 9.380972117059262e-07, + "loss": 0.2568, + "step": 5451 + }, + { + "epoch": 2.6049817812556, + "grad_norm": 0.5598137706232549, + "learning_rate": 9.358713339767955e-07, + "loss": 0.252, + "step": 5452 + }, + { + "epoch": 2.605459649961173, + "grad_norm": 0.46188600754569076, + "learning_rate": 9.336479704770696e-07, + "loss": 0.2665, + "step": 5453 + }, + { + "epoch": 2.6059375186667464, + "grad_norm": 0.49788442318547643, + "learning_rate": 9.314271218234693e-07, + "loss": 0.2529, + "step": 5454 + }, + { + "epoch": 2.6064153873723193, + "grad_norm": 0.4345623662569078, + "learning_rate": 9.292087886320166e-07, + "loss": 0.2615, + "step": 5455 + }, + { + "epoch": 2.6068932560778926, + "grad_norm": 0.44139821572916704, + "learning_rate": 9.269929715180404e-07, + "loss": 0.2822, + "step": 5456 + }, + { + "epoch": 2.607371124783466, + "grad_norm": 0.4252933161869911, + "learning_rate": 9.247796710961699e-07, + "loss": 0.2483, + "step": 5457 + }, + { + "epoch": 2.6078489934890388, + "grad_norm": 0.4412914528127249, + "learning_rate": 9.225688879803351e-07, + "loss": 0.2647, + "step": 5458 + }, + { + "epoch": 2.608326862194612, + "grad_norm": 0.4357571583911059, + "learning_rate": 9.203606227837658e-07, + "loss": 0.2616, + "step": 5459 + }, + { + "epoch": 2.608804730900185, + "grad_norm": 0.4994871920703874, + "learning_rate": 9.181548761189996e-07, + "loss": 0.2688, + "step": 5460 + }, + { + "epoch": 2.6092825996057583, + "grad_norm": 0.4351987789797297, + "learning_rate": 9.159516485978692e-07, + "loss": 0.2691, + "step": 5461 + }, + { + "epoch": 2.6097604683113316, + "grad_norm": 0.4430845065598318, + "learning_rate": 9.137509408315104e-07, + "loss": 0.2593, + "step": 5462 + }, + { + "epoch": 2.6102383370169044, + "grad_norm": 0.47947175182055163, + "learning_rate": 9.115527534303637e-07, + "loss": 0.2645, + "step": 5463 + }, + { + "epoch": 2.6107162057224778, + "grad_norm": 0.4428209512787218, + "learning_rate": 9.093570870041645e-07, + "loss": 0.2655, + "step": 5464 + }, + { + "epoch": 2.6111940744280506, + "grad_norm": 0.46538192551551194, + "learning_rate": 9.071639421619527e-07, + "loss": 0.2834, + "step": 5465 + }, + { + "epoch": 2.611671943133624, + "grad_norm": 0.42759421702996847, + "learning_rate": 9.049733195120703e-07, + "loss": 0.2607, + "step": 5466 + }, + { + "epoch": 2.6121498118391973, + "grad_norm": 0.4208547298808128, + "learning_rate": 9.027852196621545e-07, + "loss": 0.2422, + "step": 5467 + }, + { + "epoch": 2.6126276805447706, + "grad_norm": 0.4382513214899705, + "learning_rate": 9.005996432191455e-07, + "loss": 0.2649, + "step": 5468 + }, + { + "epoch": 2.6131055492503434, + "grad_norm": 0.4914340398062526, + "learning_rate": 8.984165907892872e-07, + "loss": 0.2635, + "step": 5469 + }, + { + "epoch": 2.6135834179559168, + "grad_norm": 0.42931017707652125, + "learning_rate": 8.962360629781164e-07, + "loss": 0.2506, + "step": 5470 + }, + { + "epoch": 2.6140612866614896, + "grad_norm": 0.46339088752838625, + "learning_rate": 8.940580603904736e-07, + "loss": 0.2733, + "step": 5471 + }, + { + "epoch": 2.614539155367063, + "grad_norm": 0.48172702720835325, + "learning_rate": 8.918825836304989e-07, + "loss": 0.2797, + "step": 5472 + }, + { + "epoch": 2.6150170240726363, + "grad_norm": 0.4507180023251733, + "learning_rate": 8.89709633301632e-07, + "loss": 0.2698, + "step": 5473 + }, + { + "epoch": 2.615494892778209, + "grad_norm": 0.4542073635472215, + "learning_rate": 8.875392100066082e-07, + "loss": 0.2796, + "step": 5474 + }, + { + "epoch": 2.6159727614837824, + "grad_norm": 0.4512619657743737, + "learning_rate": 8.853713143474685e-07, + "loss": 0.2457, + "step": 5475 + }, + { + "epoch": 2.6164506301893553, + "grad_norm": 0.43470496799876945, + "learning_rate": 8.832059469255461e-07, + "loss": 0.2638, + "step": 5476 + }, + { + "epoch": 2.6169284988949286, + "grad_norm": 0.4696707255099846, + "learning_rate": 8.810431083414761e-07, + "loss": 0.2794, + "step": 5477 + }, + { + "epoch": 2.617406367600502, + "grad_norm": 0.4654756321536549, + "learning_rate": 8.788827991951932e-07, + "loss": 0.2772, + "step": 5478 + }, + { + "epoch": 2.617884236306075, + "grad_norm": 1.1496364863049888, + "learning_rate": 8.767250200859278e-07, + "loss": 0.2443, + "step": 5479 + }, + { + "epoch": 2.618362105011648, + "grad_norm": 0.5038462450910971, + "learning_rate": 8.745697716122081e-07, + "loss": 0.2584, + "step": 5480 + }, + { + "epoch": 2.618839973717221, + "grad_norm": 0.4531843698900754, + "learning_rate": 8.724170543718657e-07, + "loss": 0.2617, + "step": 5481 + }, + { + "epoch": 2.6193178424227943, + "grad_norm": 0.44068312743815813, + "learning_rate": 8.702668689620252e-07, + "loss": 0.2664, + "step": 5482 + }, + { + "epoch": 2.6197957111283676, + "grad_norm": 0.4527311915552218, + "learning_rate": 8.681192159791074e-07, + "loss": 0.2439, + "step": 5483 + }, + { + "epoch": 2.6202735798339405, + "grad_norm": 0.46007711834116877, + "learning_rate": 8.659740960188379e-07, + "loss": 0.2632, + "step": 5484 + }, + { + "epoch": 2.620751448539514, + "grad_norm": 0.4654714237772124, + "learning_rate": 8.638315096762318e-07, + "loss": 0.264, + "step": 5485 + }, + { + "epoch": 2.6212293172450867, + "grad_norm": 0.4468898821749278, + "learning_rate": 8.616914575456048e-07, + "loss": 0.259, + "step": 5486 + }, + { + "epoch": 2.62170718595066, + "grad_norm": 0.45382995132457704, + "learning_rate": 8.595539402205711e-07, + "loss": 0.26, + "step": 5487 + }, + { + "epoch": 2.6221850546562333, + "grad_norm": 0.4466920475534839, + "learning_rate": 8.574189582940407e-07, + "loss": 0.2509, + "step": 5488 + }, + { + "epoch": 2.6226629233618066, + "grad_norm": 0.4340782648580306, + "learning_rate": 8.552865123582143e-07, + "loss": 0.2648, + "step": 5489 + }, + { + "epoch": 2.6231407920673795, + "grad_norm": 0.44864783353774906, + "learning_rate": 8.531566030046035e-07, + "loss": 0.2591, + "step": 5490 + }, + { + "epoch": 2.6236186607729524, + "grad_norm": 0.48371815986744804, + "learning_rate": 8.510292308240043e-07, + "loss": 0.2819, + "step": 5491 + }, + { + "epoch": 2.6240965294785257, + "grad_norm": 0.4732488135933287, + "learning_rate": 8.489043964065091e-07, + "loss": 0.2735, + "step": 5492 + }, + { + "epoch": 2.624574398184099, + "grad_norm": 0.5366303704759061, + "learning_rate": 8.467821003415133e-07, + "loss": 0.2659, + "step": 5493 + }, + { + "epoch": 2.6250522668896723, + "grad_norm": 0.5036292178699638, + "learning_rate": 8.446623432177025e-07, + "loss": 0.2566, + "step": 5494 + }, + { + "epoch": 2.625530135595245, + "grad_norm": 0.4953780048542685, + "learning_rate": 8.425451256230588e-07, + "loss": 0.2638, + "step": 5495 + }, + { + "epoch": 2.6260080043008185, + "grad_norm": 0.4490484978515238, + "learning_rate": 8.404304481448644e-07, + "loss": 0.2801, + "step": 5496 + }, + { + "epoch": 2.6264858730063914, + "grad_norm": 0.48605177710536757, + "learning_rate": 8.383183113696914e-07, + "loss": 0.2751, + "step": 5497 + }, + { + "epoch": 2.6269637417119647, + "grad_norm": 0.42633172550873427, + "learning_rate": 8.362087158834087e-07, + "loss": 0.266, + "step": 5498 + }, + { + "epoch": 2.627441610417538, + "grad_norm": 0.4472570484490212, + "learning_rate": 8.341016622711829e-07, + "loss": 0.264, + "step": 5499 + }, + { + "epoch": 2.627919479123111, + "grad_norm": 0.45968606482345986, + "learning_rate": 8.319971511174718e-07, + "loss": 0.2736, + "step": 5500 + }, + { + "epoch": 2.628397347828684, + "grad_norm": 0.5381860245104173, + "learning_rate": 8.298951830060286e-07, + "loss": 0.2638, + "step": 5501 + }, + { + "epoch": 2.628875216534257, + "grad_norm": 0.4439356147429359, + "learning_rate": 8.277957585199059e-07, + "loss": 0.2721, + "step": 5502 + }, + { + "epoch": 2.6293530852398304, + "grad_norm": 0.46056697797714147, + "learning_rate": 8.256988782414454e-07, + "loss": 0.2774, + "step": 5503 + }, + { + "epoch": 2.6298309539454037, + "grad_norm": 0.4429290965835113, + "learning_rate": 8.23604542752281e-07, + "loss": 0.2608, + "step": 5504 + }, + { + "epoch": 2.6303088226509765, + "grad_norm": 0.46251564537739054, + "learning_rate": 8.215127526333499e-07, + "loss": 0.2772, + "step": 5505 + }, + { + "epoch": 2.63078669135655, + "grad_norm": 0.4896830023615981, + "learning_rate": 8.19423508464876e-07, + "loss": 0.2595, + "step": 5506 + }, + { + "epoch": 2.6312645600621227, + "grad_norm": 0.44334268809073074, + "learning_rate": 8.173368108263768e-07, + "loss": 0.2604, + "step": 5507 + }, + { + "epoch": 2.631742428767696, + "grad_norm": 0.47811781456110813, + "learning_rate": 8.15252660296667e-07, + "loss": 0.2692, + "step": 5508 + }, + { + "epoch": 2.6322202974732694, + "grad_norm": 0.4363044164663179, + "learning_rate": 8.131710574538543e-07, + "loss": 0.2454, + "step": 5509 + }, + { + "epoch": 2.6326981661788422, + "grad_norm": 0.449330353460876, + "learning_rate": 8.110920028753355e-07, + "loss": 0.277, + "step": 5510 + }, + { + "epoch": 2.6331760348844155, + "grad_norm": 0.4961190817267826, + "learning_rate": 8.090154971378073e-07, + "loss": 0.27, + "step": 5511 + }, + { + "epoch": 2.6336539035899884, + "grad_norm": 0.45171261984580413, + "learning_rate": 8.069415408172543e-07, + "loss": 0.2712, + "step": 5512 + }, + { + "epoch": 2.6341317722955617, + "grad_norm": 0.4434767897838476, + "learning_rate": 8.048701344889531e-07, + "loss": 0.2508, + "step": 5513 + }, + { + "epoch": 2.634609641001135, + "grad_norm": 0.4623705120513708, + "learning_rate": 8.028012787274786e-07, + "loss": 0.2844, + "step": 5514 + }, + { + "epoch": 2.6350875097067084, + "grad_norm": 0.4574657819597167, + "learning_rate": 8.007349741066939e-07, + "loss": 0.2568, + "step": 5515 + }, + { + "epoch": 2.6355653784122812, + "grad_norm": 0.4450702051786837, + "learning_rate": 7.986712211997538e-07, + "loss": 0.2759, + "step": 5516 + }, + { + "epoch": 2.636043247117854, + "grad_norm": 0.6165437153757278, + "learning_rate": 7.966100205791094e-07, + "loss": 0.2445, + "step": 5517 + }, + { + "epoch": 2.6365211158234274, + "grad_norm": 0.45144330242103814, + "learning_rate": 7.945513728164999e-07, + "loss": 0.2495, + "step": 5518 + }, + { + "epoch": 2.6369989845290007, + "grad_norm": 0.4564427161692554, + "learning_rate": 7.924952784829576e-07, + "loss": 0.2617, + "step": 5519 + }, + { + "epoch": 2.637476853234574, + "grad_norm": 0.48463913280050425, + "learning_rate": 7.904417381488083e-07, + "loss": 0.2655, + "step": 5520 + }, + { + "epoch": 2.637954721940147, + "grad_norm": 0.4516813676731921, + "learning_rate": 7.883907523836676e-07, + "loss": 0.2695, + "step": 5521 + }, + { + "epoch": 2.63843259064572, + "grad_norm": 0.4604887951307146, + "learning_rate": 7.863423217564403e-07, + "loss": 0.2529, + "step": 5522 + }, + { + "epoch": 2.638910459351293, + "grad_norm": 0.46157386421059765, + "learning_rate": 7.842964468353265e-07, + "loss": 0.2618, + "step": 5523 + }, + { + "epoch": 2.6393883280568664, + "grad_norm": 0.4539399407409506, + "learning_rate": 7.822531281878188e-07, + "loss": 0.2526, + "step": 5524 + }, + { + "epoch": 2.6398661967624397, + "grad_norm": 0.44721471315508016, + "learning_rate": 7.802123663806938e-07, + "loss": 0.2355, + "step": 5525 + }, + { + "epoch": 2.6403440654680126, + "grad_norm": 0.45922430751841153, + "learning_rate": 7.781741619800231e-07, + "loss": 0.261, + "step": 5526 + }, + { + "epoch": 2.640821934173586, + "grad_norm": 0.46053983816343896, + "learning_rate": 7.761385155511714e-07, + "loss": 0.2486, + "step": 5527 + }, + { + "epoch": 2.6412998028791588, + "grad_norm": 0.45385759627956784, + "learning_rate": 7.741054276587889e-07, + "loss": 0.2623, + "step": 5528 + }, + { + "epoch": 2.641777671584732, + "grad_norm": 0.44483157426363334, + "learning_rate": 7.72074898866817e-07, + "loss": 0.2903, + "step": 5529 + }, + { + "epoch": 2.6422555402903054, + "grad_norm": 0.46467024535160295, + "learning_rate": 7.700469297384927e-07, + "loss": 0.27, + "step": 5530 + }, + { + "epoch": 2.6427334089958783, + "grad_norm": 0.46336583237822215, + "learning_rate": 7.680215208363362e-07, + "loss": 0.2571, + "step": 5531 + }, + { + "epoch": 2.6432112777014516, + "grad_norm": 0.4589731194382133, + "learning_rate": 7.659986727221591e-07, + "loss": 0.2748, + "step": 5532 + }, + { + "epoch": 2.6436891464070245, + "grad_norm": 0.4479455221268227, + "learning_rate": 7.63978385957066e-07, + "loss": 0.2579, + "step": 5533 + }, + { + "epoch": 2.6441670151125978, + "grad_norm": 0.45680582982603707, + "learning_rate": 7.619606611014485e-07, + "loss": 0.2563, + "step": 5534 + }, + { + "epoch": 2.644644883818171, + "grad_norm": 0.4341476061003986, + "learning_rate": 7.599454987149868e-07, + "loss": 0.2633, + "step": 5535 + }, + { + "epoch": 2.645122752523744, + "grad_norm": 0.5906143297103831, + "learning_rate": 7.579328993566526e-07, + "loss": 0.287, + "step": 5536 + }, + { + "epoch": 2.6456006212293173, + "grad_norm": 0.5449600193189947, + "learning_rate": 7.559228635847049e-07, + "loss": 0.267, + "step": 5537 + }, + { + "epoch": 2.64607848993489, + "grad_norm": 0.46047021853305603, + "learning_rate": 7.5391539195669e-07, + "loss": 0.2745, + "step": 5538 + }, + { + "epoch": 2.6465563586404635, + "grad_norm": 0.4884388346418383, + "learning_rate": 7.5191048502945e-07, + "loss": 0.2482, + "step": 5539 + }, + { + "epoch": 2.6470342273460368, + "grad_norm": 0.4774890364230652, + "learning_rate": 7.499081433591071e-07, + "loss": 0.2819, + "step": 5540 + }, + { + "epoch": 2.64751209605161, + "grad_norm": 0.43773957361680527, + "learning_rate": 7.479083675010746e-07, + "loss": 0.2711, + "step": 5541 + }, + { + "epoch": 2.647989964757183, + "grad_norm": 0.4400542304427017, + "learning_rate": 7.459111580100587e-07, + "loss": 0.2584, + "step": 5542 + }, + { + "epoch": 2.6484678334627563, + "grad_norm": 0.47571447666450534, + "learning_rate": 7.439165154400485e-07, + "loss": 0.2619, + "step": 5543 + }, + { + "epoch": 2.648945702168329, + "grad_norm": 0.45317629524346903, + "learning_rate": 7.419244403443215e-07, + "loss": 0.2426, + "step": 5544 + }, + { + "epoch": 2.6494235708739025, + "grad_norm": 0.43903772149485587, + "learning_rate": 7.399349332754458e-07, + "loss": 0.2731, + "step": 5545 + }, + { + "epoch": 2.6499014395794758, + "grad_norm": 0.45753347345809436, + "learning_rate": 7.379479947852752e-07, + "loss": 0.2752, + "step": 5546 + }, + { + "epoch": 2.6503793082850486, + "grad_norm": 0.4465731511139684, + "learning_rate": 7.359636254249491e-07, + "loss": 0.2749, + "step": 5547 + }, + { + "epoch": 2.650857176990622, + "grad_norm": 0.45206339925897226, + "learning_rate": 7.339818257448994e-07, + "loss": 0.2641, + "step": 5548 + }, + { + "epoch": 2.651335045696195, + "grad_norm": 0.44301665799534984, + "learning_rate": 7.320025962948429e-07, + "loss": 0.266, + "step": 5549 + }, + { + "epoch": 2.651812914401768, + "grad_norm": 0.4510933254303754, + "learning_rate": 7.300259376237795e-07, + "loss": 0.2658, + "step": 5550 + }, + { + "epoch": 2.6522907831073415, + "grad_norm": 0.45072706019066416, + "learning_rate": 7.280518502800027e-07, + "loss": 0.276, + "step": 5551 + }, + { + "epoch": 2.6527686518129143, + "grad_norm": 0.5069678986700116, + "learning_rate": 7.260803348110879e-07, + "loss": 0.2749, + "step": 5552 + }, + { + "epoch": 2.6532465205184876, + "grad_norm": 0.4204064472652244, + "learning_rate": 7.241113917638987e-07, + "loss": 0.2482, + "step": 5553 + }, + { + "epoch": 2.6537243892240605, + "grad_norm": 0.4606764828883414, + "learning_rate": 7.22145021684586e-07, + "loss": 0.2686, + "step": 5554 + }, + { + "epoch": 2.654202257929634, + "grad_norm": 0.43007497445227916, + "learning_rate": 7.201812251185869e-07, + "loss": 0.2331, + "step": 5555 + }, + { + "epoch": 2.654680126635207, + "grad_norm": 0.43763153678237027, + "learning_rate": 7.182200026106201e-07, + "loss": 0.2569, + "step": 5556 + }, + { + "epoch": 2.65515799534078, + "grad_norm": 0.4720679073397097, + "learning_rate": 7.16261354704697e-07, + "loss": 0.2645, + "step": 5557 + }, + { + "epoch": 2.6556358640463533, + "grad_norm": 0.4342444064007902, + "learning_rate": 7.143052819441143e-07, + "loss": 0.2586, + "step": 5558 + }, + { + "epoch": 2.656113732751926, + "grad_norm": 0.4761061871295211, + "learning_rate": 7.123517848714479e-07, + "loss": 0.2519, + "step": 5559 + }, + { + "epoch": 2.6565916014574995, + "grad_norm": 0.4487477696170997, + "learning_rate": 7.104008640285642e-07, + "loss": 0.2691, + "step": 5560 + }, + { + "epoch": 2.657069470163073, + "grad_norm": 0.43960627882338654, + "learning_rate": 7.084525199566172e-07, + "loss": 0.2735, + "step": 5561 + }, + { + "epoch": 2.6575473388686457, + "grad_norm": 0.43396104580229183, + "learning_rate": 7.065067531960412e-07, + "loss": 0.2416, + "step": 5562 + }, + { + "epoch": 2.658025207574219, + "grad_norm": 0.4262202664604116, + "learning_rate": 7.045635642865555e-07, + "loss": 0.246, + "step": 5563 + }, + { + "epoch": 2.658503076279792, + "grad_norm": 0.4305745545871823, + "learning_rate": 7.026229537671692e-07, + "loss": 0.2754, + "step": 5564 + }, + { + "epoch": 2.658980944985365, + "grad_norm": 0.595294358705173, + "learning_rate": 7.006849221761736e-07, + "loss": 0.2681, + "step": 5565 + }, + { + "epoch": 2.6594588136909385, + "grad_norm": 0.4579698632907277, + "learning_rate": 6.987494700511411e-07, + "loss": 0.2691, + "step": 5566 + }, + { + "epoch": 2.659936682396512, + "grad_norm": 0.42289834526883935, + "learning_rate": 6.968165979289365e-07, + "loss": 0.2346, + "step": 5567 + }, + { + "epoch": 2.6604145511020847, + "grad_norm": 0.43997486144720366, + "learning_rate": 6.948863063457023e-07, + "loss": 0.2627, + "step": 5568 + }, + { + "epoch": 2.660892419807658, + "grad_norm": 0.5245364644243271, + "learning_rate": 6.929585958368656e-07, + "loss": 0.2744, + "step": 5569 + }, + { + "epoch": 2.661370288513231, + "grad_norm": 0.44871184731230485, + "learning_rate": 6.910334669371433e-07, + "loss": 0.2597, + "step": 5570 + }, + { + "epoch": 2.661848157218804, + "grad_norm": 0.4530485747359608, + "learning_rate": 6.891109201805291e-07, + "loss": 0.289, + "step": 5571 + }, + { + "epoch": 2.6623260259243775, + "grad_norm": 0.4459777138982337, + "learning_rate": 6.871909561003032e-07, + "loss": 0.2773, + "step": 5572 + }, + { + "epoch": 2.6628038946299504, + "grad_norm": 0.44624182034655313, + "learning_rate": 6.852735752290318e-07, + "loss": 0.2535, + "step": 5573 + }, + { + "epoch": 2.6632817633355237, + "grad_norm": 0.4408183385200215, + "learning_rate": 6.833587780985618e-07, + "loss": 0.2609, + "step": 5574 + }, + { + "epoch": 2.6637596320410966, + "grad_norm": 0.48483246105806804, + "learning_rate": 6.814465652400237e-07, + "loss": 0.2797, + "step": 5575 + }, + { + "epoch": 2.66423750074667, + "grad_norm": 0.4412804280001876, + "learning_rate": 6.795369371838323e-07, + "loss": 0.2561, + "step": 5576 + }, + { + "epoch": 2.664715369452243, + "grad_norm": 0.4401284623732288, + "learning_rate": 6.776298944596849e-07, + "loss": 0.2667, + "step": 5577 + }, + { + "epoch": 2.665193238157816, + "grad_norm": 0.49218861155591376, + "learning_rate": 6.757254375965583e-07, + "loss": 0.2639, + "step": 5578 + }, + { + "epoch": 2.6656711068633894, + "grad_norm": 0.4452641669010618, + "learning_rate": 6.738235671227212e-07, + "loss": 0.2782, + "step": 5579 + }, + { + "epoch": 2.6661489755689622, + "grad_norm": 0.4385273221330306, + "learning_rate": 6.719242835657147e-07, + "loss": 0.2631, + "step": 5580 + }, + { + "epoch": 2.6666268442745356, + "grad_norm": 0.42111306891766304, + "learning_rate": 6.700275874523665e-07, + "loss": 0.262, + "step": 5581 + }, + { + "epoch": 2.667104712980109, + "grad_norm": 0.4348075279027199, + "learning_rate": 6.681334793087879e-07, + "loss": 0.2421, + "step": 5582 + }, + { + "epoch": 2.6675825816856817, + "grad_norm": 0.5049628037720003, + "learning_rate": 6.66241959660372e-07, + "loss": 0.2847, + "step": 5583 + }, + { + "epoch": 2.668060450391255, + "grad_norm": 0.4683050522187598, + "learning_rate": 6.643530290317901e-07, + "loss": 0.2568, + "step": 5584 + }, + { + "epoch": 2.668538319096828, + "grad_norm": 0.4496767613831931, + "learning_rate": 6.62466687947001e-07, + "loss": 0.255, + "step": 5585 + }, + { + "epoch": 2.6690161878024012, + "grad_norm": 0.4383351210567021, + "learning_rate": 6.605829369292427e-07, + "loss": 0.2624, + "step": 5586 + }, + { + "epoch": 2.6694940565079746, + "grad_norm": 0.4633981201266098, + "learning_rate": 6.587017765010306e-07, + "loss": 0.2696, + "step": 5587 + }, + { + "epoch": 2.6699719252135474, + "grad_norm": 0.43636703634880375, + "learning_rate": 6.568232071841695e-07, + "loss": 0.2474, + "step": 5588 + }, + { + "epoch": 2.6704497939191207, + "grad_norm": 0.4639743358839676, + "learning_rate": 6.549472294997405e-07, + "loss": 0.2789, + "step": 5589 + }, + { + "epoch": 2.6709276626246936, + "grad_norm": 0.48119723931800046, + "learning_rate": 6.530738439681017e-07, + "loss": 0.2675, + "step": 5590 + }, + { + "epoch": 2.671405531330267, + "grad_norm": 0.4541784315517988, + "learning_rate": 6.512030511089063e-07, + "loss": 0.2663, + "step": 5591 + }, + { + "epoch": 2.6718834000358402, + "grad_norm": 0.4386851803015807, + "learning_rate": 6.493348514410735e-07, + "loss": 0.2664, + "step": 5592 + }, + { + "epoch": 2.6723612687414136, + "grad_norm": 0.443049021233755, + "learning_rate": 6.474692454828091e-07, + "loss": 0.2634, + "step": 5593 + }, + { + "epoch": 2.6728391374469864, + "grad_norm": 0.5368940349519489, + "learning_rate": 6.456062337516023e-07, + "loss": 0.2691, + "step": 5594 + }, + { + "epoch": 2.6733170061525597, + "grad_norm": 0.4362459871804461, + "learning_rate": 6.437458167642164e-07, + "loss": 0.2768, + "step": 5595 + }, + { + "epoch": 2.6737948748581326, + "grad_norm": 0.43405302870615214, + "learning_rate": 6.418879950366986e-07, + "loss": 0.2628, + "step": 5596 + }, + { + "epoch": 2.674272743563706, + "grad_norm": 0.4537944478141257, + "learning_rate": 6.400327690843777e-07, + "loss": 0.2537, + "step": 5597 + }, + { + "epoch": 2.6747506122692792, + "grad_norm": 0.44225140880189395, + "learning_rate": 6.3818013942186e-07, + "loss": 0.2623, + "step": 5598 + }, + { + "epoch": 2.675228480974852, + "grad_norm": 0.48940459598044767, + "learning_rate": 6.363301065630301e-07, + "loss": 0.262, + "step": 5599 + }, + { + "epoch": 2.6757063496804254, + "grad_norm": 0.4290257298591847, + "learning_rate": 6.344826710210584e-07, + "loss": 0.2574, + "step": 5600 + }, + { + "epoch": 2.6761842183859983, + "grad_norm": 0.44260393461554365, + "learning_rate": 6.326378333083883e-07, + "loss": 0.2613, + "step": 5601 + }, + { + "epoch": 2.6766620870915716, + "grad_norm": 0.47868079245627504, + "learning_rate": 6.307955939367449e-07, + "loss": 0.2636, + "step": 5602 + }, + { + "epoch": 2.677139955797145, + "grad_norm": 0.43232776082930185, + "learning_rate": 6.289559534171353e-07, + "loss": 0.2682, + "step": 5603 + }, + { + "epoch": 2.677617824502718, + "grad_norm": 0.44295068309351476, + "learning_rate": 6.27118912259842e-07, + "loss": 0.2532, + "step": 5604 + }, + { + "epoch": 2.678095693208291, + "grad_norm": 0.43112065814472633, + "learning_rate": 6.252844709744255e-07, + "loss": 0.2847, + "step": 5605 + }, + { + "epoch": 2.678573561913864, + "grad_norm": 0.44589686576732196, + "learning_rate": 6.234526300697308e-07, + "loss": 0.2586, + "step": 5606 + }, + { + "epoch": 2.6790514306194373, + "grad_norm": 0.46936007169833993, + "learning_rate": 6.216233900538782e-07, + "loss": 0.262, + "step": 5607 + }, + { + "epoch": 2.6795292993250106, + "grad_norm": 0.45133407836159795, + "learning_rate": 6.197967514342628e-07, + "loss": 0.2667, + "step": 5608 + }, + { + "epoch": 2.6800071680305835, + "grad_norm": 0.4988501134193684, + "learning_rate": 6.179727147175663e-07, + "loss": 0.2666, + "step": 5609 + }, + { + "epoch": 2.680485036736157, + "grad_norm": 0.43159011005236786, + "learning_rate": 6.161512804097436e-07, + "loss": 0.2696, + "step": 5610 + }, + { + "epoch": 2.6809629054417297, + "grad_norm": 0.4392692032042625, + "learning_rate": 6.143324490160252e-07, + "loss": 0.2564, + "step": 5611 + }, + { + "epoch": 2.681440774147303, + "grad_norm": 0.5262468308872646, + "learning_rate": 6.125162210409263e-07, + "loss": 0.2881, + "step": 5612 + }, + { + "epoch": 2.6819186428528763, + "grad_norm": 0.4429786878821737, + "learning_rate": 6.107025969882363e-07, + "loss": 0.2745, + "step": 5613 + }, + { + "epoch": 2.682396511558449, + "grad_norm": 0.4290197449049834, + "learning_rate": 6.088915773610194e-07, + "loss": 0.2735, + "step": 5614 + }, + { + "epoch": 2.6828743802640225, + "grad_norm": 0.44057175272421367, + "learning_rate": 6.070831626616236e-07, + "loss": 0.2694, + "step": 5615 + }, + { + "epoch": 2.6833522489695953, + "grad_norm": 0.46685871466805356, + "learning_rate": 6.052773533916712e-07, + "loss": 0.2663, + "step": 5616 + }, + { + "epoch": 2.6838301176751687, + "grad_norm": 0.44112906840605176, + "learning_rate": 6.034741500520591e-07, + "loss": 0.2731, + "step": 5617 + }, + { + "epoch": 2.684307986380742, + "grad_norm": 0.4480978993764805, + "learning_rate": 6.016735531429674e-07, + "loss": 0.2638, + "step": 5618 + }, + { + "epoch": 2.6847858550863153, + "grad_norm": 0.4757023777345707, + "learning_rate": 5.998755631638486e-07, + "loss": 0.2625, + "step": 5619 + }, + { + "epoch": 2.685263723791888, + "grad_norm": 0.43772130701917733, + "learning_rate": 5.980801806134318e-07, + "loss": 0.2628, + "step": 5620 + }, + { + "epoch": 2.6857415924974615, + "grad_norm": 0.4408707212239334, + "learning_rate": 5.962874059897273e-07, + "loss": 0.259, + "step": 5621 + }, + { + "epoch": 2.6862194612030343, + "grad_norm": 0.44875603266564296, + "learning_rate": 5.944972397900173e-07, + "loss": 0.252, + "step": 5622 + }, + { + "epoch": 2.6866973299086077, + "grad_norm": 0.7243707606230397, + "learning_rate": 5.927096825108614e-07, + "loss": 0.2644, + "step": 5623 + }, + { + "epoch": 2.687175198614181, + "grad_norm": 0.4437660770119595, + "learning_rate": 5.909247346480973e-07, + "loss": 0.2584, + "step": 5624 + }, + { + "epoch": 2.687653067319754, + "grad_norm": 0.4428332177909088, + "learning_rate": 5.891423966968413e-07, + "loss": 0.2785, + "step": 5625 + }, + { + "epoch": 2.688130936025327, + "grad_norm": 0.5234803476776241, + "learning_rate": 5.873626691514789e-07, + "loss": 0.249, + "step": 5626 + }, + { + "epoch": 2.6886088047309, + "grad_norm": 0.47236649678503034, + "learning_rate": 5.855855525056742e-07, + "loss": 0.2761, + "step": 5627 + }, + { + "epoch": 2.6890866734364733, + "grad_norm": 0.45396864953408195, + "learning_rate": 5.838110472523728e-07, + "loss": 0.2694, + "step": 5628 + }, + { + "epoch": 2.6895645421420467, + "grad_norm": 0.44956507553613845, + "learning_rate": 5.820391538837866e-07, + "loss": 0.2608, + "step": 5629 + }, + { + "epoch": 2.6900424108476195, + "grad_norm": 0.4401864408494058, + "learning_rate": 5.80269872891408e-07, + "loss": 0.2655, + "step": 5630 + }, + { + "epoch": 2.690520279553193, + "grad_norm": 0.4886914370387613, + "learning_rate": 5.785032047660077e-07, + "loss": 0.2549, + "step": 5631 + }, + { + "epoch": 2.6909981482587657, + "grad_norm": 0.44296980817168874, + "learning_rate": 5.76739149997625e-07, + "loss": 0.247, + "step": 5632 + }, + { + "epoch": 2.691476016964339, + "grad_norm": 0.44550057207332994, + "learning_rate": 5.749777090755781e-07, + "loss": 0.2618, + "step": 5633 + }, + { + "epoch": 2.6919538856699123, + "grad_norm": 0.43268666197260663, + "learning_rate": 5.73218882488461e-07, + "loss": 0.2688, + "step": 5634 + }, + { + "epoch": 2.692431754375485, + "grad_norm": 0.4461412386675045, + "learning_rate": 5.714626707241411e-07, + "loss": 0.2718, + "step": 5635 + }, + { + "epoch": 2.6929096230810585, + "grad_norm": 0.4558906789502105, + "learning_rate": 5.697090742697576e-07, + "loss": 0.2792, + "step": 5636 + }, + { + "epoch": 2.6933874917866314, + "grad_norm": 0.4744314781330036, + "learning_rate": 5.679580936117312e-07, + "loss": 0.2591, + "step": 5637 + }, + { + "epoch": 2.6938653604922047, + "grad_norm": 0.5192870350309746, + "learning_rate": 5.662097292357505e-07, + "loss": 0.2734, + "step": 5638 + }, + { + "epoch": 2.694343229197778, + "grad_norm": 0.4435569578566844, + "learning_rate": 5.644639816267817e-07, + "loss": 0.2666, + "step": 5639 + }, + { + "epoch": 2.694821097903351, + "grad_norm": 0.43856696864544303, + "learning_rate": 5.627208512690641e-07, + "loss": 0.2705, + "step": 5640 + }, + { + "epoch": 2.695298966608924, + "grad_norm": 0.45553085802947646, + "learning_rate": 5.609803386461133e-07, + "loss": 0.2656, + "step": 5641 + }, + { + "epoch": 2.695776835314497, + "grad_norm": 0.4666048945211158, + "learning_rate": 5.59242444240713e-07, + "loss": 0.2738, + "step": 5642 + }, + { + "epoch": 2.6962547040200704, + "grad_norm": 0.43113345766045935, + "learning_rate": 5.575071685349276e-07, + "loss": 0.2572, + "step": 5643 + }, + { + "epoch": 2.6967325727256437, + "grad_norm": 1.125132833480618, + "learning_rate": 5.55774512010091e-07, + "loss": 0.2558, + "step": 5644 + }, + { + "epoch": 2.697210441431217, + "grad_norm": 0.608140367809618, + "learning_rate": 5.5404447514681e-07, + "loss": 0.2737, + "step": 5645 + }, + { + "epoch": 2.69768831013679, + "grad_norm": 0.4533877869576038, + "learning_rate": 5.523170584249704e-07, + "loss": 0.2498, + "step": 5646 + }, + { + "epoch": 2.698166178842363, + "grad_norm": 0.4415778575924575, + "learning_rate": 5.505922623237237e-07, + "loss": 0.2639, + "step": 5647 + }, + { + "epoch": 2.698644047547936, + "grad_norm": 0.42774906759972836, + "learning_rate": 5.488700873214969e-07, + "loss": 0.2692, + "step": 5648 + }, + { + "epoch": 2.6991219162535094, + "grad_norm": 0.5330417311187563, + "learning_rate": 5.471505338959948e-07, + "loss": 0.2707, + "step": 5649 + }, + { + "epoch": 2.6995997849590827, + "grad_norm": 0.5461645932501131, + "learning_rate": 5.45433602524188e-07, + "loss": 0.2588, + "step": 5650 + }, + { + "epoch": 2.7000776536646556, + "grad_norm": 0.4242076160737116, + "learning_rate": 5.437192936823243e-07, + "loss": 0.2651, + "step": 5651 + }, + { + "epoch": 2.700555522370229, + "grad_norm": 0.4676634194490461, + "learning_rate": 5.420076078459236e-07, + "loss": 0.27, + "step": 5652 + }, + { + "epoch": 2.7010333910758018, + "grad_norm": 0.43263645783838944, + "learning_rate": 5.402985454897758e-07, + "loss": 0.2589, + "step": 5653 + }, + { + "epoch": 2.701511259781375, + "grad_norm": 0.45382249392362367, + "learning_rate": 5.385921070879441e-07, + "loss": 0.2487, + "step": 5654 + }, + { + "epoch": 2.7019891284869484, + "grad_norm": 0.49441942637541253, + "learning_rate": 5.368882931137675e-07, + "loss": 0.2565, + "step": 5655 + }, + { + "epoch": 2.7024669971925213, + "grad_norm": 0.4405834049292135, + "learning_rate": 5.351871040398515e-07, + "loss": 0.2744, + "step": 5656 + }, + { + "epoch": 2.7029448658980946, + "grad_norm": 0.5559524037795076, + "learning_rate": 5.33488540338074e-07, + "loss": 0.2583, + "step": 5657 + }, + { + "epoch": 2.7034227346036674, + "grad_norm": 0.43260600511156877, + "learning_rate": 5.317926024795906e-07, + "loss": 0.2572, + "step": 5658 + }, + { + "epoch": 2.7039006033092408, + "grad_norm": 0.4394492413785682, + "learning_rate": 5.300992909348234e-07, + "loss": 0.2545, + "step": 5659 + }, + { + "epoch": 2.704378472014814, + "grad_norm": 0.4362896925210051, + "learning_rate": 5.284086061734672e-07, + "loss": 0.2733, + "step": 5660 + }, + { + "epoch": 2.704856340720387, + "grad_norm": 0.5093789250306926, + "learning_rate": 5.267205486644866e-07, + "loss": 0.2701, + "step": 5661 + }, + { + "epoch": 2.7053342094259603, + "grad_norm": 0.48183431324608156, + "learning_rate": 5.250351188761204e-07, + "loss": 0.2541, + "step": 5662 + }, + { + "epoch": 2.705812078131533, + "grad_norm": 0.4409302418627837, + "learning_rate": 5.23352317275877e-07, + "loss": 0.2602, + "step": 5663 + }, + { + "epoch": 2.7062899468371064, + "grad_norm": 0.46481073306379156, + "learning_rate": 5.21672144330535e-07, + "loss": 0.2668, + "step": 5664 + }, + { + "epoch": 2.7067678155426798, + "grad_norm": 0.573145605718102, + "learning_rate": 5.199946005061462e-07, + "loss": 0.248, + "step": 5665 + }, + { + "epoch": 2.7072456842482526, + "grad_norm": 0.4458113575270228, + "learning_rate": 5.183196862680307e-07, + "loss": 0.2495, + "step": 5666 + }, + { + "epoch": 2.707723552953826, + "grad_norm": 0.566881416643556, + "learning_rate": 5.166474020807788e-07, + "loss": 0.2699, + "step": 5667 + }, + { + "epoch": 2.708201421659399, + "grad_norm": 0.49148904309220365, + "learning_rate": 5.149777484082552e-07, + "loss": 0.2686, + "step": 5668 + }, + { + "epoch": 2.708679290364972, + "grad_norm": 0.4396165691920815, + "learning_rate": 5.133107257135917e-07, + "loss": 0.2695, + "step": 5669 + }, + { + "epoch": 2.7091571590705454, + "grad_norm": 0.4733614560975121, + "learning_rate": 5.116463344591893e-07, + "loss": 0.2687, + "step": 5670 + }, + { + "epoch": 2.7096350277761188, + "grad_norm": 0.4756203545300059, + "learning_rate": 5.099845751067234e-07, + "loss": 0.2652, + "step": 5671 + }, + { + "epoch": 2.7101128964816916, + "grad_norm": 0.4877805042234815, + "learning_rate": 5.083254481171352e-07, + "loss": 0.2786, + "step": 5672 + }, + { + "epoch": 2.710590765187265, + "grad_norm": 0.43232222654764546, + "learning_rate": 5.066689539506353e-07, + "loss": 0.2713, + "step": 5673 + }, + { + "epoch": 2.711068633892838, + "grad_norm": 0.44816531622174133, + "learning_rate": 5.050150930667108e-07, + "loss": 0.2755, + "step": 5674 + }, + { + "epoch": 2.711546502598411, + "grad_norm": 0.4627071189494935, + "learning_rate": 5.033638659241102e-07, + "loss": 0.2574, + "step": 5675 + }, + { + "epoch": 2.7120243713039844, + "grad_norm": 0.433698330766837, + "learning_rate": 5.017152729808539e-07, + "loss": 0.2659, + "step": 5676 + }, + { + "epoch": 2.7125022400095573, + "grad_norm": 0.4537219642809981, + "learning_rate": 5.000693146942359e-07, + "loss": 0.2628, + "step": 5677 + }, + { + "epoch": 2.7129801087151306, + "grad_norm": 0.7128807971801155, + "learning_rate": 4.984259915208134e-07, + "loss": 0.2826, + "step": 5678 + }, + { + "epoch": 2.7134579774207035, + "grad_norm": 0.5197321996631703, + "learning_rate": 4.96785303916415e-07, + "loss": 0.2838, + "step": 5679 + }, + { + "epoch": 2.713935846126277, + "grad_norm": 0.43144572298262895, + "learning_rate": 4.951472523361401e-07, + "loss": 0.2493, + "step": 5680 + }, + { + "epoch": 2.71441371483185, + "grad_norm": 0.47874783163343665, + "learning_rate": 4.935118372343561e-07, + "loss": 0.2728, + "step": 5681 + }, + { + "epoch": 2.714891583537423, + "grad_norm": 0.4506171864407369, + "learning_rate": 4.918790590646938e-07, + "loss": 0.24, + "step": 5682 + }, + { + "epoch": 2.7153694522429963, + "grad_norm": 0.4365104968626452, + "learning_rate": 4.90248918280063e-07, + "loss": 0.2807, + "step": 5683 + }, + { + "epoch": 2.715847320948569, + "grad_norm": 0.43550955708683503, + "learning_rate": 4.88621415332633e-07, + "loss": 0.2549, + "step": 5684 + }, + { + "epoch": 2.7163251896541425, + "grad_norm": 0.43640296493089487, + "learning_rate": 4.869965506738416e-07, + "loss": 0.2524, + "step": 5685 + }, + { + "epoch": 2.716803058359716, + "grad_norm": 0.466624117863505, + "learning_rate": 4.85374324754404e-07, + "loss": 0.2825, + "step": 5686 + }, + { + "epoch": 2.7172809270652887, + "grad_norm": 0.4280281243864085, + "learning_rate": 4.837547380242924e-07, + "loss": 0.2718, + "step": 5687 + }, + { + "epoch": 2.717758795770862, + "grad_norm": 0.48966203018348153, + "learning_rate": 4.821377909327518e-07, + "loss": 0.2496, + "step": 5688 + }, + { + "epoch": 2.718236664476435, + "grad_norm": 0.43749253431245166, + "learning_rate": 4.805234839282968e-07, + "loss": 0.2738, + "step": 5689 + }, + { + "epoch": 2.718714533182008, + "grad_norm": 0.457351266972444, + "learning_rate": 4.789118174587071e-07, + "loss": 0.2867, + "step": 5690 + }, + { + "epoch": 2.7191924018875815, + "grad_norm": 0.4324107322349869, + "learning_rate": 4.773027919710272e-07, + "loss": 0.2698, + "step": 5691 + }, + { + "epoch": 2.719670270593155, + "grad_norm": 0.4372967375001939, + "learning_rate": 4.756964079115778e-07, + "loss": 0.2454, + "step": 5692 + }, + { + "epoch": 2.7201481392987277, + "grad_norm": 0.44502655752254144, + "learning_rate": 4.740926657259393e-07, + "loss": 0.27, + "step": 5693 + }, + { + "epoch": 2.7206260080043005, + "grad_norm": 0.4773080512151135, + "learning_rate": 4.7249156585895904e-07, + "loss": 0.273, + "step": 5694 + }, + { + "epoch": 2.721103876709874, + "grad_norm": 0.43589564759727806, + "learning_rate": 4.7089310875475856e-07, + "loss": 0.2702, + "step": 5695 + }, + { + "epoch": 2.721581745415447, + "grad_norm": 0.435891297678135, + "learning_rate": 4.692972948567187e-07, + "loss": 0.2721, + "step": 5696 + }, + { + "epoch": 2.7220596141210205, + "grad_norm": 0.4556795601807038, + "learning_rate": 4.677041246074887e-07, + "loss": 0.268, + "step": 5697 + }, + { + "epoch": 2.7225374828265934, + "grad_norm": 0.43816241757476965, + "learning_rate": 4.661135984489895e-07, + "loss": 0.2822, + "step": 5698 + }, + { + "epoch": 2.7230153515321667, + "grad_norm": 0.45203308436452894, + "learning_rate": 4.645257168224038e-07, + "loss": 0.2646, + "step": 5699 + }, + { + "epoch": 2.7234932202377395, + "grad_norm": 0.43051792277467377, + "learning_rate": 4.6294048016817917e-07, + "loss": 0.2576, + "step": 5700 + }, + { + "epoch": 2.723971088943313, + "grad_norm": 0.4279847606367255, + "learning_rate": 4.6135788892603615e-07, + "loss": 0.2505, + "step": 5701 + }, + { + "epoch": 2.724448957648886, + "grad_norm": 0.43586951533967105, + "learning_rate": 4.5977794353495584e-07, + "loss": 0.2667, + "step": 5702 + }, + { + "epoch": 2.724926826354459, + "grad_norm": 0.4436479201943215, + "learning_rate": 4.582006444331866e-07, + "loss": 0.2584, + "step": 5703 + }, + { + "epoch": 2.7254046950600324, + "grad_norm": 0.46315434030840746, + "learning_rate": 4.56625992058245e-07, + "loss": 0.276, + "step": 5704 + }, + { + "epoch": 2.7258825637656052, + "grad_norm": 0.46920092210161374, + "learning_rate": 4.550539868469106e-07, + "loss": 0.2829, + "step": 5705 + }, + { + "epoch": 2.7263604324711785, + "grad_norm": 0.49465749907516765, + "learning_rate": 4.5348462923523017e-07, + "loss": 0.2528, + "step": 5706 + }, + { + "epoch": 2.726838301176752, + "grad_norm": 0.5070139510002986, + "learning_rate": 4.519179196585166e-07, + "loss": 0.2578, + "step": 5707 + }, + { + "epoch": 2.7273161698823247, + "grad_norm": 0.4548552543610518, + "learning_rate": 4.5035385855134674e-07, + "loss": 0.2647, + "step": 5708 + }, + { + "epoch": 2.727794038587898, + "grad_norm": 0.46825179549102325, + "learning_rate": 4.4879244634756125e-07, + "loss": 0.2892, + "step": 5709 + }, + { + "epoch": 2.728271907293471, + "grad_norm": 0.42982413711336054, + "learning_rate": 4.4723368348027375e-07, + "loss": 0.2394, + "step": 5710 + }, + { + "epoch": 2.7287497759990442, + "grad_norm": 0.5254320930661824, + "learning_rate": 4.4567757038185387e-07, + "loss": 0.2501, + "step": 5711 + }, + { + "epoch": 2.7292276447046175, + "grad_norm": 0.43831906460766556, + "learning_rate": 4.4412410748393973e-07, + "loss": 0.2721, + "step": 5712 + }, + { + "epoch": 2.7297055134101904, + "grad_norm": 0.44601508670964723, + "learning_rate": 4.4257329521743554e-07, + "loss": 0.2687, + "step": 5713 + }, + { + "epoch": 2.7301833821157637, + "grad_norm": 0.4244997363062748, + "learning_rate": 4.4102513401251047e-07, + "loss": 0.2593, + "step": 5714 + }, + { + "epoch": 2.7306612508213366, + "grad_norm": 0.43283927918633053, + "learning_rate": 4.394796242985933e-07, + "loss": 0.2397, + "step": 5715 + }, + { + "epoch": 2.73113911952691, + "grad_norm": 0.4532562420987665, + "learning_rate": 4.3793676650438545e-07, + "loss": 0.2622, + "step": 5716 + }, + { + "epoch": 2.7316169882324832, + "grad_norm": 0.4699173683543495, + "learning_rate": 4.363965610578469e-07, + "loss": 0.2618, + "step": 5717 + }, + { + "epoch": 2.7320948569380565, + "grad_norm": 0.44230687360379534, + "learning_rate": 4.348590083862025e-07, + "loss": 0.2777, + "step": 5718 + }, + { + "epoch": 2.7325727256436294, + "grad_norm": 0.42842808714224573, + "learning_rate": 4.3332410891594346e-07, + "loss": 0.2648, + "step": 5719 + }, + { + "epoch": 2.7330505943492027, + "grad_norm": 0.4448453155274851, + "learning_rate": 4.317918630728235e-07, + "loss": 0.2507, + "step": 5720 + }, + { + "epoch": 2.7335284630547756, + "grad_norm": 0.45643300317990476, + "learning_rate": 4.302622712818594e-07, + "loss": 0.2588, + "step": 5721 + }, + { + "epoch": 2.734006331760349, + "grad_norm": 0.4357425686343285, + "learning_rate": 4.28735333967335e-07, + "loss": 0.2761, + "step": 5722 + }, + { + "epoch": 2.734484200465922, + "grad_norm": 0.4304752980285673, + "learning_rate": 4.2721105155279496e-07, + "loss": 0.2722, + "step": 5723 + }, + { + "epoch": 2.734962069171495, + "grad_norm": 0.43442802885881027, + "learning_rate": 4.2568942446104657e-07, + "loss": 0.2603, + "step": 5724 + }, + { + "epoch": 2.7354399378770684, + "grad_norm": 0.47160630821995014, + "learning_rate": 4.241704531141633e-07, + "loss": 0.2688, + "step": 5725 + }, + { + "epoch": 2.7359178065826413, + "grad_norm": 0.4501959887057499, + "learning_rate": 4.2265413793348363e-07, + "loss": 0.2668, + "step": 5726 + }, + { + "epoch": 2.7363956752882146, + "grad_norm": 0.4175734384990899, + "learning_rate": 4.2114047933960453e-07, + "loss": 0.2606, + "step": 5727 + }, + { + "epoch": 2.736873543993788, + "grad_norm": 0.45694768736227065, + "learning_rate": 4.196294777523868e-07, + "loss": 0.2674, + "step": 5728 + }, + { + "epoch": 2.7373514126993608, + "grad_norm": 0.4673946349847667, + "learning_rate": 4.181211335909585e-07, + "loss": 0.279, + "step": 5729 + }, + { + "epoch": 2.737829281404934, + "grad_norm": 0.44503813090400046, + "learning_rate": 4.166154472737061e-07, + "loss": 0.2631, + "step": 5730 + }, + { + "epoch": 2.738307150110507, + "grad_norm": 0.4340503304353276, + "learning_rate": 4.151124192182798e-07, + "loss": 0.2603, + "step": 5731 + }, + { + "epoch": 2.7387850188160803, + "grad_norm": 0.48847952518429966, + "learning_rate": 4.136120498415952e-07, + "loss": 0.2624, + "step": 5732 + }, + { + "epoch": 2.7392628875216536, + "grad_norm": 0.5059660565626302, + "learning_rate": 4.1211433955982707e-07, + "loss": 0.2595, + "step": 5733 + }, + { + "epoch": 2.7397407562272265, + "grad_norm": 0.46797353640218137, + "learning_rate": 4.1061928878841193e-07, + "loss": 0.237, + "step": 5734 + }, + { + "epoch": 2.7402186249327998, + "grad_norm": 0.45219798246778975, + "learning_rate": 4.091268979420537e-07, + "loss": 0.2592, + "step": 5735 + }, + { + "epoch": 2.7406964936383726, + "grad_norm": 0.4447422926180344, + "learning_rate": 4.0763716743471346e-07, + "loss": 0.2664, + "step": 5736 + }, + { + "epoch": 2.741174362343946, + "grad_norm": 0.666905018898308, + "learning_rate": 4.061500976796162e-07, + "loss": 0.2797, + "step": 5737 + }, + { + "epoch": 2.7416522310495193, + "grad_norm": 0.4393683214525845, + "learning_rate": 4.0466568908925087e-07, + "loss": 0.2648, + "step": 5738 + }, + { + "epoch": 2.742130099755092, + "grad_norm": 0.4571492436490682, + "learning_rate": 4.031839420753636e-07, + "loss": 0.2714, + "step": 5739 + }, + { + "epoch": 2.7426079684606655, + "grad_norm": 0.4227610872002852, + "learning_rate": 4.0170485704896453e-07, + "loss": 0.2658, + "step": 5740 + }, + { + "epoch": 2.7430858371662383, + "grad_norm": 0.4436196880313655, + "learning_rate": 4.002284344203289e-07, + "loss": 0.272, + "step": 5741 + }, + { + "epoch": 2.7435637058718116, + "grad_norm": 0.5126894803406438, + "learning_rate": 3.987546745989879e-07, + "loss": 0.2627, + "step": 5742 + }, + { + "epoch": 2.744041574577385, + "grad_norm": 0.5064687140304711, + "learning_rate": 3.9728357799373675e-07, + "loss": 0.2745, + "step": 5743 + }, + { + "epoch": 2.7445194432829583, + "grad_norm": 0.5135425690883777, + "learning_rate": 3.958151450126324e-07, + "loss": 0.2371, + "step": 5744 + }, + { + "epoch": 2.744997311988531, + "grad_norm": 0.45220414835144906, + "learning_rate": 3.943493760629924e-07, + "loss": 0.2614, + "step": 5745 + }, + { + "epoch": 2.7454751806941045, + "grad_norm": 0.43240096665660527, + "learning_rate": 3.928862715513937e-07, + "loss": 0.2611, + "step": 5746 + }, + { + "epoch": 2.7459530493996773, + "grad_norm": 0.43526686209905135, + "learning_rate": 3.914258318836772e-07, + "loss": 0.2631, + "step": 5747 + }, + { + "epoch": 2.7464309181052506, + "grad_norm": 0.47983339284389653, + "learning_rate": 3.8996805746494336e-07, + "loss": 0.2501, + "step": 5748 + }, + { + "epoch": 2.746908786810824, + "grad_norm": 0.48766854570737883, + "learning_rate": 3.885129486995498e-07, + "loss": 0.2514, + "step": 5749 + }, + { + "epoch": 2.747386655516397, + "grad_norm": 0.4342714380611848, + "learning_rate": 3.8706050599112363e-07, + "loss": 0.2641, + "step": 5750 + }, + { + "epoch": 2.74786452422197, + "grad_norm": 0.4493794754550567, + "learning_rate": 3.8561072974254267e-07, + "loss": 0.2584, + "step": 5751 + }, + { + "epoch": 2.748342392927543, + "grad_norm": 0.5181439015969199, + "learning_rate": 3.8416362035594847e-07, + "loss": 0.2698, + "step": 5752 + }, + { + "epoch": 2.7488202616331163, + "grad_norm": 0.6154335078519316, + "learning_rate": 3.827191782327477e-07, + "loss": 0.2791, + "step": 5753 + }, + { + "epoch": 2.7492981303386896, + "grad_norm": 0.43729212520698946, + "learning_rate": 3.812774037736011e-07, + "loss": 0.2656, + "step": 5754 + }, + { + "epoch": 2.7497759990442625, + "grad_norm": 0.45813669988799843, + "learning_rate": 3.798382973784298e-07, + "loss": 0.2571, + "step": 5755 + }, + { + "epoch": 2.750253867749836, + "grad_norm": 0.4464533961762353, + "learning_rate": 3.7840185944641894e-07, + "loss": 0.2612, + "step": 5756 + }, + { + "epoch": 2.7507317364554087, + "grad_norm": 0.4320135449118599, + "learning_rate": 3.769680903760109e-07, + "loss": 0.2753, + "step": 5757 + }, + { + "epoch": 2.751209605160982, + "grad_norm": 0.4585589926522614, + "learning_rate": 3.7553699056490536e-07, + "loss": 0.2537, + "step": 5758 + }, + { + "epoch": 2.7516874738665553, + "grad_norm": 1.3126054515046628, + "learning_rate": 3.7410856041006694e-07, + "loss": 0.2696, + "step": 5759 + }, + { + "epoch": 2.752165342572128, + "grad_norm": 0.48621630808612154, + "learning_rate": 3.7268280030771655e-07, + "loss": 0.2472, + "step": 5760 + }, + { + "epoch": 2.7526432112777015, + "grad_norm": 0.4490574992798825, + "learning_rate": 3.712597106533344e-07, + "loss": 0.2572, + "step": 5761 + }, + { + "epoch": 2.7531210799832744, + "grad_norm": 0.4203205745967875, + "learning_rate": 3.698392918416593e-07, + "loss": 0.2539, + "step": 5762 + }, + { + "epoch": 2.7535989486888477, + "grad_norm": 0.4656550654291942, + "learning_rate": 3.684215442666927e-07, + "loss": 0.2693, + "step": 5763 + }, + { + "epoch": 2.754076817394421, + "grad_norm": 0.45928373313115284, + "learning_rate": 3.670064683216912e-07, + "loss": 0.2512, + "step": 5764 + }, + { + "epoch": 2.754554686099994, + "grad_norm": 0.45661432116739664, + "learning_rate": 3.655940643991718e-07, + "loss": 0.2477, + "step": 5765 + }, + { + "epoch": 2.755032554805567, + "grad_norm": 0.4323656387442569, + "learning_rate": 3.641843328909123e-07, + "loss": 0.2485, + "step": 5766 + }, + { + "epoch": 2.75551042351114, + "grad_norm": 0.4525029456321567, + "learning_rate": 3.6277727418794537e-07, + "loss": 0.2465, + "step": 5767 + }, + { + "epoch": 2.7559882922167134, + "grad_norm": 0.45847229829882746, + "learning_rate": 3.613728886805634e-07, + "loss": 0.2678, + "step": 5768 + }, + { + "epoch": 2.7564661609222867, + "grad_norm": 0.4354223231831446, + "learning_rate": 3.599711767583214e-07, + "loss": 0.2592, + "step": 5769 + }, + { + "epoch": 2.75694402962786, + "grad_norm": 0.45784981558251925, + "learning_rate": 3.585721388100283e-07, + "loss": 0.2722, + "step": 5770 + }, + { + "epoch": 2.757421898333433, + "grad_norm": 0.7302615178327121, + "learning_rate": 3.5717577522375037e-07, + "loss": 0.2564, + "step": 5771 + }, + { + "epoch": 2.757899767039006, + "grad_norm": 0.43796173111069236, + "learning_rate": 3.557820863868167e-07, + "loss": 0.2622, + "step": 5772 + }, + { + "epoch": 2.758377635744579, + "grad_norm": 0.462073755028673, + "learning_rate": 3.543910726858113e-07, + "loss": 0.2728, + "step": 5773 + }, + { + "epoch": 2.7588555044501524, + "grad_norm": 0.5233980811044266, + "learning_rate": 3.5300273450657564e-07, + "loss": 0.2708, + "step": 5774 + }, + { + "epoch": 2.7593333731557257, + "grad_norm": 0.5993407016424378, + "learning_rate": 3.516170722342127e-07, + "loss": 0.2667, + "step": 5775 + }, + { + "epoch": 2.7598112418612986, + "grad_norm": 0.425632476030798, + "learning_rate": 3.5023408625307844e-07, + "loss": 0.2765, + "step": 5776 + }, + { + "epoch": 2.760289110566872, + "grad_norm": 0.4264176424109537, + "learning_rate": 3.488537769467892e-07, + "loss": 0.2528, + "step": 5777 + }, + { + "epoch": 2.7607669792724447, + "grad_norm": 0.4322253085066871, + "learning_rate": 3.4747614469822e-07, + "loss": 0.271, + "step": 5778 + }, + { + "epoch": 2.761244847978018, + "grad_norm": 0.44062241649420975, + "learning_rate": 3.461011898895017e-07, + "loss": 0.2597, + "step": 5779 + }, + { + "epoch": 2.7617227166835914, + "grad_norm": 0.4288740826814237, + "learning_rate": 3.4472891290201927e-07, + "loss": 0.2659, + "step": 5780 + }, + { + "epoch": 2.7622005853891642, + "grad_norm": 0.441917086276448, + "learning_rate": 3.4335931411642153e-07, + "loss": 0.2716, + "step": 5781 + }, + { + "epoch": 2.7626784540947376, + "grad_norm": 0.7227881237165457, + "learning_rate": 3.419923939126102e-07, + "loss": 0.2642, + "step": 5782 + }, + { + "epoch": 2.7631563228003104, + "grad_norm": 0.4262654766844021, + "learning_rate": 3.4062815266974304e-07, + "loss": 0.2533, + "step": 5783 + }, + { + "epoch": 2.7636341915058837, + "grad_norm": 0.44298432828730744, + "learning_rate": 3.3926659076623846e-07, + "loss": 0.278, + "step": 5784 + }, + { + "epoch": 2.764112060211457, + "grad_norm": 0.43947410330039516, + "learning_rate": 3.3790770857976995e-07, + "loss": 0.2623, + "step": 5785 + }, + { + "epoch": 2.76458992891703, + "grad_norm": 0.42339510096523475, + "learning_rate": 3.3655150648726485e-07, + "loss": 0.279, + "step": 5786 + }, + { + "epoch": 2.7650677976226032, + "grad_norm": 0.45899672855736107, + "learning_rate": 3.351979848649134e-07, + "loss": 0.257, + "step": 5787 + }, + { + "epoch": 2.765545666328176, + "grad_norm": 0.45322359078395513, + "learning_rate": 3.3384714408815745e-07, + "loss": 0.2552, + "step": 5788 + }, + { + "epoch": 2.7660235350337494, + "grad_norm": 0.42446680964479644, + "learning_rate": 3.324989845316928e-07, + "loss": 0.2699, + "step": 5789 + }, + { + "epoch": 2.7665014037393227, + "grad_norm": 0.42230616756573375, + "learning_rate": 3.3115350656948043e-07, + "loss": 0.2662, + "step": 5790 + }, + { + "epoch": 2.7669792724448956, + "grad_norm": 0.5214966776605962, + "learning_rate": 3.298107105747295e-07, + "loss": 0.2722, + "step": 5791 + }, + { + "epoch": 2.767457141150469, + "grad_norm": 0.4648415035471319, + "learning_rate": 3.2847059691990644e-07, + "loss": 0.2555, + "step": 5792 + }, + { + "epoch": 2.767935009856042, + "grad_norm": 0.4484544390935882, + "learning_rate": 3.271331659767385e-07, + "loss": 0.2464, + "step": 5793 + }, + { + "epoch": 2.768412878561615, + "grad_norm": 0.5447610391648413, + "learning_rate": 3.257984181162044e-07, + "loss": 0.2698, + "step": 5794 + }, + { + "epoch": 2.7688907472671884, + "grad_norm": 0.4602729737342174, + "learning_rate": 3.2446635370853686e-07, + "loss": 0.2563, + "step": 5795 + }, + { + "epoch": 2.7693686159727617, + "grad_norm": 0.4937625519242985, + "learning_rate": 3.2313697312323143e-07, + "loss": 0.2561, + "step": 5796 + }, + { + "epoch": 2.7698464846783346, + "grad_norm": 0.47243649147806527, + "learning_rate": 3.218102767290332e-07, + "loss": 0.2559, + "step": 5797 + }, + { + "epoch": 2.770324353383908, + "grad_norm": 0.5039359434080682, + "learning_rate": 3.204862648939422e-07, + "loss": 0.2622, + "step": 5798 + }, + { + "epoch": 2.770802222089481, + "grad_norm": 0.438956565568596, + "learning_rate": 3.19164937985218e-07, + "loss": 0.2659, + "step": 5799 + }, + { + "epoch": 2.771280090795054, + "grad_norm": 0.4635475451765875, + "learning_rate": 3.1784629636937404e-07, + "loss": 0.2584, + "step": 5800 + }, + { + "epoch": 2.7717579595006274, + "grad_norm": 0.4439760668552996, + "learning_rate": 3.1653034041217555e-07, + "loss": 0.2537, + "step": 5801 + }, + { + "epoch": 2.7722358282062003, + "grad_norm": 0.42748355624347, + "learning_rate": 3.1521707047864836e-07, + "loss": 0.266, + "step": 5802 + }, + { + "epoch": 2.7727136969117736, + "grad_norm": 0.45627260236223416, + "learning_rate": 3.139064869330699e-07, + "loss": 0.2684, + "step": 5803 + }, + { + "epoch": 2.7731915656173465, + "grad_norm": 0.45339200404546826, + "learning_rate": 3.125985901389694e-07, + "loss": 0.2669, + "step": 5804 + }, + { + "epoch": 2.77366943432292, + "grad_norm": 0.45868520533855495, + "learning_rate": 3.1129338045914004e-07, + "loss": 0.2461, + "step": 5805 + }, + { + "epoch": 2.774147303028493, + "grad_norm": 0.4882764629386277, + "learning_rate": 3.099908582556199e-07, + "loss": 0.2725, + "step": 5806 + }, + { + "epoch": 2.774625171734066, + "grad_norm": 0.46500740654242584, + "learning_rate": 3.0869102388970673e-07, + "loss": 0.2506, + "step": 5807 + }, + { + "epoch": 2.7751030404396393, + "grad_norm": 0.452079842828752, + "learning_rate": 3.0739387772195205e-07, + "loss": 0.2631, + "step": 5808 + }, + { + "epoch": 2.775580909145212, + "grad_norm": 0.4381477199767286, + "learning_rate": 3.0609942011216144e-07, + "loss": 0.2754, + "step": 5809 + }, + { + "epoch": 2.7760587778507855, + "grad_norm": 0.4408816665348534, + "learning_rate": 3.0480765141939316e-07, + "loss": 0.2743, + "step": 5810 + }, + { + "epoch": 2.776536646556359, + "grad_norm": 0.46086546867548916, + "learning_rate": 3.035185720019629e-07, + "loss": 0.2704, + "step": 5811 + }, + { + "epoch": 2.7770145152619317, + "grad_norm": 0.44332016111656897, + "learning_rate": 3.022321822174379e-07, + "loss": 0.2682, + "step": 5812 + }, + { + "epoch": 2.777492383967505, + "grad_norm": 0.4492380713862776, + "learning_rate": 3.0094848242263943e-07, + "loss": 0.2673, + "step": 5813 + }, + { + "epoch": 2.777970252673078, + "grad_norm": 0.43400035663218267, + "learning_rate": 2.9966747297364375e-07, + "loss": 0.261, + "step": 5814 + }, + { + "epoch": 2.778448121378651, + "grad_norm": 0.44189833048994365, + "learning_rate": 2.9838915422578e-07, + "loss": 0.2563, + "step": 5815 + }, + { + "epoch": 2.7789259900842245, + "grad_norm": 0.4644677539843851, + "learning_rate": 2.9711352653363115e-07, + "loss": 0.2566, + "step": 5816 + }, + { + "epoch": 2.7794038587897973, + "grad_norm": 0.4475255303780797, + "learning_rate": 2.9584059025103415e-07, + "loss": 0.2636, + "step": 5817 + }, + { + "epoch": 2.7798817274953707, + "grad_norm": 0.443274368634649, + "learning_rate": 2.9457034573108e-07, + "loss": 0.2544, + "step": 5818 + }, + { + "epoch": 2.7803595962009435, + "grad_norm": 0.4450610430568845, + "learning_rate": 2.933027933261101e-07, + "loss": 0.2714, + "step": 5819 + }, + { + "epoch": 2.780837464906517, + "grad_norm": 0.42837406809925016, + "learning_rate": 2.920379333877221e-07, + "loss": 0.2525, + "step": 5820 + }, + { + "epoch": 2.78131533361209, + "grad_norm": 0.45240402207297176, + "learning_rate": 2.907757662667665e-07, + "loss": 0.2589, + "step": 5821 + }, + { + "epoch": 2.7817932023176635, + "grad_norm": 0.46379312040070836, + "learning_rate": 2.8951629231334434e-07, + "loss": 0.2472, + "step": 5822 + }, + { + "epoch": 2.7822710710232363, + "grad_norm": 0.4412306348002121, + "learning_rate": 2.8825951187681387e-07, + "loss": 0.276, + "step": 5823 + }, + { + "epoch": 2.7827489397288097, + "grad_norm": 0.4603946932137324, + "learning_rate": 2.87005425305783e-07, + "loss": 0.2722, + "step": 5824 + }, + { + "epoch": 2.7832268084343825, + "grad_norm": 0.45987250970095805, + "learning_rate": 2.8575403294811123e-07, + "loss": 0.2744, + "step": 5825 + }, + { + "epoch": 2.783704677139956, + "grad_norm": 0.43880186402447113, + "learning_rate": 2.845053351509142e-07, + "loss": 0.2384, + "step": 5826 + }, + { + "epoch": 2.784182545845529, + "grad_norm": 0.45035442317121277, + "learning_rate": 2.8325933226056033e-07, + "loss": 0.2759, + "step": 5827 + }, + { + "epoch": 2.784660414551102, + "grad_norm": 0.4669047993561521, + "learning_rate": 2.8201602462266775e-07, + "loss": 0.2575, + "step": 5828 + }, + { + "epoch": 2.7851382832566753, + "grad_norm": 0.5303829700742767, + "learning_rate": 2.8077541258210607e-07, + "loss": 0.2591, + "step": 5829 + }, + { + "epoch": 2.785616151962248, + "grad_norm": 0.6146920950572496, + "learning_rate": 2.795374964830022e-07, + "loss": 0.2634, + "step": 5830 + }, + { + "epoch": 2.7860940206678215, + "grad_norm": 0.4537712641615584, + "learning_rate": 2.7830227666872933e-07, + "loss": 0.2754, + "step": 5831 + }, + { + "epoch": 2.786571889373395, + "grad_norm": 0.4379198918938658, + "learning_rate": 2.770697534819178e-07, + "loss": 0.2787, + "step": 5832 + }, + { + "epoch": 2.7870497580789677, + "grad_norm": 0.5202726893687436, + "learning_rate": 2.758399272644474e-07, + "loss": 0.284, + "step": 5833 + }, + { + "epoch": 2.787527626784541, + "grad_norm": 0.45661668436412567, + "learning_rate": 2.746127983574498e-07, + "loss": 0.2649, + "step": 5834 + }, + { + "epoch": 2.788005495490114, + "grad_norm": 0.4649814028316174, + "learning_rate": 2.733883671013082e-07, + "loss": 0.2505, + "step": 5835 + }, + { + "epoch": 2.788483364195687, + "grad_norm": 0.5661711250377324, + "learning_rate": 2.721666338356599e-07, + "loss": 0.2538, + "step": 5836 + }, + { + "epoch": 2.7889612329012605, + "grad_norm": 0.4777341668360208, + "learning_rate": 2.709475988993915e-07, + "loss": 0.2666, + "step": 5837 + }, + { + "epoch": 2.7894391016068334, + "grad_norm": 0.47234324125709043, + "learning_rate": 2.6973126263064143e-07, + "loss": 0.2677, + "step": 5838 + }, + { + "epoch": 2.7899169703124067, + "grad_norm": 0.4313068768665777, + "learning_rate": 2.685176253667998e-07, + "loss": 0.2648, + "step": 5839 + }, + { + "epoch": 2.7903948390179796, + "grad_norm": 0.44321493365822373, + "learning_rate": 2.673066874445096e-07, + "loss": 0.2479, + "step": 5840 + }, + { + "epoch": 2.790872707723553, + "grad_norm": 0.47299846939177753, + "learning_rate": 2.66098449199661e-07, + "loss": 0.256, + "step": 5841 + }, + { + "epoch": 2.791350576429126, + "grad_norm": 0.44183881798639174, + "learning_rate": 2.648929109674003e-07, + "loss": 0.2559, + "step": 5842 + }, + { + "epoch": 2.791828445134699, + "grad_norm": 0.48856394471690884, + "learning_rate": 2.6369007308212233e-07, + "loss": 0.2528, + "step": 5843 + }, + { + "epoch": 2.7923063138402724, + "grad_norm": 0.42875317220473846, + "learning_rate": 2.6248993587747017e-07, + "loss": 0.2632, + "step": 5844 + }, + { + "epoch": 2.7927841825458453, + "grad_norm": 0.439006550935603, + "learning_rate": 2.612924996863453e-07, + "loss": 0.275, + "step": 5845 + }, + { + "epoch": 2.7932620512514186, + "grad_norm": 0.502986151150374, + "learning_rate": 2.600977648408931e-07, + "loss": 0.2728, + "step": 5846 + }, + { + "epoch": 2.793739919956992, + "grad_norm": 0.502221402752791, + "learning_rate": 2.5890573167251076e-07, + "loss": 0.2759, + "step": 5847 + }, + { + "epoch": 2.794217788662565, + "grad_norm": 0.46120467386193476, + "learning_rate": 2.5771640051184933e-07, + "loss": 0.2901, + "step": 5848 + }, + { + "epoch": 2.794695657368138, + "grad_norm": 0.46525631074461465, + "learning_rate": 2.565297716888082e-07, + "loss": 0.2537, + "step": 5849 + }, + { + "epoch": 2.7951735260737114, + "grad_norm": 0.5367360951991326, + "learning_rate": 2.5534584553253526e-07, + "loss": 0.2737, + "step": 5850 + }, + { + "epoch": 2.7956513947792843, + "grad_norm": 0.477928653419597, + "learning_rate": 2.5416462237143224e-07, + "loss": 0.2723, + "step": 5851 + }, + { + "epoch": 2.7961292634848576, + "grad_norm": 0.43779540429422825, + "learning_rate": 2.5298610253315037e-07, + "loss": 0.2492, + "step": 5852 + }, + { + "epoch": 2.796607132190431, + "grad_norm": 0.4473417651730253, + "learning_rate": 2.5181028634458704e-07, + "loss": 0.2591, + "step": 5853 + }, + { + "epoch": 2.7970850008960038, + "grad_norm": 0.4237788400316185, + "learning_rate": 2.5063717413189695e-07, + "loss": 0.2713, + "step": 5854 + }, + { + "epoch": 2.797562869601577, + "grad_norm": 0.46121222455074623, + "learning_rate": 2.494667662204797e-07, + "loss": 0.2587, + "step": 5855 + }, + { + "epoch": 2.79804073830715, + "grad_norm": 0.44337477946493636, + "learning_rate": 2.482990629349824e-07, + "loss": 0.277, + "step": 5856 + }, + { + "epoch": 2.7985186070127233, + "grad_norm": 0.4594080279967684, + "learning_rate": 2.471340645993103e-07, + "loss": 0.2668, + "step": 5857 + }, + { + "epoch": 2.7989964757182966, + "grad_norm": 0.4400931256728512, + "learning_rate": 2.4597177153661056e-07, + "loss": 0.2598, + "step": 5858 + }, + { + "epoch": 2.7994743444238694, + "grad_norm": 0.4426802092154914, + "learning_rate": 2.4481218406928297e-07, + "loss": 0.2763, + "step": 5859 + }, + { + "epoch": 2.7999522131294428, + "grad_norm": 0.4353913914636723, + "learning_rate": 2.436553025189758e-07, + "loss": 0.2573, + "step": 5860 + }, + { + "epoch": 2.8004300818350156, + "grad_norm": 0.5177432480573205, + "learning_rate": 2.4250112720659024e-07, + "loss": 0.2689, + "step": 5861 + }, + { + "epoch": 2.800907950540589, + "grad_norm": 0.5709604532387811, + "learning_rate": 2.413496584522723e-07, + "loss": 0.2526, + "step": 5862 + }, + { + "epoch": 2.8013858192461623, + "grad_norm": 0.45801417663423666, + "learning_rate": 2.402008965754199e-07, + "loss": 0.2537, + "step": 5863 + }, + { + "epoch": 2.801863687951735, + "grad_norm": 0.4707325970556673, + "learning_rate": 2.3905484189467807e-07, + "loss": 0.2759, + "step": 5864 + }, + { + "epoch": 2.8023415566573084, + "grad_norm": 0.4486900242136094, + "learning_rate": 2.3791149472794373e-07, + "loss": 0.257, + "step": 5865 + }, + { + "epoch": 2.8028194253628813, + "grad_norm": 0.45965714254275614, + "learning_rate": 2.3677085539235977e-07, + "loss": 0.2709, + "step": 5866 + }, + { + "epoch": 2.8032972940684546, + "grad_norm": 0.43566747125808025, + "learning_rate": 2.3563292420432094e-07, + "loss": 0.274, + "step": 5867 + }, + { + "epoch": 2.803775162774028, + "grad_norm": 0.4541848106577836, + "learning_rate": 2.3449770147946804e-07, + "loss": 0.2683, + "step": 5868 + }, + { + "epoch": 2.8042530314796013, + "grad_norm": 0.8078669068971982, + "learning_rate": 2.3336518753269144e-07, + "loss": 0.2577, + "step": 5869 + }, + { + "epoch": 2.804730900185174, + "grad_norm": 0.42680658838886604, + "learning_rate": 2.3223538267813317e-07, + "loss": 0.2434, + "step": 5870 + }, + { + "epoch": 2.805208768890747, + "grad_norm": 0.4358025311911329, + "learning_rate": 2.3110828722917812e-07, + "loss": 0.2625, + "step": 5871 + }, + { + "epoch": 2.8056866375963203, + "grad_norm": 0.45786374708678146, + "learning_rate": 2.2998390149846395e-07, + "loss": 0.268, + "step": 5872 + }, + { + "epoch": 2.8061645063018936, + "grad_norm": 0.44034045702230934, + "learning_rate": 2.2886222579787565e-07, + "loss": 0.2593, + "step": 5873 + }, + { + "epoch": 2.806642375007467, + "grad_norm": 0.629034529435717, + "learning_rate": 2.2774326043854656e-07, + "loss": 0.2797, + "step": 5874 + }, + { + "epoch": 2.80712024371304, + "grad_norm": 0.4663816214402025, + "learning_rate": 2.2662700573085505e-07, + "loss": 0.2589, + "step": 5875 + }, + { + "epoch": 2.807598112418613, + "grad_norm": 0.4520335805323787, + "learning_rate": 2.255134619844357e-07, + "loss": 0.2693, + "step": 5876 + }, + { + "epoch": 2.808075981124186, + "grad_norm": 0.45421012246090176, + "learning_rate": 2.2440262950816138e-07, + "loss": 0.2486, + "step": 5877 + }, + { + "epoch": 2.8085538498297593, + "grad_norm": 0.44872612791769856, + "learning_rate": 2.23294508610159e-07, + "loss": 0.2772, + "step": 5878 + }, + { + "epoch": 2.8090317185353326, + "grad_norm": 0.4471553331629049, + "learning_rate": 2.2218909959780265e-07, + "loss": 0.2622, + "step": 5879 + }, + { + "epoch": 2.8095095872409055, + "grad_norm": 0.4326277574229714, + "learning_rate": 2.2108640277771153e-07, + "loss": 0.241, + "step": 5880 + }, + { + "epoch": 2.809987455946479, + "grad_norm": 0.4447810782306284, + "learning_rate": 2.1998641845575542e-07, + "loss": 0.2596, + "step": 5881 + }, + { + "epoch": 2.8104653246520517, + "grad_norm": 0.4438607779450878, + "learning_rate": 2.1888914693705132e-07, + "loss": 0.2781, + "step": 5882 + }, + { + "epoch": 2.810943193357625, + "grad_norm": 0.4515855882029372, + "learning_rate": 2.1779458852596136e-07, + "loss": 0.26, + "step": 5883 + }, + { + "epoch": 2.8114210620631983, + "grad_norm": 0.46722738200529673, + "learning_rate": 2.167027435260971e-07, + "loss": 0.263, + "step": 5884 + }, + { + "epoch": 2.811898930768771, + "grad_norm": 0.8295117627891755, + "learning_rate": 2.156136122403174e-07, + "loss": 0.2524, + "step": 5885 + }, + { + "epoch": 2.8123767994743445, + "grad_norm": 0.4993459060161254, + "learning_rate": 2.1452719497072839e-07, + "loss": 0.2702, + "step": 5886 + }, + { + "epoch": 2.8128546681799174, + "grad_norm": 0.4414682776865743, + "learning_rate": 2.1344349201868232e-07, + "loss": 0.2771, + "step": 5887 + }, + { + "epoch": 2.8133325368854907, + "grad_norm": 0.44309446914027134, + "learning_rate": 2.1236250368477985e-07, + "loss": 0.2827, + "step": 5888 + }, + { + "epoch": 2.813810405591064, + "grad_norm": 0.4392176207768714, + "learning_rate": 2.1128423026886892e-07, + "loss": 0.2721, + "step": 5889 + }, + { + "epoch": 2.814288274296637, + "grad_norm": 0.4418095009271383, + "learning_rate": 2.1020867207004026e-07, + "loss": 0.2673, + "step": 5890 + }, + { + "epoch": 2.81476614300221, + "grad_norm": 0.4422044960477242, + "learning_rate": 2.0913582938663855e-07, + "loss": 0.2686, + "step": 5891 + }, + { + "epoch": 2.815244011707783, + "grad_norm": 0.4697009809230947, + "learning_rate": 2.0806570251625023e-07, + "loss": 0.2533, + "step": 5892 + }, + { + "epoch": 2.8157218804133564, + "grad_norm": 0.4507224779670188, + "learning_rate": 2.0699829175570785e-07, + "loss": 0.2519, + "step": 5893 + }, + { + "epoch": 2.8161997491189297, + "grad_norm": 0.4528792818717239, + "learning_rate": 2.0593359740109452e-07, + "loss": 0.2701, + "step": 5894 + }, + { + "epoch": 2.816677617824503, + "grad_norm": 0.4460463818994321, + "learning_rate": 2.048716197477374e-07, + "loss": 0.2599, + "step": 5895 + }, + { + "epoch": 2.817155486530076, + "grad_norm": 0.49958157563151245, + "learning_rate": 2.038123590902086e-07, + "loss": 0.2625, + "step": 5896 + }, + { + "epoch": 2.8176333552356487, + "grad_norm": 0.5406609588682755, + "learning_rate": 2.0275581572233083e-07, + "loss": 0.2459, + "step": 5897 + }, + { + "epoch": 2.818111223941222, + "grad_norm": 0.4618527016308968, + "learning_rate": 2.017019899371686e-07, + "loss": 0.2725, + "step": 5898 + }, + { + "epoch": 2.8185890926467954, + "grad_norm": 0.43028097781936997, + "learning_rate": 2.0065088202703587e-07, + "loss": 0.2742, + "step": 5899 + }, + { + "epoch": 2.8190669613523687, + "grad_norm": 0.4615866120409888, + "learning_rate": 1.996024922834905e-07, + "loss": 0.2488, + "step": 5900 + }, + { + "epoch": 2.8195448300579415, + "grad_norm": 0.4243814410711499, + "learning_rate": 1.9855682099733876e-07, + "loss": 0.251, + "step": 5901 + }, + { + "epoch": 2.820022698763515, + "grad_norm": 0.42845665939883043, + "learning_rate": 1.9751386845862864e-07, + "loss": 0.2627, + "step": 5902 + }, + { + "epoch": 2.8205005674690877, + "grad_norm": 0.44017495021305203, + "learning_rate": 1.9647363495665983e-07, + "loss": 0.2529, + "step": 5903 + }, + { + "epoch": 2.820978436174661, + "grad_norm": 0.4587273313715358, + "learning_rate": 1.9543612077997376e-07, + "loss": 0.2563, + "step": 5904 + }, + { + "epoch": 2.8214563048802344, + "grad_norm": 0.47195775641267157, + "learning_rate": 1.9440132621635687e-07, + "loss": 0.2654, + "step": 5905 + }, + { + "epoch": 2.8219341735858072, + "grad_norm": 0.43800418779605343, + "learning_rate": 1.9336925155284514e-07, + "loss": 0.2698, + "step": 5906 + }, + { + "epoch": 2.8224120422913805, + "grad_norm": 0.5249547147069987, + "learning_rate": 1.9233989707571732e-07, + "loss": 0.2738, + "step": 5907 + }, + { + "epoch": 2.8228899109969534, + "grad_norm": 0.4479165992631786, + "learning_rate": 1.9131326307049724e-07, + "loss": 0.2583, + "step": 5908 + }, + { + "epoch": 2.8233677797025267, + "grad_norm": 0.4367008126895965, + "learning_rate": 1.9028934982195602e-07, + "loss": 0.2483, + "step": 5909 + }, + { + "epoch": 2.8238456484081, + "grad_norm": 0.4453324046755028, + "learning_rate": 1.8926815761410867e-07, + "loss": 0.2595, + "step": 5910 + }, + { + "epoch": 2.824323517113673, + "grad_norm": 0.4515610654582882, + "learning_rate": 1.8824968673021525e-07, + "loss": 0.2633, + "step": 5911 + }, + { + "epoch": 2.8248013858192462, + "grad_norm": 0.44115993805926473, + "learning_rate": 1.872339374527843e-07, + "loss": 0.2536, + "step": 5912 + }, + { + "epoch": 2.825279254524819, + "grad_norm": 0.4228775353271241, + "learning_rate": 1.8622091006356368e-07, + "loss": 0.2481, + "step": 5913 + }, + { + "epoch": 2.8257571232303924, + "grad_norm": 0.44436167133939714, + "learning_rate": 1.852106048435498e-07, + "loss": 0.2657, + "step": 5914 + }, + { + "epoch": 2.8262349919359657, + "grad_norm": 0.4406708131002175, + "learning_rate": 1.8420302207298623e-07, + "loss": 0.2604, + "step": 5915 + }, + { + "epoch": 2.8267128606415386, + "grad_norm": 0.4654540147793532, + "learning_rate": 1.831981620313561e-07, + "loss": 0.2753, + "step": 5916 + }, + { + "epoch": 2.827190729347112, + "grad_norm": 0.4371867269522205, + "learning_rate": 1.8219602499738863e-07, + "loss": 0.2584, + "step": 5917 + }, + { + "epoch": 2.827668598052685, + "grad_norm": 0.4341168805289483, + "learning_rate": 1.8119661124906262e-07, + "loss": 0.2697, + "step": 5918 + }, + { + "epoch": 2.828146466758258, + "grad_norm": 0.5332475340009322, + "learning_rate": 1.801999210635952e-07, + "loss": 0.2544, + "step": 5919 + }, + { + "epoch": 2.8286243354638314, + "grad_norm": 0.4415760206731807, + "learning_rate": 1.792059547174507e-07, + "loss": 0.2608, + "step": 5920 + }, + { + "epoch": 2.8291022041694047, + "grad_norm": 0.4539316651950751, + "learning_rate": 1.7821471248633982e-07, + "loss": 0.2545, + "step": 5921 + }, + { + "epoch": 2.8295800728749776, + "grad_norm": 0.4269206432587368, + "learning_rate": 1.7722619464521363e-07, + "loss": 0.254, + "step": 5922 + }, + { + "epoch": 2.830057941580551, + "grad_norm": 0.43673477313105225, + "learning_rate": 1.762404014682706e-07, + "loss": 0.2667, + "step": 5923 + }, + { + "epoch": 2.8305358102861238, + "grad_norm": 0.4414874551970224, + "learning_rate": 1.75257333228952e-07, + "loss": 0.2696, + "step": 5924 + }, + { + "epoch": 2.831013678991697, + "grad_norm": 0.4497054490741176, + "learning_rate": 1.7427699019994415e-07, + "loss": 0.2676, + "step": 5925 + }, + { + "epoch": 2.8314915476972704, + "grad_norm": 0.4908756734479855, + "learning_rate": 1.7329937265317508e-07, + "loss": 0.2771, + "step": 5926 + }, + { + "epoch": 2.8319694164028433, + "grad_norm": 0.4454065328919432, + "learning_rate": 1.7232448085982012e-07, + "loss": 0.2597, + "step": 5927 + }, + { + "epoch": 2.8324472851084166, + "grad_norm": 0.42499929417919846, + "learning_rate": 1.713523150902985e-07, + "loss": 0.2689, + "step": 5928 + }, + { + "epoch": 2.8329251538139895, + "grad_norm": 0.48284867449405694, + "learning_rate": 1.7038287561426892e-07, + "loss": 0.2686, + "step": 5929 + }, + { + "epoch": 2.8334030225195628, + "grad_norm": 0.44411974690243017, + "learning_rate": 1.6941616270063854e-07, + "loss": 0.2877, + "step": 5930 + }, + { + "epoch": 2.833880891225136, + "grad_norm": 0.442865545746602, + "learning_rate": 1.684521766175562e-07, + "loss": 0.2575, + "step": 5931 + }, + { + "epoch": 2.834358759930709, + "grad_norm": 0.5103520942204934, + "learning_rate": 1.6749091763241464e-07, + "loss": 0.2606, + "step": 5932 + }, + { + "epoch": 2.8348366286362823, + "grad_norm": 0.447650390063257, + "learning_rate": 1.665323860118495e-07, + "loss": 0.2705, + "step": 5933 + }, + { + "epoch": 2.835314497341855, + "grad_norm": 0.44565808230621423, + "learning_rate": 1.6557658202174254e-07, + "loss": 0.2703, + "step": 5934 + }, + { + "epoch": 2.8357923660474285, + "grad_norm": 0.43861667920681485, + "learning_rate": 1.6462350592721498e-07, + "loss": 0.2447, + "step": 5935 + }, + { + "epoch": 2.8362702347530018, + "grad_norm": 0.4633523191534991, + "learning_rate": 1.6367315799263206e-07, + "loss": 0.2713, + "step": 5936 + }, + { + "epoch": 2.8367481034585746, + "grad_norm": 0.4531309552740638, + "learning_rate": 1.6272553848160733e-07, + "loss": 0.2649, + "step": 5937 + }, + { + "epoch": 2.837225972164148, + "grad_norm": 0.4621397423597151, + "learning_rate": 1.6178064765699052e-07, + "loss": 0.2752, + "step": 5938 + }, + { + "epoch": 2.837703840869721, + "grad_norm": 0.4415497248019033, + "learning_rate": 1.6083848578087868e-07, + "loss": 0.2527, + "step": 5939 + }, + { + "epoch": 2.838181709575294, + "grad_norm": 0.44367203604390565, + "learning_rate": 1.5989905311461274e-07, + "loss": 0.2677, + "step": 5940 + }, + { + "epoch": 2.8386595782808675, + "grad_norm": 0.5809887272446366, + "learning_rate": 1.5896234991877202e-07, + "loss": 0.2713, + "step": 5941 + }, + { + "epoch": 2.8391374469864403, + "grad_norm": 0.4303037195655415, + "learning_rate": 1.5802837645318203e-07, + "loss": 0.2455, + "step": 5942 + }, + { + "epoch": 2.8396153156920136, + "grad_norm": 0.47424596948568043, + "learning_rate": 1.570971329769111e-07, + "loss": 0.2524, + "step": 5943 + }, + { + "epoch": 2.8400931843975865, + "grad_norm": 0.4178692383364968, + "learning_rate": 1.5616861974827036e-07, + "loss": 0.2526, + "step": 5944 + }, + { + "epoch": 2.84057105310316, + "grad_norm": 0.46166989830773797, + "learning_rate": 1.5524283702481158e-07, + "loss": 0.271, + "step": 5945 + }, + { + "epoch": 2.841048921808733, + "grad_norm": 0.4440783366258334, + "learning_rate": 1.5431978506333155e-07, + "loss": 0.2633, + "step": 5946 + }, + { + "epoch": 2.8415267905143065, + "grad_norm": 0.441365885091842, + "learning_rate": 1.5339946411986885e-07, + "loss": 0.2528, + "step": 5947 + }, + { + "epoch": 2.8420046592198793, + "grad_norm": 0.4737324334331007, + "learning_rate": 1.5248187444970252e-07, + "loss": 0.2599, + "step": 5948 + }, + { + "epoch": 2.8424825279254526, + "grad_norm": 0.48811117143680444, + "learning_rate": 1.5156701630735792e-07, + "loss": 0.2453, + "step": 5949 + }, + { + "epoch": 2.8429603966310255, + "grad_norm": 0.43188605131922725, + "learning_rate": 1.5065488994659983e-07, + "loss": 0.2669, + "step": 5950 + }, + { + "epoch": 2.843438265336599, + "grad_norm": 0.434588565968877, + "learning_rate": 1.497454956204347e-07, + "loss": 0.2621, + "step": 5951 + }, + { + "epoch": 2.843916134042172, + "grad_norm": 0.45504777884303677, + "learning_rate": 1.4883883358111418e-07, + "loss": 0.2599, + "step": 5952 + }, + { + "epoch": 2.844394002747745, + "grad_norm": 0.43178757210896046, + "learning_rate": 1.4793490408013033e-07, + "loss": 0.2617, + "step": 5953 + }, + { + "epoch": 2.8448718714533183, + "grad_norm": 0.45654767138484836, + "learning_rate": 1.4703370736821487e-07, + "loss": 0.268, + "step": 5954 + }, + { + "epoch": 2.845349740158891, + "grad_norm": 0.48119573262488585, + "learning_rate": 1.461352436953478e-07, + "loss": 0.2622, + "step": 5955 + }, + { + "epoch": 2.8458276088644645, + "grad_norm": 0.4351617846746113, + "learning_rate": 1.4523951331074426e-07, + "loss": 0.2601, + "step": 5956 + }, + { + "epoch": 2.846305477570038, + "grad_norm": 0.45633355644087814, + "learning_rate": 1.4434651646286325e-07, + "loss": 0.2653, + "step": 5957 + }, + { + "epoch": 2.8467833462756107, + "grad_norm": 0.4736965437562393, + "learning_rate": 1.4345625339940994e-07, + "loss": 0.2781, + "step": 5958 + }, + { + "epoch": 2.847261214981184, + "grad_norm": 0.46355248896208456, + "learning_rate": 1.4256872436732461e-07, + "loss": 0.2591, + "step": 5959 + }, + { + "epoch": 2.847739083686757, + "grad_norm": 0.4283329171153488, + "learning_rate": 1.4168392961279254e-07, + "loss": 0.256, + "step": 5960 + }, + { + "epoch": 2.84821695239233, + "grad_norm": 0.46933751036840643, + "learning_rate": 1.4080186938124074e-07, + "loss": 0.264, + "step": 5961 + }, + { + "epoch": 2.8486948210979035, + "grad_norm": 0.4421456212297339, + "learning_rate": 1.3992254391733794e-07, + "loss": 0.2435, + "step": 5962 + }, + { + "epoch": 2.8491726898034764, + "grad_norm": 0.43361352767650047, + "learning_rate": 1.390459534649924e-07, + "loss": 0.2628, + "step": 5963 + }, + { + "epoch": 2.8496505585090497, + "grad_norm": 0.43095231476269297, + "learning_rate": 1.38172098267354e-07, + "loss": 0.2607, + "step": 5964 + }, + { + "epoch": 2.8501284272146226, + "grad_norm": 0.4703203261155736, + "learning_rate": 1.3730097856681668e-07, + "loss": 0.2504, + "step": 5965 + }, + { + "epoch": 2.850606295920196, + "grad_norm": 0.4376878982675997, + "learning_rate": 1.364325946050138e-07, + "loss": 0.2799, + "step": 5966 + }, + { + "epoch": 2.851084164625769, + "grad_norm": 0.4391097408323728, + "learning_rate": 1.355669466228171e-07, + "loss": 0.2491, + "step": 5967 + }, + { + "epoch": 2.851562033331342, + "grad_norm": 0.5148238033140623, + "learning_rate": 1.3470403486034566e-07, + "loss": 0.268, + "step": 5968 + }, + { + "epoch": 2.8520399020369154, + "grad_norm": 0.44170548346421123, + "learning_rate": 1.3384385955695355e-07, + "loss": 0.2603, + "step": 5969 + }, + { + "epoch": 2.8525177707424882, + "grad_norm": 0.46229977499534447, + "learning_rate": 1.329864209512377e-07, + "loss": 0.2643, + "step": 5970 + }, + { + "epoch": 2.8529956394480616, + "grad_norm": 0.4703215322101558, + "learning_rate": 1.3213171928103785e-07, + "loss": 0.2613, + "step": 5971 + }, + { + "epoch": 2.853473508153635, + "grad_norm": 0.4977139206239717, + "learning_rate": 1.3127975478343435e-07, + "loss": 0.259, + "step": 5972 + }, + { + "epoch": 2.853951376859208, + "grad_norm": 0.44329852164883693, + "learning_rate": 1.3043052769474375e-07, + "loss": 0.25, + "step": 5973 + }, + { + "epoch": 2.854429245564781, + "grad_norm": 0.44218052941146163, + "learning_rate": 1.2958403825052978e-07, + "loss": 0.254, + "step": 5974 + }, + { + "epoch": 2.8549071142703544, + "grad_norm": 0.4466772904005583, + "learning_rate": 1.2874028668559247e-07, + "loss": 0.2651, + "step": 5975 + }, + { + "epoch": 2.8553849829759272, + "grad_norm": 0.43862131247970004, + "learning_rate": 1.2789927323397232e-07, + "loss": 0.2602, + "step": 5976 + }, + { + "epoch": 2.8558628516815006, + "grad_norm": 0.45565339829882245, + "learning_rate": 1.270609981289539e-07, + "loss": 0.2699, + "step": 5977 + }, + { + "epoch": 2.856340720387074, + "grad_norm": 0.45795122735762345, + "learning_rate": 1.2622546160305894e-07, + "loss": 0.2498, + "step": 5978 + }, + { + "epoch": 2.8568185890926467, + "grad_norm": 0.43648336997823883, + "learning_rate": 1.2539266388804981e-07, + "loss": 0.2627, + "step": 5979 + }, + { + "epoch": 2.85729645779822, + "grad_norm": 0.4637171759508475, + "learning_rate": 1.245626052149318e-07, + "loss": 0.2535, + "step": 5980 + }, + { + "epoch": 2.857774326503793, + "grad_norm": 0.45562841177816166, + "learning_rate": 1.2373528581394733e-07, + "loss": 0.2462, + "step": 5981 + }, + { + "epoch": 2.8582521952093662, + "grad_norm": 0.4450586940009108, + "learning_rate": 1.2291070591457842e-07, + "loss": 0.2788, + "step": 5982 + }, + { + "epoch": 2.8587300639149396, + "grad_norm": 0.4386721155226896, + "learning_rate": 1.2208886574555323e-07, + "loss": 0.2725, + "step": 5983 + }, + { + "epoch": 2.8592079326205124, + "grad_norm": 0.42956994998421677, + "learning_rate": 1.212697655348316e-07, + "loss": 0.266, + "step": 5984 + }, + { + "epoch": 2.8596858013260857, + "grad_norm": 0.44622397158469107, + "learning_rate": 1.2045340550961958e-07, + "loss": 0.2606, + "step": 5985 + }, + { + "epoch": 2.8601636700316586, + "grad_norm": 0.8670959918097046, + "learning_rate": 1.196397858963616e-07, + "loss": 0.275, + "step": 5986 + }, + { + "epoch": 2.860641538737232, + "grad_norm": 0.42188794447231875, + "learning_rate": 1.1882890692073933e-07, + "loss": 0.2709, + "step": 5987 + }, + { + "epoch": 2.8611194074428052, + "grad_norm": 0.45699426887028993, + "learning_rate": 1.1802076880767732e-07, + "loss": 0.261, + "step": 5988 + }, + { + "epoch": 2.861597276148378, + "grad_norm": 0.6013209724994314, + "learning_rate": 1.1721537178133958e-07, + "loss": 0.2665, + "step": 5989 + }, + { + "epoch": 2.8620751448539514, + "grad_norm": 0.5033962646495204, + "learning_rate": 1.164127160651285e-07, + "loss": 0.2886, + "step": 5990 + }, + { + "epoch": 2.8625530135595243, + "grad_norm": 0.4387868638958641, + "learning_rate": 1.15612801881686e-07, + "loss": 0.2631, + "step": 5991 + }, + { + "epoch": 2.8630308822650976, + "grad_norm": 0.4470342389927088, + "learning_rate": 1.148156294528946e-07, + "loss": 0.2748, + "step": 5992 + }, + { + "epoch": 2.863508750970671, + "grad_norm": 0.44782132948777004, + "learning_rate": 1.1402119899987629e-07, + "loss": 0.2482, + "step": 5993 + }, + { + "epoch": 2.863986619676244, + "grad_norm": 0.4309117184393657, + "learning_rate": 1.1322951074299149e-07, + "loss": 0.2616, + "step": 5994 + }, + { + "epoch": 2.864464488381817, + "grad_norm": 0.45641574770069654, + "learning_rate": 1.1244056490184008e-07, + "loss": 0.2741, + "step": 5995 + }, + { + "epoch": 2.86494235708739, + "grad_norm": 0.4795660239874649, + "learning_rate": 1.1165436169526366e-07, + "loss": 0.2679, + "step": 5996 + }, + { + "epoch": 2.8654202257929633, + "grad_norm": 0.5323760308177075, + "learning_rate": 1.1087090134134005e-07, + "loss": 0.2589, + "step": 5997 + }, + { + "epoch": 2.8658980944985366, + "grad_norm": 0.504237551040962, + "learning_rate": 1.1009018405738536e-07, + "loss": 0.2625, + "step": 5998 + }, + { + "epoch": 2.86637596320411, + "grad_norm": 0.4402136999052143, + "learning_rate": 1.0931221005996084e-07, + "loss": 0.2755, + "step": 5999 + }, + { + "epoch": 2.866853831909683, + "grad_norm": 0.4491859848447432, + "learning_rate": 1.0853697956485942e-07, + "loss": 0.245, + "step": 6000 + }, + { + "epoch": 2.867331700615256, + "grad_norm": 0.45231341758240234, + "learning_rate": 1.0776449278711686e-07, + "loss": 0.2491, + "step": 6001 + }, + { + "epoch": 2.867809569320829, + "grad_norm": 0.4369763191287368, + "learning_rate": 1.0699474994100845e-07, + "loss": 0.2558, + "step": 6002 + }, + { + "epoch": 2.8682874380264023, + "grad_norm": 0.5358207159433925, + "learning_rate": 1.0622775124004669e-07, + "loss": 0.2668, + "step": 6003 + }, + { + "epoch": 2.8687653067319756, + "grad_norm": 0.471953453207217, + "learning_rate": 1.0546349689698365e-07, + "loss": 0.2685, + "step": 6004 + }, + { + "epoch": 2.8692431754375485, + "grad_norm": 0.44877487345300576, + "learning_rate": 1.0470198712381086e-07, + "loss": 0.2629, + "step": 6005 + }, + { + "epoch": 2.869721044143122, + "grad_norm": 0.43639707706363223, + "learning_rate": 1.03943222131756e-07, + "loss": 0.2643, + "step": 6006 + }, + { + "epoch": 2.8701989128486947, + "grad_norm": 0.43048550859297546, + "learning_rate": 1.0318720213128741e-07, + "loss": 0.2682, + "step": 6007 + }, + { + "epoch": 2.870676781554268, + "grad_norm": 0.4366175477899045, + "learning_rate": 1.0243392733211289e-07, + "loss": 0.2576, + "step": 6008 + }, + { + "epoch": 2.8711546502598413, + "grad_norm": 0.4420799299343235, + "learning_rate": 1.0168339794317638e-07, + "loss": 0.2664, + "step": 6009 + }, + { + "epoch": 2.871632518965414, + "grad_norm": 0.43731535232122004, + "learning_rate": 1.009356141726614e-07, + "loss": 0.2485, + "step": 6010 + }, + { + "epoch": 2.8721103876709875, + "grad_norm": 0.45731961228738927, + "learning_rate": 1.00190576227992e-07, + "loss": 0.2713, + "step": 6011 + }, + { + "epoch": 2.8725882563765603, + "grad_norm": 0.4357113263887429, + "learning_rate": 9.94482843158262e-08, + "loss": 0.2542, + "step": 6012 + }, + { + "epoch": 2.8730661250821337, + "grad_norm": 0.4222045104321912, + "learning_rate": 9.870873864206376e-08, + "loss": 0.2551, + "step": 6013 + }, + { + "epoch": 2.873543993787707, + "grad_norm": 0.4337713135885946, + "learning_rate": 9.797193941184169e-08, + "loss": 0.2286, + "step": 6014 + }, + { + "epoch": 2.87402186249328, + "grad_norm": 0.521268220454681, + "learning_rate": 9.723788682953539e-08, + "loss": 0.274, + "step": 6015 + }, + { + "epoch": 2.874499731198853, + "grad_norm": 0.4312558999875928, + "learning_rate": 9.650658109875533e-08, + "loss": 0.2431, + "step": 6016 + }, + { + "epoch": 2.874977599904426, + "grad_norm": 0.43570324284996176, + "learning_rate": 9.5778022422357e-08, + "loss": 0.2506, + "step": 6017 + }, + { + "epoch": 2.8754554686099993, + "grad_norm": 0.43684524986259893, + "learning_rate": 9.505221100242767e-08, + "loss": 0.2554, + "step": 6018 + }, + { + "epoch": 2.8759333373155727, + "grad_norm": 0.45826436214172095, + "learning_rate": 9.432914704029406e-08, + "loss": 0.2606, + "step": 6019 + }, + { + "epoch": 2.8764112060211455, + "grad_norm": 0.4296388690654118, + "learning_rate": 9.360883073652238e-08, + "loss": 0.2677, + "step": 6020 + }, + { + "epoch": 2.876889074726719, + "grad_norm": 1.0983884559688328, + "learning_rate": 9.289126229091505e-08, + "loss": 0.2423, + "step": 6021 + }, + { + "epoch": 2.8773669434322917, + "grad_norm": 0.4443043294613616, + "learning_rate": 9.217644190251285e-08, + "loss": 0.2727, + "step": 6022 + }, + { + "epoch": 2.877844812137865, + "grad_norm": 0.45206181728205963, + "learning_rate": 9.146436976959605e-08, + "loss": 0.2572, + "step": 6023 + }, + { + "epoch": 2.8783226808434383, + "grad_norm": 0.45123890210077205, + "learning_rate": 9.075504608967889e-08, + "loss": 0.266, + "step": 6024 + }, + { + "epoch": 2.8788005495490117, + "grad_norm": 0.4575185175136139, + "learning_rate": 9.004847105951509e-08, + "loss": 0.2639, + "step": 6025 + }, + { + "epoch": 2.8792784182545845, + "grad_norm": 0.48693714644668074, + "learning_rate": 8.934464487509786e-08, + "loss": 0.2693, + "step": 6026 + }, + { + "epoch": 2.879756286960158, + "grad_norm": 0.4425901100192589, + "learning_rate": 8.86435677316544e-08, + "loss": 0.2525, + "step": 6027 + }, + { + "epoch": 2.8802341556657307, + "grad_norm": 0.44877326238514714, + "learning_rate": 8.794523982365134e-08, + "loss": 0.2641, + "step": 6028 + }, + { + "epoch": 2.880712024371304, + "grad_norm": 0.4285119487781865, + "learning_rate": 8.724966134479374e-08, + "loss": 0.2534, + "step": 6029 + }, + { + "epoch": 2.8811898930768773, + "grad_norm": 0.46436342305670525, + "learning_rate": 8.655683248802282e-08, + "loss": 0.2821, + "step": 6030 + }, + { + "epoch": 2.88166776178245, + "grad_norm": 0.44312616547475814, + "learning_rate": 8.586675344551599e-08, + "loss": 0.2731, + "step": 6031 + }, + { + "epoch": 2.8821456304880235, + "grad_norm": 0.43442743265027634, + "learning_rate": 8.517942440868898e-08, + "loss": 0.2506, + "step": 6032 + }, + { + "epoch": 2.8826234991935964, + "grad_norm": 0.4308115226635295, + "learning_rate": 8.449484556819598e-08, + "loss": 0.2701, + "step": 6033 + }, + { + "epoch": 2.8831013678991697, + "grad_norm": 0.4460495905460458, + "learning_rate": 8.38130171139262e-08, + "loss": 0.2551, + "step": 6034 + }, + { + "epoch": 2.883579236604743, + "grad_norm": 0.5389940715079629, + "learning_rate": 8.313393923500613e-08, + "loss": 0.2657, + "step": 6035 + }, + { + "epoch": 2.884057105310316, + "grad_norm": 0.44293129901679096, + "learning_rate": 8.245761211980174e-08, + "loss": 0.242, + "step": 6036 + }, + { + "epoch": 2.884534974015889, + "grad_norm": 0.47983678443629796, + "learning_rate": 8.1784035955913e-08, + "loss": 0.2629, + "step": 6037 + }, + { + "epoch": 2.885012842721462, + "grad_norm": 0.4318505645360447, + "learning_rate": 8.11132109301782e-08, + "loss": 0.2722, + "step": 6038 + }, + { + "epoch": 2.8854907114270354, + "grad_norm": 0.43438969618864703, + "learning_rate": 8.044513722867298e-08, + "loss": 0.2696, + "step": 6039 + }, + { + "epoch": 2.8859685801326087, + "grad_norm": 0.4569375021145621, + "learning_rate": 7.977981503670795e-08, + "loss": 0.2851, + "step": 6040 + }, + { + "epoch": 2.8864464488381816, + "grad_norm": 0.4479908637746793, + "learning_rate": 7.911724453883329e-08, + "loss": 0.2706, + "step": 6041 + }, + { + "epoch": 2.886924317543755, + "grad_norm": 0.4326153900898914, + "learning_rate": 7.845742591883309e-08, + "loss": 0.2551, + "step": 6042 + }, + { + "epoch": 2.8874021862493278, + "grad_norm": 0.4357317158977019, + "learning_rate": 7.780035935972985e-08, + "loss": 0.2599, + "step": 6043 + }, + { + "epoch": 2.887880054954901, + "grad_norm": 0.47807706881714124, + "learning_rate": 7.714604504378332e-08, + "loss": 0.2515, + "step": 6044 + }, + { + "epoch": 2.8883579236604744, + "grad_norm": 0.48906636149481963, + "learning_rate": 7.64944831524872e-08, + "loss": 0.2628, + "step": 6045 + }, + { + "epoch": 2.8888357923660477, + "grad_norm": 0.48643435510610833, + "learning_rate": 7.584567386657248e-08, + "loss": 0.2653, + "step": 6046 + }, + { + "epoch": 2.8893136610716206, + "grad_norm": 0.4656245737115965, + "learning_rate": 7.519961736601078e-08, + "loss": 0.272, + "step": 6047 + }, + { + "epoch": 2.8897915297771934, + "grad_norm": 0.44143454689073186, + "learning_rate": 7.455631383000428e-08, + "loss": 0.2603, + "step": 6048 + }, + { + "epoch": 2.8902693984827668, + "grad_norm": 0.622134137502078, + "learning_rate": 7.391576343699359e-08, + "loss": 0.2628, + "step": 6049 + }, + { + "epoch": 2.89074726718834, + "grad_norm": 0.4276530571655008, + "learning_rate": 7.327796636465767e-08, + "loss": 0.2689, + "step": 6050 + }, + { + "epoch": 2.8912251358939134, + "grad_norm": 0.4382139339105153, + "learning_rate": 7.264292278990947e-08, + "loss": 0.251, + "step": 6051 + }, + { + "epoch": 2.8917030045994863, + "grad_norm": 0.47876504229735695, + "learning_rate": 7.201063288889809e-08, + "loss": 0.2814, + "step": 6052 + }, + { + "epoch": 2.8921808733050596, + "grad_norm": 0.4448938723916756, + "learning_rate": 7.138109683701211e-08, + "loss": 0.2654, + "step": 6053 + }, + { + "epoch": 2.8926587420106324, + "grad_norm": 0.4453495110101877, + "learning_rate": 7.075431480887074e-08, + "loss": 0.2682, + "step": 6054 + }, + { + "epoch": 2.8931366107162058, + "grad_norm": 0.4498675594265008, + "learning_rate": 7.01302869783338e-08, + "loss": 0.2618, + "step": 6055 + }, + { + "epoch": 2.893614479421779, + "grad_norm": 0.44182308271744547, + "learning_rate": 6.950901351849504e-08, + "loss": 0.2738, + "step": 6056 + }, + { + "epoch": 2.894092348127352, + "grad_norm": 0.4297806934525359, + "learning_rate": 6.889049460168662e-08, + "loss": 0.265, + "step": 6057 + }, + { + "epoch": 2.8945702168329253, + "grad_norm": 0.4270875134609304, + "learning_rate": 6.827473039947131e-08, + "loss": 0.2455, + "step": 6058 + }, + { + "epoch": 2.895048085538498, + "grad_norm": 0.4386255563204099, + "learning_rate": 6.766172108265356e-08, + "loss": 0.272, + "step": 6059 + }, + { + "epoch": 2.8955259542440714, + "grad_norm": 0.44212270253634706, + "learning_rate": 6.705146682127184e-08, + "loss": 0.2708, + "step": 6060 + }, + { + "epoch": 2.8960038229496448, + "grad_norm": 0.4827942892265252, + "learning_rate": 6.64439677845985e-08, + "loss": 0.2766, + "step": 6061 + }, + { + "epoch": 2.8964816916552176, + "grad_norm": 0.4419630496584052, + "learning_rate": 6.583922414114318e-08, + "loss": 0.2609, + "step": 6062 + }, + { + "epoch": 2.896959560360791, + "grad_norm": 0.4750351230744057, + "learning_rate": 6.523723605865174e-08, + "loss": 0.2763, + "step": 6063 + }, + { + "epoch": 2.897437429066364, + "grad_norm": 0.47028028478371914, + "learning_rate": 6.4638003704105e-08, + "loss": 0.2661, + "step": 6064 + }, + { + "epoch": 2.897915297771937, + "grad_norm": 0.4625973271853386, + "learning_rate": 6.404152724371892e-08, + "loss": 0.2702, + "step": 6065 + }, + { + "epoch": 2.8983931664775104, + "grad_norm": 0.7072555980090101, + "learning_rate": 6.344780684294671e-08, + "loss": 0.2669, + "step": 6066 + }, + { + "epoch": 2.8988710351830833, + "grad_norm": 0.4482616347126552, + "learning_rate": 6.28568426664744e-08, + "loss": 0.2533, + "step": 6067 + }, + { + "epoch": 2.8993489038886566, + "grad_norm": 0.4303610137833005, + "learning_rate": 6.226863487822532e-08, + "loss": 0.2702, + "step": 6068 + }, + { + "epoch": 2.8998267725942295, + "grad_norm": 0.4385548652125175, + "learning_rate": 6.168318364135895e-08, + "loss": 0.2586, + "step": 6069 + }, + { + "epoch": 2.900304641299803, + "grad_norm": 0.46216493168944667, + "learning_rate": 6.110048911826871e-08, + "loss": 0.2605, + "step": 6070 + }, + { + "epoch": 2.900782510005376, + "grad_norm": 0.42831561535218843, + "learning_rate": 6.05205514705831e-08, + "loss": 0.2689, + "step": 6071 + }, + { + "epoch": 2.9012603787109494, + "grad_norm": 0.42968742796774145, + "learning_rate": 5.99433708591679e-08, + "loss": 0.2601, + "step": 6072 + }, + { + "epoch": 2.9017382474165223, + "grad_norm": 1.0032255802524332, + "learning_rate": 5.9368947444121695e-08, + "loss": 0.2851, + "step": 6073 + }, + { + "epoch": 2.902216116122095, + "grad_norm": 0.44746843283921595, + "learning_rate": 5.879728138477925e-08, + "loss": 0.2645, + "step": 6074 + }, + { + "epoch": 2.9026939848276685, + "grad_norm": 0.45839779776483, + "learning_rate": 5.82283728397115e-08, + "loss": 0.2928, + "step": 6075 + }, + { + "epoch": 2.903171853533242, + "grad_norm": 0.4426768127258644, + "learning_rate": 5.766222196672333e-08, + "loss": 0.2529, + "step": 6076 + }, + { + "epoch": 2.903649722238815, + "grad_norm": 0.4306421815201952, + "learning_rate": 5.709882892285468e-08, + "loss": 0.2596, + "step": 6077 + }, + { + "epoch": 2.904127590944388, + "grad_norm": 0.4339299174215774, + "learning_rate": 5.653819386438164e-08, + "loss": 0.2543, + "step": 6078 + }, + { + "epoch": 2.9046054596499613, + "grad_norm": 0.46448596548019094, + "learning_rate": 5.598031694681316e-08, + "loss": 0.2698, + "step": 6079 + }, + { + "epoch": 2.905083328355534, + "grad_norm": 0.4319137355146327, + "learning_rate": 5.542519832489546e-08, + "loss": 0.2696, + "step": 6080 + }, + { + "epoch": 2.9055611970611075, + "grad_norm": 0.5684354169120892, + "learning_rate": 5.487283815260869e-08, + "loss": 0.2694, + "step": 6081 + }, + { + "epoch": 2.906039065766681, + "grad_norm": 0.4360547156677553, + "learning_rate": 5.432323658316807e-08, + "loss": 0.2629, + "step": 6082 + }, + { + "epoch": 2.9065169344722537, + "grad_norm": 0.4342812482531317, + "learning_rate": 5.3776393769021664e-08, + "loss": 0.271, + "step": 6083 + }, + { + "epoch": 2.906994803177827, + "grad_norm": 0.46003291233940047, + "learning_rate": 5.3232309861857015e-08, + "loss": 0.2489, + "step": 6084 + }, + { + "epoch": 2.9074726718834, + "grad_norm": 0.4325963268001337, + "learning_rate": 5.269098501259007e-08, + "loss": 0.2794, + "step": 6085 + }, + { + "epoch": 2.907950540588973, + "grad_norm": 0.4637871151293466, + "learning_rate": 5.2152419371376276e-08, + "loss": 0.2726, + "step": 6086 + }, + { + "epoch": 2.9084284092945465, + "grad_norm": 0.45775569434566626, + "learning_rate": 5.161661308760613e-08, + "loss": 0.2769, + "step": 6087 + }, + { + "epoch": 2.9089062780001194, + "grad_norm": 0.5008124956659363, + "learning_rate": 5.108356630989963e-08, + "loss": 0.2598, + "step": 6088 + }, + { + "epoch": 2.9093841467056927, + "grad_norm": 0.4407017585102997, + "learning_rate": 5.05532791861163e-08, + "loss": 0.2704, + "step": 6089 + }, + { + "epoch": 2.9098620154112655, + "grad_norm": 0.461333224542017, + "learning_rate": 5.002575186334735e-08, + "loss": 0.2817, + "step": 6090 + }, + { + "epoch": 2.910339884116839, + "grad_norm": 0.44289468727956904, + "learning_rate": 4.950098448792129e-08, + "loss": 0.2656, + "step": 6091 + }, + { + "epoch": 2.910817752822412, + "grad_norm": 0.47285680491757126, + "learning_rate": 4.897897720539835e-08, + "loss": 0.2561, + "step": 6092 + }, + { + "epoch": 2.911295621527985, + "grad_norm": 0.4770909008792775, + "learning_rate": 4.8459730160573814e-08, + "loss": 0.2543, + "step": 6093 + }, + { + "epoch": 2.9117734902335584, + "grad_norm": 0.4711159546492226, + "learning_rate": 4.794324349747803e-08, + "loss": 0.263, + "step": 6094 + }, + { + "epoch": 2.9122513589391312, + "grad_norm": 0.4302650665937928, + "learning_rate": 4.742951735937418e-08, + "loss": 0.263, + "step": 6095 + }, + { + "epoch": 2.9127292276447045, + "grad_norm": 0.44234241950445463, + "learning_rate": 4.691855188876271e-08, + "loss": 0.2591, + "step": 6096 + }, + { + "epoch": 2.913207096350278, + "grad_norm": 0.43773577729487084, + "learning_rate": 4.641034722737581e-08, + "loss": 0.2743, + "step": 6097 + }, + { + "epoch": 2.913684965055851, + "grad_norm": 0.4482633004011929, + "learning_rate": 4.590490351618071e-08, + "loss": 0.2542, + "step": 6098 + }, + { + "epoch": 2.914162833761424, + "grad_norm": 0.45388506640042264, + "learning_rate": 4.5402220895377494e-08, + "loss": 0.2608, + "step": 6099 + }, + { + "epoch": 2.9146407024669974, + "grad_norm": 0.4677634390084611, + "learning_rate": 4.490229950440239e-08, + "loss": 0.267, + "step": 6100 + }, + { + "epoch": 2.9151185711725702, + "grad_norm": 0.43322907347723366, + "learning_rate": 4.4405139481924485e-08, + "loss": 0.2852, + "step": 6101 + }, + { + "epoch": 2.9155964398781435, + "grad_norm": 0.43253137272245884, + "learning_rate": 4.39107409658468e-08, + "loss": 0.2523, + "step": 6102 + }, + { + "epoch": 2.916074308583717, + "grad_norm": 0.8312520936498295, + "learning_rate": 4.341910409330741e-08, + "loss": 0.2635, + "step": 6103 + }, + { + "epoch": 2.9165521772892897, + "grad_norm": 0.6559893173522333, + "learning_rate": 4.293022900067723e-08, + "loss": 0.2503, + "step": 6104 + }, + { + "epoch": 2.917030045994863, + "grad_norm": 0.4843023216915645, + "learning_rate": 4.2444115823562226e-08, + "loss": 0.2675, + "step": 6105 + }, + { + "epoch": 2.917507914700436, + "grad_norm": 0.43442410136217113, + "learning_rate": 4.196076469680122e-08, + "loss": 0.2843, + "step": 6106 + }, + { + "epoch": 2.9179857834060092, + "grad_norm": 0.43098967320599624, + "learning_rate": 4.148017575446695e-08, + "loss": 0.2564, + "step": 6107 + }, + { + "epoch": 2.9184636521115825, + "grad_norm": 0.4679928302022156, + "learning_rate": 4.100234912986611e-08, + "loss": 0.2544, + "step": 6108 + }, + { + "epoch": 2.9189415208171554, + "grad_norm": 0.4269404926785668, + "learning_rate": 4.052728495554159e-08, + "loss": 0.2624, + "step": 6109 + }, + { + "epoch": 2.9194193895227287, + "grad_norm": 0.46370533071441733, + "learning_rate": 4.005498336326463e-08, + "loss": 0.2847, + "step": 6110 + }, + { + "epoch": 2.9198972582283016, + "grad_norm": 0.4586911857247521, + "learning_rate": 3.958544448404489e-08, + "loss": 0.2683, + "step": 6111 + }, + { + "epoch": 2.920375126933875, + "grad_norm": 0.4440943392862457, + "learning_rate": 3.911866844812484e-08, + "loss": 0.2576, + "step": 6112 + }, + { + "epoch": 2.9208529956394482, + "grad_norm": 0.4520138400943036, + "learning_rate": 3.865465538497981e-08, + "loss": 0.2718, + "step": 6113 + }, + { + "epoch": 2.921330864345021, + "grad_norm": 0.7797476182577636, + "learning_rate": 3.819340542331684e-08, + "loss": 0.2698, + "step": 6114 + }, + { + "epoch": 2.9218087330505944, + "grad_norm": 0.46668125602963445, + "learning_rate": 3.773491869108137e-08, + "loss": 0.2821, + "step": 6115 + }, + { + "epoch": 2.9222866017561673, + "grad_norm": 0.4473526269050483, + "learning_rate": 3.727919531544721e-08, + "loss": 0.2619, + "step": 6116 + }, + { + "epoch": 2.9227644704617406, + "grad_norm": 0.4465730584095882, + "learning_rate": 3.6826235422824375e-08, + "loss": 0.2719, + "step": 6117 + }, + { + "epoch": 2.923242339167314, + "grad_norm": 0.4405667747199723, + "learning_rate": 3.63760391388579e-08, + "loss": 0.2582, + "step": 6118 + }, + { + "epoch": 2.923720207872887, + "grad_norm": 0.43242706008898524, + "learning_rate": 3.5928606588422344e-08, + "loss": 0.2616, + "step": 6119 + }, + { + "epoch": 2.92419807657846, + "grad_norm": 0.43740281555793414, + "learning_rate": 3.548393789562732e-08, + "loss": 0.2542, + "step": 6120 + }, + { + "epoch": 2.924675945284033, + "grad_norm": 0.436622075631485, + "learning_rate": 3.504203318381749e-08, + "loss": 0.26, + "step": 6121 + }, + { + "epoch": 2.9251538139896063, + "grad_norm": 0.4323874580312323, + "learning_rate": 3.460289257556926e-08, + "loss": 0.2544, + "step": 6122 + }, + { + "epoch": 2.9256316826951796, + "grad_norm": 0.4449893196540946, + "learning_rate": 3.416651619269073e-08, + "loss": 0.2632, + "step": 6123 + }, + { + "epoch": 2.926109551400753, + "grad_norm": 0.4383521097185537, + "learning_rate": 3.373290415622732e-08, + "loss": 0.2594, + "step": 6124 + }, + { + "epoch": 2.9265874201063258, + "grad_norm": 0.4616607673944823, + "learning_rate": 3.3302056586453916e-08, + "loss": 0.2419, + "step": 6125 + }, + { + "epoch": 2.927065288811899, + "grad_norm": 0.43426940100872696, + "learning_rate": 3.287397360288047e-08, + "loss": 0.2517, + "step": 6126 + }, + { + "epoch": 2.927543157517472, + "grad_norm": 0.43983360686347156, + "learning_rate": 3.244865532424979e-08, + "loss": 0.2776, + "step": 6127 + }, + { + "epoch": 2.9280210262230453, + "grad_norm": 0.42574611069588764, + "learning_rate": 3.2026101868538605e-08, + "loss": 0.255, + "step": 6128 + }, + { + "epoch": 2.9284988949286186, + "grad_norm": 0.4331855136966984, + "learning_rate": 3.1606313352953166e-08, + "loss": 0.2561, + "step": 6129 + }, + { + "epoch": 2.9289767636341915, + "grad_norm": 0.4360937130460965, + "learning_rate": 3.118928989393699e-08, + "loss": 0.2666, + "step": 6130 + }, + { + "epoch": 2.9294546323397648, + "grad_norm": 0.4647708422391565, + "learning_rate": 3.077503160716533e-08, + "loss": 0.2639, + "step": 6131 + }, + { + "epoch": 2.9299325010453376, + "grad_norm": 0.43528666761458085, + "learning_rate": 3.0363538607546264e-08, + "loss": 0.2655, + "step": 6132 + }, + { + "epoch": 2.930410369750911, + "grad_norm": 0.4433119943756909, + "learning_rate": 2.99548110092196e-08, + "loss": 0.2583, + "step": 6133 + }, + { + "epoch": 2.9308882384564843, + "grad_norm": 0.440304631789597, + "learning_rate": 2.9548848925560204e-08, + "loss": 0.2647, + "step": 6134 + }, + { + "epoch": 2.931366107162057, + "grad_norm": 0.4259765178512256, + "learning_rate": 2.9145652469174666e-08, + "loss": 0.2695, + "step": 6135 + }, + { + "epoch": 2.9318439758676305, + "grad_norm": 0.4379793362905143, + "learning_rate": 2.874522175190242e-08, + "loss": 0.2571, + "step": 6136 + }, + { + "epoch": 2.9323218445732033, + "grad_norm": 0.4505313706517066, + "learning_rate": 2.8347556884814608e-08, + "loss": 0.2732, + "step": 6137 + }, + { + "epoch": 2.9327997132787766, + "grad_norm": 0.43130212488145003, + "learning_rate": 2.7952657978218557e-08, + "loss": 0.2715, + "step": 6138 + }, + { + "epoch": 2.93327758198435, + "grad_norm": 0.455093876629633, + "learning_rate": 2.7560525141651085e-08, + "loss": 0.2708, + "step": 6139 + }, + { + "epoch": 2.933755450689923, + "grad_norm": 0.542151458023748, + "learning_rate": 2.7171158483882963e-08, + "loss": 0.2667, + "step": 6140 + }, + { + "epoch": 2.934233319395496, + "grad_norm": 0.43987512532735656, + "learning_rate": 2.678455811291669e-08, + "loss": 0.2648, + "step": 6141 + }, + { + "epoch": 2.934711188101069, + "grad_norm": 0.4611481415195722, + "learning_rate": 2.640072413599093e-08, + "loss": 0.2523, + "step": 6142 + }, + { + "epoch": 2.9351890568066423, + "grad_norm": 0.4680505547187445, + "learning_rate": 2.6019656659572734e-08, + "loss": 0.2761, + "step": 6143 + }, + { + "epoch": 2.9356669255122156, + "grad_norm": 0.4347789880573274, + "learning_rate": 2.564135578936422e-08, + "loss": 0.2678, + "step": 6144 + }, + { + "epoch": 2.9361447942177885, + "grad_norm": 0.43779621846285605, + "learning_rate": 2.5265821630298116e-08, + "loss": 0.2703, + "step": 6145 + }, + { + "epoch": 2.936622662923362, + "grad_norm": 0.5080676465853426, + "learning_rate": 2.4893054286542207e-08, + "loss": 0.282, + "step": 6146 + }, + { + "epoch": 2.9371005316289347, + "grad_norm": 0.4240127151465205, + "learning_rate": 2.4523053861494894e-08, + "loss": 0.2666, + "step": 6147 + }, + { + "epoch": 2.937578400334508, + "grad_norm": 0.43808035306246806, + "learning_rate": 2.4155820457788525e-08, + "loss": 0.2559, + "step": 6148 + }, + { + "epoch": 2.9380562690400813, + "grad_norm": 0.4600124982961411, + "learning_rate": 2.3791354177286062e-08, + "loss": 0.2551, + "step": 6149 + }, + { + "epoch": 2.9385341377456546, + "grad_norm": 0.4808481604273592, + "learning_rate": 2.3429655121085525e-08, + "loss": 0.2572, + "step": 6150 + }, + { + "epoch": 2.9390120064512275, + "grad_norm": 0.4420542869404689, + "learning_rate": 2.3070723389514437e-08, + "loss": 0.2684, + "step": 6151 + }, + { + "epoch": 2.939489875156801, + "grad_norm": 0.42935263804364654, + "learning_rate": 2.2714559082134267e-08, + "loss": 0.2482, + "step": 6152 + }, + { + "epoch": 2.9399677438623737, + "grad_norm": 0.4476128760597761, + "learning_rate": 2.2361162297739327e-08, + "loss": 0.2692, + "step": 6153 + }, + { + "epoch": 2.940445612567947, + "grad_norm": 0.4687545989577253, + "learning_rate": 2.201053313435564e-08, + "loss": 0.2652, + "step": 6154 + }, + { + "epoch": 2.9409234812735203, + "grad_norm": 0.4447212940871088, + "learning_rate": 2.1662671689242076e-08, + "loss": 0.2644, + "step": 6155 + }, + { + "epoch": 2.941401349979093, + "grad_norm": 0.4315254295809012, + "learning_rate": 2.131757805888701e-08, + "loss": 0.2588, + "step": 6156 + }, + { + "epoch": 2.9418792186846665, + "grad_norm": 0.43460273368097063, + "learning_rate": 2.0975252339016095e-08, + "loss": 0.2647, + "step": 6157 + }, + { + "epoch": 2.9423570873902394, + "grad_norm": 0.4537080705014196, + "learning_rate": 2.0635694624582258e-08, + "loss": 0.2605, + "step": 6158 + }, + { + "epoch": 2.9428349560958127, + "grad_norm": 0.5945505658301509, + "learning_rate": 2.0298905009774606e-08, + "loss": 0.2566, + "step": 6159 + }, + { + "epoch": 2.943312824801386, + "grad_norm": 0.4221694390673157, + "learning_rate": 1.996488358801174e-08, + "loss": 0.2622, + "step": 6160 + }, + { + "epoch": 2.943790693506959, + "grad_norm": 0.4369946666917783, + "learning_rate": 1.9633630451945106e-08, + "loss": 0.2588, + "step": 6161 + }, + { + "epoch": 2.944268562212532, + "grad_norm": 0.4581796037391759, + "learning_rate": 1.9305145693457873e-08, + "loss": 0.262, + "step": 6162 + }, + { + "epoch": 2.944746430918105, + "grad_norm": 0.45812073938100284, + "learning_rate": 1.897942940366715e-08, + "loss": 0.2633, + "step": 6163 + }, + { + "epoch": 2.9452242996236784, + "grad_norm": 0.4342899841434601, + "learning_rate": 1.8656481672921778e-08, + "loss": 0.2524, + "step": 6164 + }, + { + "epoch": 2.9457021683292517, + "grad_norm": 0.5515670473317069, + "learning_rate": 1.8336302590798992e-08, + "loss": 0.2504, + "step": 6165 + }, + { + "epoch": 2.9461800370348246, + "grad_norm": 0.4650205257532432, + "learning_rate": 1.8018892246113307e-08, + "loss": 0.2555, + "step": 6166 + }, + { + "epoch": 2.946657905740398, + "grad_norm": 0.5280605407104664, + "learning_rate": 1.7704250726907623e-08, + "loss": 0.2598, + "step": 6167 + }, + { + "epoch": 2.9471357744459707, + "grad_norm": 0.4312322170959968, + "learning_rate": 1.7392378120457686e-08, + "loss": 0.2695, + "step": 6168 + }, + { + "epoch": 2.947613643151544, + "grad_norm": 0.43878655465975397, + "learning_rate": 1.7083274513272075e-08, + "loss": 0.2416, + "step": 6169 + }, + { + "epoch": 2.9480915118571174, + "grad_norm": 0.47724365855005124, + "learning_rate": 1.677693999109109e-08, + "loss": 0.2799, + "step": 6170 + }, + { + "epoch": 2.9485693805626902, + "grad_norm": 0.42611304191004884, + "learning_rate": 1.6473374638885653e-08, + "loss": 0.2656, + "step": 6171 + }, + { + "epoch": 2.9490472492682636, + "grad_norm": 0.4437090768992525, + "learning_rate": 1.6172578540859518e-08, + "loss": 0.2739, + "step": 6172 + }, + { + "epoch": 2.9495251179738364, + "grad_norm": 0.4993388134546404, + "learning_rate": 1.5874551780448168e-08, + "loss": 0.2611, + "step": 6173 + }, + { + "epoch": 2.9500029866794097, + "grad_norm": 0.5246384618817709, + "learning_rate": 1.5579294440319914e-08, + "loss": 0.26, + "step": 6174 + }, + { + "epoch": 2.950480855384983, + "grad_norm": 0.4396611447170219, + "learning_rate": 1.5286806602372583e-08, + "loss": 0.2662, + "step": 6175 + }, + { + "epoch": 2.9509587240905564, + "grad_norm": 0.43916399298231507, + "learning_rate": 1.4997088347737942e-08, + "loss": 0.2786, + "step": 6176 + }, + { + "epoch": 2.9514365927961292, + "grad_norm": 0.4416382489397697, + "learning_rate": 1.4710139756778374e-08, + "loss": 0.2701, + "step": 6177 + }, + { + "epoch": 2.9519144615017026, + "grad_norm": 0.44737409568336867, + "learning_rate": 1.4425960909087989e-08, + "loss": 0.27, + "step": 6178 + }, + { + "epoch": 2.9523923302072754, + "grad_norm": 0.44284766864103403, + "learning_rate": 1.414455188349484e-08, + "loss": 0.264, + "step": 6179 + }, + { + "epoch": 2.9528701989128487, + "grad_norm": 0.5614132268071157, + "learning_rate": 1.3865912758054267e-08, + "loss": 0.2693, + "step": 6180 + }, + { + "epoch": 2.953348067618422, + "grad_norm": 0.44645974337163896, + "learning_rate": 1.3590043610057779e-08, + "loss": 0.2856, + "step": 6181 + }, + { + "epoch": 2.953825936323995, + "grad_norm": 0.4384106162295247, + "learning_rate": 1.3316944516026386e-08, + "loss": 0.2532, + "step": 6182 + }, + { + "epoch": 2.9543038050295682, + "grad_norm": 0.43462058917441077, + "learning_rate": 1.3046615551711716e-08, + "loss": 0.2646, + "step": 6183 + }, + { + "epoch": 2.954781673735141, + "grad_norm": 0.5819832565699149, + "learning_rate": 1.2779056792099343e-08, + "loss": 0.2679, + "step": 6184 + }, + { + "epoch": 2.9552595424407144, + "grad_norm": 0.4460079501289947, + "learning_rate": 1.2514268311405452e-08, + "loss": 0.246, + "step": 6185 + }, + { + "epoch": 2.9557374111462877, + "grad_norm": 0.4356231894949297, + "learning_rate": 1.2252250183076852e-08, + "loss": 0.262, + "step": 6186 + }, + { + "epoch": 2.9562152798518606, + "grad_norm": 0.43337849139435436, + "learning_rate": 1.1993002479793181e-08, + "loss": 0.2683, + "step": 6187 + }, + { + "epoch": 2.956693148557434, + "grad_norm": 0.4622749840308141, + "learning_rate": 1.1736525273465805e-08, + "loss": 0.2842, + "step": 6188 + }, + { + "epoch": 2.957171017263007, + "grad_norm": 0.4502631856582929, + "learning_rate": 1.1482818635235592e-08, + "loss": 0.2788, + "step": 6189 + }, + { + "epoch": 2.95764888596858, + "grad_norm": 0.47079401887102384, + "learning_rate": 1.1231882635477364e-08, + "loss": 0.2614, + "step": 6190 + }, + { + "epoch": 2.9581267546741534, + "grad_norm": 0.4695321059725002, + "learning_rate": 1.0983717343796552e-08, + "loss": 0.2673, + "step": 6191 + }, + { + "epoch": 2.9586046233797263, + "grad_norm": 0.44118500556042844, + "learning_rate": 1.0738322829028092e-08, + "loss": 0.259, + "step": 6192 + }, + { + "epoch": 2.9590824920852996, + "grad_norm": 0.4277238079715053, + "learning_rate": 1.0495699159241979e-08, + "loss": 0.2523, + "step": 6193 + }, + { + "epoch": 2.9595603607908725, + "grad_norm": 0.4388918259690208, + "learning_rate": 1.0255846401737713e-08, + "loss": 0.2711, + "step": 6194 + }, + { + "epoch": 2.960038229496446, + "grad_norm": 0.43524845914682875, + "learning_rate": 1.0018764623045407e-08, + "loss": 0.2712, + "step": 6195 + }, + { + "epoch": 2.960516098202019, + "grad_norm": 0.43647257946844553, + "learning_rate": 9.784453888926903e-09, + "loss": 0.2615, + "step": 6196 + }, + { + "epoch": 2.960993966907592, + "grad_norm": 0.4421145755712329, + "learning_rate": 9.552914264376878e-09, + "loss": 0.2705, + "step": 6197 + }, + { + "epoch": 2.9614718356131653, + "grad_norm": 0.44265663704994035, + "learning_rate": 9.324145813619512e-09, + "loss": 0.2559, + "step": 6198 + }, + { + "epoch": 2.961949704318738, + "grad_norm": 0.49174281184358326, + "learning_rate": 9.098148600111822e-09, + "loss": 0.2795, + "step": 6199 + }, + { + "epoch": 2.9624275730243115, + "grad_norm": 0.4440331427084486, + "learning_rate": 8.874922686541442e-09, + "loss": 0.2811, + "step": 6200 + }, + { + "epoch": 2.962905441729885, + "grad_norm": 0.44320171963272337, + "learning_rate": 8.654468134826621e-09, + "loss": 0.2501, + "step": 6201 + }, + { + "epoch": 2.963383310435458, + "grad_norm": 0.44354434096176776, + "learning_rate": 8.436785006118442e-09, + "loss": 0.2701, + "step": 6202 + }, + { + "epoch": 2.963861179141031, + "grad_norm": 0.4389788144482154, + "learning_rate": 8.221873360798604e-09, + "loss": 0.2738, + "step": 6203 + }, + { + "epoch": 2.9643390478466043, + "grad_norm": 0.49592260296870216, + "learning_rate": 8.009733258478314e-09, + "loss": 0.268, + "step": 6204 + }, + { + "epoch": 2.964816916552177, + "grad_norm": 0.4400993468472191, + "learning_rate": 7.800364758002721e-09, + "loss": 0.2597, + "step": 6205 + }, + { + "epoch": 2.9652947852577505, + "grad_norm": 0.4309074500166779, + "learning_rate": 7.593767917445372e-09, + "loss": 0.264, + "step": 6206 + }, + { + "epoch": 2.965772653963324, + "grad_norm": 0.45179483541632487, + "learning_rate": 7.389942794114868e-09, + "loss": 0.2736, + "step": 6207 + }, + { + "epoch": 2.9662505226688967, + "grad_norm": 0.4220551564794903, + "learning_rate": 7.188889444548208e-09, + "loss": 0.2685, + "step": 6208 + }, + { + "epoch": 2.96672839137447, + "grad_norm": 0.4806287977254884, + "learning_rate": 6.990607924511894e-09, + "loss": 0.2508, + "step": 6209 + }, + { + "epoch": 2.967206260080043, + "grad_norm": 0.4320953184140779, + "learning_rate": 6.795098289008595e-09, + "loss": 0.2842, + "step": 6210 + }, + { + "epoch": 2.967684128785616, + "grad_norm": 0.43834356326205404, + "learning_rate": 6.602360592267154e-09, + "loss": 0.279, + "step": 6211 + }, + { + "epoch": 2.9681619974911895, + "grad_norm": 0.4481157300961235, + "learning_rate": 6.412394887750362e-09, + "loss": 0.2663, + "step": 6212 + }, + { + "epoch": 2.9686398661967623, + "grad_norm": 0.4295979402150286, + "learning_rate": 6.225201228151623e-09, + "loss": 0.2652, + "step": 6213 + }, + { + "epoch": 2.9691177349023357, + "grad_norm": 0.6031224976708806, + "learning_rate": 6.040779665394958e-09, + "loss": 0.2506, + "step": 6214 + }, + { + "epoch": 2.9695956036079085, + "grad_norm": 0.4315091977947095, + "learning_rate": 5.859130250636113e-09, + "loss": 0.2695, + "step": 6215 + }, + { + "epoch": 2.970073472313482, + "grad_norm": 0.6583738125813944, + "learning_rate": 5.680253034260341e-09, + "loss": 0.2548, + "step": 6216 + }, + { + "epoch": 2.970551341019055, + "grad_norm": 0.4378215901474907, + "learning_rate": 5.504148065885728e-09, + "loss": 0.2646, + "step": 6217 + }, + { + "epoch": 2.971029209724628, + "grad_norm": 0.4503142761167827, + "learning_rate": 5.330815394359867e-09, + "loss": 0.2473, + "step": 6218 + }, + { + "epoch": 2.9715070784302013, + "grad_norm": 0.4469746516278154, + "learning_rate": 5.160255067764297e-09, + "loss": 0.268, + "step": 6219 + }, + { + "epoch": 2.971984947135774, + "grad_norm": 0.4335561129582791, + "learning_rate": 4.992467133406731e-09, + "loss": 0.2516, + "step": 6220 + }, + { + "epoch": 2.9724628158413475, + "grad_norm": 0.4418315312058818, + "learning_rate": 4.827451637829938e-09, + "loss": 0.2604, + "step": 6221 + }, + { + "epoch": 2.972940684546921, + "grad_norm": 0.4494480907514312, + "learning_rate": 4.665208626807305e-09, + "loss": 0.254, + "step": 6222 + }, + { + "epoch": 2.9734185532524937, + "grad_norm": 0.4324550053316957, + "learning_rate": 4.50573814534061e-09, + "loss": 0.2436, + "step": 6223 + }, + { + "epoch": 2.973896421958067, + "grad_norm": 0.47336069337487435, + "learning_rate": 4.34904023766447e-09, + "loss": 0.2705, + "step": 6224 + }, + { + "epoch": 2.97437429066364, + "grad_norm": 0.4628580904559059, + "learning_rate": 4.195114947244117e-09, + "loss": 0.2687, + "step": 6225 + }, + { + "epoch": 2.974852159369213, + "grad_norm": 0.43605946474328056, + "learning_rate": 4.043962316775396e-09, + "loss": 0.2634, + "step": 6226 + }, + { + "epoch": 2.9753300280747865, + "grad_norm": 0.42009356204523896, + "learning_rate": 3.895582388186991e-09, + "loss": 0.2566, + "step": 6227 + }, + { + "epoch": 2.97580789678036, + "grad_norm": 0.44595200592177053, + "learning_rate": 3.749975202635981e-09, + "loss": 0.2609, + "step": 6228 + }, + { + "epoch": 2.9762857654859327, + "grad_norm": 0.45865874193509315, + "learning_rate": 3.607140800510056e-09, + "loss": 0.2625, + "step": 6229 + }, + { + "epoch": 2.976763634191506, + "grad_norm": 0.5227990370320047, + "learning_rate": 3.4670792214297476e-09, + "loss": 0.2808, + "step": 6230 + }, + { + "epoch": 2.977241502897079, + "grad_norm": 0.444601214169423, + "learning_rate": 3.3297905042462e-09, + "loss": 0.2686, + "step": 6231 + }, + { + "epoch": 2.977719371602652, + "grad_norm": 0.44325315886349526, + "learning_rate": 3.1952746870411723e-09, + "loss": 0.259, + "step": 6232 + }, + { + "epoch": 2.9781972403082255, + "grad_norm": 0.4263331644343002, + "learning_rate": 3.06353180712593e-09, + "loss": 0.2601, + "step": 6233 + }, + { + "epoch": 2.9786751090137984, + "grad_norm": 0.445388777913223, + "learning_rate": 2.9345619010434644e-09, + "loss": 0.2709, + "step": 6234 + }, + { + "epoch": 2.9791529777193717, + "grad_norm": 0.4290478297016977, + "learning_rate": 2.808365004569602e-09, + "loss": 0.2349, + "step": 6235 + }, + { + "epoch": 2.9796308464249446, + "grad_norm": 0.43300161485588196, + "learning_rate": 2.684941152706344e-09, + "loss": 0.254, + "step": 6236 + }, + { + "epoch": 2.980108715130518, + "grad_norm": 0.44745868192997307, + "learning_rate": 2.5642903796918583e-09, + "loss": 0.2586, + "step": 6237 + }, + { + "epoch": 2.980586583836091, + "grad_norm": 0.5017074416234102, + "learning_rate": 2.4464127189915975e-09, + "loss": 0.2563, + "step": 6238 + }, + { + "epoch": 2.981064452541664, + "grad_norm": 0.43904803227553557, + "learning_rate": 2.33130820330163e-09, + "loss": 0.2695, + "step": 6239 + }, + { + "epoch": 2.9815423212472374, + "grad_norm": 0.4428267317870827, + "learning_rate": 2.2189768645519693e-09, + "loss": 0.2783, + "step": 6240 + }, + { + "epoch": 2.9820201899528103, + "grad_norm": 0.44591251523595293, + "learning_rate": 2.109418733899915e-09, + "loss": 0.2721, + "step": 6241 + }, + { + "epoch": 2.9824980586583836, + "grad_norm": 0.4610652816160574, + "learning_rate": 2.0026338417344913e-09, + "loss": 0.2717, + "step": 6242 + }, + { + "epoch": 2.982975927363957, + "grad_norm": 0.4494233009141444, + "learning_rate": 1.898622217677559e-09, + "loss": 0.2636, + "step": 6243 + }, + { + "epoch": 2.9834537960695298, + "grad_norm": 0.4363101853354611, + "learning_rate": 1.7973838905793739e-09, + "loss": 0.2728, + "step": 6244 + }, + { + "epoch": 2.983931664775103, + "grad_norm": 0.4882459582895431, + "learning_rate": 1.6989188885219165e-09, + "loss": 0.2542, + "step": 6245 + }, + { + "epoch": 2.984409533480676, + "grad_norm": 0.44390851843666584, + "learning_rate": 1.6032272388166736e-09, + "loss": 0.2714, + "step": 6246 + }, + { + "epoch": 2.9848874021862493, + "grad_norm": 0.4429583465368995, + "learning_rate": 1.5103089680079674e-09, + "loss": 0.271, + "step": 6247 + }, + { + "epoch": 2.9853652708918226, + "grad_norm": 0.4302641158411793, + "learning_rate": 1.4201641018685152e-09, + "loss": 0.264, + "step": 6248 + }, + { + "epoch": 2.985843139597396, + "grad_norm": 0.4371029630970897, + "learning_rate": 1.3327926654049805e-09, + "loss": 0.2669, + "step": 6249 + }, + { + "epoch": 2.9863210083029688, + "grad_norm": 0.44144481030011334, + "learning_rate": 1.2481946828502011e-09, + "loss": 0.2598, + "step": 6250 + }, + { + "epoch": 2.9867988770085416, + "grad_norm": 0.5001018088911403, + "learning_rate": 1.1663701776709613e-09, + "loss": 0.2434, + "step": 6251 + }, + { + "epoch": 2.987276745714115, + "grad_norm": 0.430612900839865, + "learning_rate": 1.0873191725646604e-09, + "loss": 0.2798, + "step": 6252 + }, + { + "epoch": 2.9877546144196883, + "grad_norm": 0.45350891778339647, + "learning_rate": 1.0110416894593133e-09, + "loss": 0.2716, + "step": 6253 + }, + { + "epoch": 2.9882324831252616, + "grad_norm": 0.4337165441795163, + "learning_rate": 9.375377495102201e-10, + "loss": 0.254, + "step": 6254 + }, + { + "epoch": 2.9887103518308344, + "grad_norm": 0.4462339228646419, + "learning_rate": 8.668073731088467e-10, + "loss": 0.2599, + "step": 6255 + }, + { + "epoch": 2.9891882205364078, + "grad_norm": 0.45995272797479353, + "learning_rate": 7.988505798728341e-10, + "loss": 0.2673, + "step": 6256 + }, + { + "epoch": 2.9896660892419806, + "grad_norm": 0.43074554696329737, + "learning_rate": 7.33667388652659e-10, + "loss": 0.2644, + "step": 6257 + }, + { + "epoch": 2.990143957947554, + "grad_norm": 0.44281340320268414, + "learning_rate": 6.712578175294138e-10, + "loss": 0.2684, + "step": 6258 + }, + { + "epoch": 2.9906218266531273, + "grad_norm": 0.45094383291551116, + "learning_rate": 6.116218838148058e-10, + "loss": 0.2601, + "step": 6259 + }, + { + "epoch": 2.9910996953587, + "grad_norm": 0.4263274399474076, + "learning_rate": 5.547596040489378e-10, + "loss": 0.2628, + "step": 6260 + }, + { + "epoch": 2.9915775640642734, + "grad_norm": 0.44924007993713105, + "learning_rate": 5.006709940058585e-10, + "loss": 0.2646, + "step": 6261 + }, + { + "epoch": 2.9920554327698463, + "grad_norm": 0.4157503424555004, + "learning_rate": 4.4935606868912186e-10, + "loss": 0.2688, + "step": 6262 + }, + { + "epoch": 2.9925333014754196, + "grad_norm": 0.5530870038829894, + "learning_rate": 4.008148423306768e-10, + "loss": 0.261, + "step": 6263 + }, + { + "epoch": 2.993011170180993, + "grad_norm": 0.43360985730611284, + "learning_rate": 3.5504732839752867e-10, + "loss": 0.2605, + "step": 6264 + }, + { + "epoch": 2.993489038886566, + "grad_norm": 0.4305991915878331, + "learning_rate": 3.1205353958285724e-10, + "loss": 0.2565, + "step": 6265 + }, + { + "epoch": 2.993966907592139, + "grad_norm": 0.46592325654029976, + "learning_rate": 2.718334878137885e-10, + "loss": 0.2678, + "step": 6266 + }, + { + "epoch": 2.994444776297712, + "grad_norm": 0.44676400478930745, + "learning_rate": 2.3438718424473315e-10, + "loss": 0.2524, + "step": 6267 + }, + { + "epoch": 2.9949226450032853, + "grad_norm": 0.4574246654627526, + "learning_rate": 1.9971463926515833e-10, + "loss": 0.2602, + "step": 6268 + }, + { + "epoch": 2.9954005137088586, + "grad_norm": 0.45138653999661854, + "learning_rate": 1.6781586249070559e-10, + "loss": 0.2489, + "step": 6269 + }, + { + "epoch": 2.9958783824144315, + "grad_norm": 0.44396985604168704, + "learning_rate": 1.3869086276985243e-10, + "loss": 0.2679, + "step": 6270 + }, + { + "epoch": 2.996356251120005, + "grad_norm": 0.4756036796960099, + "learning_rate": 1.1233964818169185e-10, + "loss": 0.2624, + "step": 6271 + }, + { + "epoch": 2.9968341198255777, + "grad_norm": 0.46382667279220274, + "learning_rate": 8.876222603593221e-11, + "loss": 0.2646, + "step": 6272 + }, + { + "epoch": 2.997311988531151, + "grad_norm": 0.4785623678848972, + "learning_rate": 6.795860287178713e-11, + "loss": 0.2526, + "step": 6273 + }, + { + "epoch": 2.9977898572367243, + "grad_norm": 0.42404628861817545, + "learning_rate": 4.992878446019589e-11, + "loss": 0.2651, + "step": 6274 + }, + { + "epoch": 2.9982677259422976, + "grad_norm": 0.45160121782102897, + "learning_rate": 3.467277580271322e-11, + "loss": 0.2663, + "step": 6275 + }, + { + "epoch": 2.9987455946478705, + "grad_norm": 0.4354577405562673, + "learning_rate": 2.219058113039907e-11, + "loss": 0.2655, + "step": 6276 + }, + { + "epoch": 2.9992234633534434, + "grad_norm": 0.45460340307340963, + "learning_rate": 1.2482203904928824e-11, + "loss": 0.2514, + "step": 6277 + }, + { + "epoch": 2.9997013320590167, + "grad_norm": 0.47846568959514535, + "learning_rate": 5.547646820813768e-12, + "loss": 0.2667, + "step": 6278 + }, + { + "epoch": 3.0, + "grad_norm": 0.5483148609921531, + "learning_rate": 1.3869118009601778e-12, + "loss": 0.2654, + "step": 6279 + } + ], + "logging_steps": 1, + "max_steps": 6279, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 800, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5031825133207552e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}