{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.961451247165533, "eval_steps": 500, "global_step": 1155, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006046863189720333, "grad_norm": 3.711113929748535, "learning_rate": 0.0, "loss": 0.8664, "step": 1 }, { "epoch": 0.012093726379440665, "grad_norm": 4.3737382888793945, "learning_rate": 2.7894294565112984e-06, "loss": 0.9135, "step": 2 }, { "epoch": 0.018140589569160998, "grad_norm": 4.166050910949707, "learning_rate": 4.421141086977404e-06, "loss": 0.8859, "step": 3 }, { "epoch": 0.02418745275888133, "grad_norm": 3.66565203666687, "learning_rate": 5.578858913022597e-06, "loss": 0.8918, "step": 4 }, { "epoch": 0.030234315948601664, "grad_norm": 4.2954020500183105, "learning_rate": 6.47685462377997e-06, "loss": 0.9257, "step": 5 }, { "epoch": 0.036281179138321996, "grad_norm": 3.8972299098968506, "learning_rate": 7.210570543488702e-06, "loss": 0.9237, "step": 6 }, { "epoch": 0.042328042328042326, "grad_norm": 4.1708879470825195, "learning_rate": 7.830918514469461e-06, "loss": 0.9304, "step": 7 }, { "epoch": 0.04837490551776266, "grad_norm": 3.7282660007476807, "learning_rate": 8.368288369533896e-06, "loss": 0.8657, "step": 8 }, { "epoch": 0.05442176870748299, "grad_norm": 3.548099994659424, "learning_rate": 8.842282173954808e-06, "loss": 0.8375, "step": 9 }, { "epoch": 0.06046863189720333, "grad_norm": 3.497669219970703, "learning_rate": 9.26628408029127e-06, "loss": 0.8647, "step": 10 }, { "epoch": 0.06651549508692366, "grad_norm": 3.3578875064849854, "learning_rate": 9.64984045981344e-06, "loss": 0.8572, "step": 11 }, { "epoch": 0.07256235827664399, "grad_norm": 3.4553167819976807, "learning_rate": 1e-05, "loss": 0.8212, "step": 12 }, { "epoch": 0.07860922146636433, "grad_norm": 3.311001777648926, "learning_rate": 1e-05, "loss": 0.791, "step": 13 }, { "epoch": 0.08465608465608465, "grad_norm": 3.3359620571136475, "learning_rate": 1e-05, "loss": 0.8568, "step": 14 }, { "epoch": 0.09070294784580499, "grad_norm": 3.044142961502075, "learning_rate": 1e-05, "loss": 0.8401, "step": 15 }, { "epoch": 0.09674981103552532, "grad_norm": 2.6783864498138428, "learning_rate": 1e-05, "loss": 0.7713, "step": 16 }, { "epoch": 0.10279667422524566, "grad_norm": 2.8936526775360107, "learning_rate": 1e-05, "loss": 0.8308, "step": 17 }, { "epoch": 0.10884353741496598, "grad_norm": 3.396224021911621, "learning_rate": 1e-05, "loss": 0.8157, "step": 18 }, { "epoch": 0.11489040060468632, "grad_norm": 2.8923842906951904, "learning_rate": 1e-05, "loss": 0.7786, "step": 19 }, { "epoch": 0.12093726379440665, "grad_norm": 2.8523995876312256, "learning_rate": 1e-05, "loss": 0.7559, "step": 20 }, { "epoch": 0.12698412698412698, "grad_norm": 2.7873380184173584, "learning_rate": 1e-05, "loss": 0.7619, "step": 21 }, { "epoch": 0.1330309901738473, "grad_norm": 2.699129104614258, "learning_rate": 1e-05, "loss": 0.7664, "step": 22 }, { "epoch": 0.13907785336356765, "grad_norm": 2.7552123069763184, "learning_rate": 1e-05, "loss": 0.7477, "step": 23 }, { "epoch": 0.14512471655328799, "grad_norm": 2.6023707389831543, "learning_rate": 1e-05, "loss": 0.7812, "step": 24 }, { "epoch": 0.15117157974300832, "grad_norm": 2.6848936080932617, "learning_rate": 1e-05, "loss": 0.7472, "step": 25 }, { "epoch": 0.15721844293272866, "grad_norm": 2.3893909454345703, "learning_rate": 1e-05, "loss": 0.7186, "step": 26 }, { "epoch": 0.16326530612244897, "grad_norm": 2.3240654468536377, "learning_rate": 1e-05, "loss": 0.7066, "step": 27 }, { "epoch": 0.1693121693121693, "grad_norm": 2.541768789291382, "learning_rate": 1e-05, "loss": 0.7453, "step": 28 }, { "epoch": 0.17535903250188964, "grad_norm": 2.2445993423461914, "learning_rate": 1e-05, "loss": 0.7239, "step": 29 }, { "epoch": 0.18140589569160998, "grad_norm": 2.2153027057647705, "learning_rate": 1e-05, "loss": 0.7255, "step": 30 }, { "epoch": 0.1874527588813303, "grad_norm": 2.113799810409546, "learning_rate": 1e-05, "loss": 0.6945, "step": 31 }, { "epoch": 0.19349962207105065, "grad_norm": 2.165024757385254, "learning_rate": 1e-05, "loss": 0.6929, "step": 32 }, { "epoch": 0.19954648526077098, "grad_norm": 2.070373773574829, "learning_rate": 1e-05, "loss": 0.6883, "step": 33 }, { "epoch": 0.20559334845049132, "grad_norm": 1.9677635431289673, "learning_rate": 1e-05, "loss": 0.6806, "step": 34 }, { "epoch": 0.21164021164021163, "grad_norm": 2.0685954093933105, "learning_rate": 1e-05, "loss": 0.6869, "step": 35 }, { "epoch": 0.21768707482993196, "grad_norm": 1.96161687374115, "learning_rate": 1e-05, "loss": 0.7144, "step": 36 }, { "epoch": 0.2237339380196523, "grad_norm": 1.8623857498168945, "learning_rate": 1e-05, "loss": 0.7092, "step": 37 }, { "epoch": 0.22978080120937264, "grad_norm": 1.8184101581573486, "learning_rate": 1e-05, "loss": 0.6944, "step": 38 }, { "epoch": 0.23582766439909297, "grad_norm": 1.8283226490020752, "learning_rate": 1e-05, "loss": 0.643, "step": 39 }, { "epoch": 0.2418745275888133, "grad_norm": 1.7799246311187744, "learning_rate": 1e-05, "loss": 0.6594, "step": 40 }, { "epoch": 0.24792139077853365, "grad_norm": 1.7448480129241943, "learning_rate": 1e-05, "loss": 0.6344, "step": 41 }, { "epoch": 0.25396825396825395, "grad_norm": 1.812776803970337, "learning_rate": 1e-05, "loss": 0.6498, "step": 42 }, { "epoch": 0.2600151171579743, "grad_norm": 1.688184142112732, "learning_rate": 1e-05, "loss": 0.637, "step": 43 }, { "epoch": 0.2660619803476946, "grad_norm": 1.6118687391281128, "learning_rate": 1e-05, "loss": 0.6347, "step": 44 }, { "epoch": 0.272108843537415, "grad_norm": 1.760033130645752, "learning_rate": 1e-05, "loss": 0.6599, "step": 45 }, { "epoch": 0.2781557067271353, "grad_norm": 1.6662259101867676, "learning_rate": 1e-05, "loss": 0.6249, "step": 46 }, { "epoch": 0.2842025699168556, "grad_norm": 1.738208293914795, "learning_rate": 1e-05, "loss": 0.6319, "step": 47 }, { "epoch": 0.29024943310657597, "grad_norm": 1.6041938066482544, "learning_rate": 1e-05, "loss": 0.6401, "step": 48 }, { "epoch": 0.2962962962962963, "grad_norm": 1.505116581916809, "learning_rate": 1e-05, "loss": 0.5982, "step": 49 }, { "epoch": 0.30234315948601664, "grad_norm": 1.6501286029815674, "learning_rate": 1e-05, "loss": 0.6082, "step": 50 }, { "epoch": 0.30839002267573695, "grad_norm": 1.6904505491256714, "learning_rate": 1e-05, "loss": 0.6411, "step": 51 }, { "epoch": 0.3144368858654573, "grad_norm": 1.568041205406189, "learning_rate": 1e-05, "loss": 0.6439, "step": 52 }, { "epoch": 0.3204837490551776, "grad_norm": 1.4448561668395996, "learning_rate": 1e-05, "loss": 0.6492, "step": 53 }, { "epoch": 0.32653061224489793, "grad_norm": 1.4173052310943604, "learning_rate": 1e-05, "loss": 0.5715, "step": 54 }, { "epoch": 0.3325774754346183, "grad_norm": 1.4669299125671387, "learning_rate": 1e-05, "loss": 0.5732, "step": 55 }, { "epoch": 0.3386243386243386, "grad_norm": 1.4686517715454102, "learning_rate": 1e-05, "loss": 0.6144, "step": 56 }, { "epoch": 0.34467120181405897, "grad_norm": 1.3720322847366333, "learning_rate": 1e-05, "loss": 0.6028, "step": 57 }, { "epoch": 0.3507180650037793, "grad_norm": 1.359766960144043, "learning_rate": 1e-05, "loss": 0.6075, "step": 58 }, { "epoch": 0.35676492819349964, "grad_norm": 1.3213920593261719, "learning_rate": 1e-05, "loss": 0.5857, "step": 59 }, { "epoch": 0.36281179138321995, "grad_norm": 1.2998356819152832, "learning_rate": 1e-05, "loss": 0.6139, "step": 60 }, { "epoch": 0.3688586545729403, "grad_norm": 1.2622840404510498, "learning_rate": 1e-05, "loss": 0.5689, "step": 61 }, { "epoch": 0.3749055177626606, "grad_norm": 1.3091906309127808, "learning_rate": 1e-05, "loss": 0.5625, "step": 62 }, { "epoch": 0.38095238095238093, "grad_norm": 1.2008758783340454, "learning_rate": 1e-05, "loss": 0.5869, "step": 63 }, { "epoch": 0.3869992441421013, "grad_norm": 1.224686861038208, "learning_rate": 1e-05, "loss": 0.5979, "step": 64 }, { "epoch": 0.3930461073318216, "grad_norm": 1.0641875267028809, "learning_rate": 1e-05, "loss": 0.5636, "step": 65 }, { "epoch": 0.39909297052154197, "grad_norm": 1.101380467414856, "learning_rate": 1e-05, "loss": 0.5857, "step": 66 }, { "epoch": 0.4051398337112623, "grad_norm": 1.0325955152511597, "learning_rate": 1e-05, "loss": 0.5665, "step": 67 }, { "epoch": 0.41118669690098264, "grad_norm": 1.179768443107605, "learning_rate": 1e-05, "loss": 0.5812, "step": 68 }, { "epoch": 0.41723356009070295, "grad_norm": 1.0342235565185547, "learning_rate": 1e-05, "loss": 0.5589, "step": 69 }, { "epoch": 0.42328042328042326, "grad_norm": 1.0457615852355957, "learning_rate": 1e-05, "loss": 0.597, "step": 70 }, { "epoch": 0.4293272864701436, "grad_norm": 0.9849565029144287, "learning_rate": 1e-05, "loss": 0.5839, "step": 71 }, { "epoch": 0.43537414965986393, "grad_norm": 0.982824981212616, "learning_rate": 1e-05, "loss": 0.5398, "step": 72 }, { "epoch": 0.4414210128495843, "grad_norm": 0.9600711464881897, "learning_rate": 1e-05, "loss": 0.571, "step": 73 }, { "epoch": 0.4474678760393046, "grad_norm": 0.9347206354141235, "learning_rate": 1e-05, "loss": 0.5696, "step": 74 }, { "epoch": 0.45351473922902497, "grad_norm": 0.9394077062606812, "learning_rate": 1e-05, "loss": 0.5518, "step": 75 }, { "epoch": 0.4595616024187453, "grad_norm": 0.9704285264015198, "learning_rate": 1e-05, "loss": 0.5415, "step": 76 }, { "epoch": 0.4656084656084656, "grad_norm": 0.9535228610038757, "learning_rate": 1e-05, "loss": 0.5536, "step": 77 }, { "epoch": 0.47165532879818595, "grad_norm": 0.8936919569969177, "learning_rate": 1e-05, "loss": 0.5338, "step": 78 }, { "epoch": 0.47770219198790626, "grad_norm": 0.8748770952224731, "learning_rate": 1e-05, "loss": 0.5385, "step": 79 }, { "epoch": 0.4837490551776266, "grad_norm": 1.0223513841629028, "learning_rate": 1e-05, "loss": 0.557, "step": 80 }, { "epoch": 0.4897959183673469, "grad_norm": 0.9543147087097168, "learning_rate": 1e-05, "loss": 0.5685, "step": 81 }, { "epoch": 0.4958427815570673, "grad_norm": 0.8917294144630432, "learning_rate": 1e-05, "loss": 0.539, "step": 82 }, { "epoch": 0.5018896447467877, "grad_norm": 0.8871799111366272, "learning_rate": 1e-05, "loss": 0.5282, "step": 83 }, { "epoch": 0.5079365079365079, "grad_norm": 0.8421934843063354, "learning_rate": 1e-05, "loss": 0.5627, "step": 84 }, { "epoch": 0.5139833711262283, "grad_norm": 0.9445014595985413, "learning_rate": 1e-05, "loss": 0.5424, "step": 85 }, { "epoch": 0.5200302343159486, "grad_norm": 0.887968897819519, "learning_rate": 1e-05, "loss": 0.5867, "step": 86 }, { "epoch": 0.5260770975056689, "grad_norm": 0.8449985980987549, "learning_rate": 1e-05, "loss": 0.5207, "step": 87 }, { "epoch": 0.5321239606953893, "grad_norm": 0.8589107394218445, "learning_rate": 1e-05, "loss": 0.5393, "step": 88 }, { "epoch": 0.5381708238851096, "grad_norm": 0.9249810576438904, "learning_rate": 1e-05, "loss": 0.5347, "step": 89 }, { "epoch": 0.54421768707483, "grad_norm": 0.7938629984855652, "learning_rate": 1e-05, "loss": 0.5035, "step": 90 }, { "epoch": 0.5502645502645502, "grad_norm": 0.8851982951164246, "learning_rate": 1e-05, "loss": 0.5516, "step": 91 }, { "epoch": 0.5563114134542706, "grad_norm": 1.0278273820877075, "learning_rate": 1e-05, "loss": 0.5565, "step": 92 }, { "epoch": 0.562358276643991, "grad_norm": 0.8049068450927734, "learning_rate": 1e-05, "loss": 0.5181, "step": 93 }, { "epoch": 0.5684051398337112, "grad_norm": 0.8061883449554443, "learning_rate": 1e-05, "loss": 0.5454, "step": 94 }, { "epoch": 0.5744520030234316, "grad_norm": 0.8366959691047668, "learning_rate": 1e-05, "loss": 0.5338, "step": 95 }, { "epoch": 0.5804988662131519, "grad_norm": 0.7461470365524292, "learning_rate": 1e-05, "loss": 0.4844, "step": 96 }, { "epoch": 0.5865457294028723, "grad_norm": 0.7860020399093628, "learning_rate": 1e-05, "loss": 0.5548, "step": 97 }, { "epoch": 0.5925925925925926, "grad_norm": 0.7596490383148193, "learning_rate": 1e-05, "loss": 0.5238, "step": 98 }, { "epoch": 0.5986394557823129, "grad_norm": 0.7378799915313721, "learning_rate": 1e-05, "loss": 0.4983, "step": 99 }, { "epoch": 0.6046863189720333, "grad_norm": 0.7875183820724487, "learning_rate": 1e-05, "loss": 0.5389, "step": 100 }, { "epoch": 0.6107331821617535, "grad_norm": 0.7769007086753845, "learning_rate": 1e-05, "loss": 0.5601, "step": 101 }, { "epoch": 0.6167800453514739, "grad_norm": 0.8144631385803223, "learning_rate": 1e-05, "loss": 0.5473, "step": 102 }, { "epoch": 0.6228269085411943, "grad_norm": 0.7058612704277039, "learning_rate": 1e-05, "loss": 0.514, "step": 103 }, { "epoch": 0.6288737717309146, "grad_norm": 0.6682018041610718, "learning_rate": 1e-05, "loss": 0.5072, "step": 104 }, { "epoch": 0.6349206349206349, "grad_norm": 0.7572987675666809, "learning_rate": 1e-05, "loss": 0.5483, "step": 105 }, { "epoch": 0.6409674981103552, "grad_norm": 0.7027807235717773, "learning_rate": 1e-05, "loss": 0.532, "step": 106 }, { "epoch": 0.6470143613000756, "grad_norm": 0.9655821323394775, "learning_rate": 1e-05, "loss": 0.551, "step": 107 }, { "epoch": 0.6530612244897959, "grad_norm": 0.8295503854751587, "learning_rate": 1e-05, "loss": 0.5587, "step": 108 }, { "epoch": 0.6591080876795162, "grad_norm": 0.7149176001548767, "learning_rate": 1e-05, "loss": 0.532, "step": 109 }, { "epoch": 0.6651549508692366, "grad_norm": 0.6809913516044617, "learning_rate": 1e-05, "loss": 0.4877, "step": 110 }, { "epoch": 0.671201814058957, "grad_norm": 0.8903219103813171, "learning_rate": 1e-05, "loss": 0.5708, "step": 111 }, { "epoch": 0.6772486772486772, "grad_norm": 0.8803872466087341, "learning_rate": 1e-05, "loss": 0.5204, "step": 112 }, { "epoch": 0.6832955404383976, "grad_norm": 0.6723837852478027, "learning_rate": 1e-05, "loss": 0.5108, "step": 113 }, { "epoch": 0.6893424036281179, "grad_norm": 0.6500742435455322, "learning_rate": 1e-05, "loss": 0.4917, "step": 114 }, { "epoch": 0.6953892668178382, "grad_norm": 0.8980260491371155, "learning_rate": 1e-05, "loss": 0.5142, "step": 115 }, { "epoch": 0.7014361300075586, "grad_norm": 0.625626802444458, "learning_rate": 1e-05, "loss": 0.5113, "step": 116 }, { "epoch": 0.7074829931972789, "grad_norm": 0.6847211718559265, "learning_rate": 1e-05, "loss": 0.5099, "step": 117 }, { "epoch": 0.7135298563869993, "grad_norm": 0.6465173363685608, "learning_rate": 1e-05, "loss": 0.497, "step": 118 }, { "epoch": 0.7195767195767195, "grad_norm": 0.7003100514411926, "learning_rate": 1e-05, "loss": 0.4964, "step": 119 }, { "epoch": 0.7256235827664399, "grad_norm": 0.6872817873954773, "learning_rate": 1e-05, "loss": 0.5003, "step": 120 }, { "epoch": 0.7316704459561603, "grad_norm": 1.2496439218521118, "learning_rate": 1e-05, "loss": 0.542, "step": 121 }, { "epoch": 0.7377173091458806, "grad_norm": 0.6454047560691833, "learning_rate": 1e-05, "loss": 0.5268, "step": 122 }, { "epoch": 0.7437641723356009, "grad_norm": 0.6559277176856995, "learning_rate": 1e-05, "loss": 0.5204, "step": 123 }, { "epoch": 0.7498110355253212, "grad_norm": 0.6836625337600708, "learning_rate": 1e-05, "loss": 0.5254, "step": 124 }, { "epoch": 0.7558578987150416, "grad_norm": 0.710595428943634, "learning_rate": 1e-05, "loss": 0.4898, "step": 125 }, { "epoch": 0.7619047619047619, "grad_norm": 0.6704943180084229, "learning_rate": 1e-05, "loss": 0.4879, "step": 126 }, { "epoch": 0.7679516250944822, "grad_norm": 0.6254847049713135, "learning_rate": 1e-05, "loss": 0.5003, "step": 127 }, { "epoch": 0.7739984882842026, "grad_norm": 0.615053117275238, "learning_rate": 1e-05, "loss": 0.4888, "step": 128 }, { "epoch": 0.780045351473923, "grad_norm": 0.6638553738594055, "learning_rate": 1e-05, "loss": 0.5102, "step": 129 }, { "epoch": 0.7860922146636432, "grad_norm": 0.7983378171920776, "learning_rate": 1e-05, "loss": 0.475, "step": 130 }, { "epoch": 0.7921390778533636, "grad_norm": 0.7363254427909851, "learning_rate": 1e-05, "loss": 0.4984, "step": 131 }, { "epoch": 0.7981859410430839, "grad_norm": 0.7959357500076294, "learning_rate": 1e-05, "loss": 0.501, "step": 132 }, { "epoch": 0.8042328042328042, "grad_norm": 0.6690060496330261, "learning_rate": 1e-05, "loss": 0.5184, "step": 133 }, { "epoch": 0.8102796674225246, "grad_norm": 0.8345390558242798, "learning_rate": 1e-05, "loss": 0.5254, "step": 134 }, { "epoch": 0.8163265306122449, "grad_norm": 0.7033228874206543, "learning_rate": 1e-05, "loss": 0.4915, "step": 135 }, { "epoch": 0.8223733938019653, "grad_norm": 0.6511311531066895, "learning_rate": 1e-05, "loss": 0.4895, "step": 136 }, { "epoch": 0.8284202569916855, "grad_norm": 0.6077845692634583, "learning_rate": 1e-05, "loss": 0.4937, "step": 137 }, { "epoch": 0.8344671201814059, "grad_norm": 0.6507716774940491, "learning_rate": 1e-05, "loss": 0.5004, "step": 138 }, { "epoch": 0.8405139833711263, "grad_norm": 0.6580595374107361, "learning_rate": 1e-05, "loss": 0.512, "step": 139 }, { "epoch": 0.8465608465608465, "grad_norm": 0.6461679339408875, "learning_rate": 1e-05, "loss": 0.4722, "step": 140 }, { "epoch": 0.8526077097505669, "grad_norm": 0.6787775158882141, "learning_rate": 1e-05, "loss": 0.4942, "step": 141 }, { "epoch": 0.8586545729402872, "grad_norm": 0.6103595495223999, "learning_rate": 1e-05, "loss": 0.5039, "step": 142 }, { "epoch": 0.8647014361300076, "grad_norm": 0.6503426432609558, "learning_rate": 1e-05, "loss": 0.4656, "step": 143 }, { "epoch": 0.8707482993197279, "grad_norm": 0.6484214663505554, "learning_rate": 1e-05, "loss": 0.4433, "step": 144 }, { "epoch": 0.8767951625094482, "grad_norm": 0.6366398930549622, "learning_rate": 1e-05, "loss": 0.4978, "step": 145 }, { "epoch": 0.8828420256991686, "grad_norm": 0.6443809866905212, "learning_rate": 1e-05, "loss": 0.484, "step": 146 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6322662234306335, "learning_rate": 1e-05, "loss": 0.4921, "step": 147 }, { "epoch": 0.8949357520786092, "grad_norm": 0.6437771320343018, "learning_rate": 1e-05, "loss": 0.4965, "step": 148 }, { "epoch": 0.9009826152683296, "grad_norm": 0.6509385108947754, "learning_rate": 1e-05, "loss": 0.4876, "step": 149 }, { "epoch": 0.9070294784580499, "grad_norm": 0.7864116430282593, "learning_rate": 1e-05, "loss": 0.499, "step": 150 }, { "epoch": 0.9130763416477702, "grad_norm": 0.6669912934303284, "learning_rate": 1e-05, "loss": 0.487, "step": 151 }, { "epoch": 0.9191232048374905, "grad_norm": 0.7598031759262085, "learning_rate": 1e-05, "loss": 0.4672, "step": 152 }, { "epoch": 0.9251700680272109, "grad_norm": 0.6642863154411316, "learning_rate": 1e-05, "loss": 0.5284, "step": 153 }, { "epoch": 0.9312169312169312, "grad_norm": 0.6117709875106812, "learning_rate": 1e-05, "loss": 0.4797, "step": 154 }, { "epoch": 0.9372637944066515, "grad_norm": 0.5979474782943726, "learning_rate": 1e-05, "loss": 0.4743, "step": 155 }, { "epoch": 0.9433106575963719, "grad_norm": 0.6468530297279358, "learning_rate": 1e-05, "loss": 0.4943, "step": 156 }, { "epoch": 0.9493575207860923, "grad_norm": 0.7074465155601501, "learning_rate": 1e-05, "loss": 0.5007, "step": 157 }, { "epoch": 0.9554043839758125, "grad_norm": 0.6670670509338379, "learning_rate": 1e-05, "loss": 0.4656, "step": 158 }, { "epoch": 0.9614512471655329, "grad_norm": 0.7960174083709717, "learning_rate": 1e-05, "loss": 0.5051, "step": 159 }, { "epoch": 0.9674981103552532, "grad_norm": 0.6261197328567505, "learning_rate": 1e-05, "loss": 0.4559, "step": 160 }, { "epoch": 0.9735449735449735, "grad_norm": 0.6303992867469788, "learning_rate": 1e-05, "loss": 0.5078, "step": 161 }, { "epoch": 0.9795918367346939, "grad_norm": 0.6176378130912781, "learning_rate": 1e-05, "loss": 0.4644, "step": 162 }, { "epoch": 0.9856386999244142, "grad_norm": 0.7477989792823792, "learning_rate": 1e-05, "loss": 0.4878, "step": 163 }, { "epoch": 0.9916855631141346, "grad_norm": 0.7213401794433594, "learning_rate": 1e-05, "loss": 0.4798, "step": 164 }, { "epoch": 0.9977324263038548, "grad_norm": 0.6424558758735657, "learning_rate": 1e-05, "loss": 0.4807, "step": 165 }, { "epoch": 1.0, "grad_norm": 0.6424558758735657, "learning_rate": 1e-05, "loss": 0.2094, "step": 166 }, { "epoch": 1.0060468631897204, "grad_norm": 0.6606348752975464, "learning_rate": 1e-05, "loss": 0.4824, "step": 167 }, { "epoch": 1.0120937263794407, "grad_norm": 0.6616156101226807, "learning_rate": 1e-05, "loss": 0.5006, "step": 168 }, { "epoch": 1.018140589569161, "grad_norm": 0.6229767203330994, "learning_rate": 1e-05, "loss": 0.4525, "step": 169 }, { "epoch": 1.0241874527588812, "grad_norm": 0.6239718198776245, "learning_rate": 1e-05, "loss": 0.4859, "step": 170 }, { "epoch": 1.0302343159486016, "grad_norm": 0.6365458369255066, "learning_rate": 1e-05, "loss": 0.4444, "step": 171 }, { "epoch": 1.036281179138322, "grad_norm": 0.6115719079971313, "learning_rate": 1e-05, "loss": 0.5038, "step": 172 }, { "epoch": 1.0423280423280423, "grad_norm": 0.6136119365692139, "learning_rate": 1e-05, "loss": 0.4654, "step": 173 }, { "epoch": 1.0483749055177627, "grad_norm": 0.6639455556869507, "learning_rate": 1e-05, "loss": 0.4512, "step": 174 }, { "epoch": 1.054421768707483, "grad_norm": 0.6987008452415466, "learning_rate": 1e-05, "loss": 0.445, "step": 175 }, { "epoch": 1.0604686318972034, "grad_norm": 0.6095190644264221, "learning_rate": 1e-05, "loss": 0.4491, "step": 176 }, { "epoch": 1.0665154950869236, "grad_norm": 0.6218482255935669, "learning_rate": 1e-05, "loss": 0.4524, "step": 177 }, { "epoch": 1.072562358276644, "grad_norm": 0.7821850180625916, "learning_rate": 1e-05, "loss": 0.5102, "step": 178 }, { "epoch": 1.0786092214663643, "grad_norm": 0.6159230470657349, "learning_rate": 1e-05, "loss": 0.4385, "step": 179 }, { "epoch": 1.0846560846560847, "grad_norm": 0.7824166417121887, "learning_rate": 1e-05, "loss": 0.4548, "step": 180 }, { "epoch": 1.090702947845805, "grad_norm": 0.6106266975402832, "learning_rate": 1e-05, "loss": 0.4439, "step": 181 }, { "epoch": 1.0967498110355254, "grad_norm": 0.7144667506217957, "learning_rate": 1e-05, "loss": 0.4638, "step": 182 }, { "epoch": 1.1027966742252457, "grad_norm": 0.6751576662063599, "learning_rate": 1e-05, "loss": 0.4447, "step": 183 }, { "epoch": 1.1088435374149659, "grad_norm": 0.6461648941040039, "learning_rate": 1e-05, "loss": 0.4747, "step": 184 }, { "epoch": 1.1148904006046862, "grad_norm": 0.5774666666984558, "learning_rate": 1e-05, "loss": 0.4598, "step": 185 }, { "epoch": 1.1209372637944066, "grad_norm": 0.6550050377845764, "learning_rate": 1e-05, "loss": 0.4625, "step": 186 }, { "epoch": 1.126984126984127, "grad_norm": 0.8720169067382812, "learning_rate": 1e-05, "loss": 0.4825, "step": 187 }, { "epoch": 1.1330309901738473, "grad_norm": 0.6267352104187012, "learning_rate": 1e-05, "loss": 0.4462, "step": 188 }, { "epoch": 1.1390778533635677, "grad_norm": 0.6439146399497986, "learning_rate": 1e-05, "loss": 0.4863, "step": 189 }, { "epoch": 1.145124716553288, "grad_norm": 0.793402373790741, "learning_rate": 1e-05, "loss": 0.4544, "step": 190 }, { "epoch": 1.1511715797430084, "grad_norm": 0.6598357558250427, "learning_rate": 1e-05, "loss": 0.4433, "step": 191 }, { "epoch": 1.1572184429327286, "grad_norm": 0.6053168773651123, "learning_rate": 1e-05, "loss": 0.4628, "step": 192 }, { "epoch": 1.163265306122449, "grad_norm": 0.611825168132782, "learning_rate": 1e-05, "loss": 0.4757, "step": 193 }, { "epoch": 1.1693121693121693, "grad_norm": 0.7772480249404907, "learning_rate": 1e-05, "loss": 0.4735, "step": 194 }, { "epoch": 1.1753590325018897, "grad_norm": 0.7278615832328796, "learning_rate": 1e-05, "loss": 0.452, "step": 195 }, { "epoch": 1.18140589569161, "grad_norm": 0.6458846926689148, "learning_rate": 1e-05, "loss": 0.4864, "step": 196 }, { "epoch": 1.1874527588813304, "grad_norm": 0.6122244000434875, "learning_rate": 1e-05, "loss": 0.4728, "step": 197 }, { "epoch": 1.1934996220710508, "grad_norm": 0.5845707654953003, "learning_rate": 1e-05, "loss": 0.4659, "step": 198 }, { "epoch": 1.199546485260771, "grad_norm": 0.5802354216575623, "learning_rate": 1e-05, "loss": 0.4808, "step": 199 }, { "epoch": 1.2055933484504913, "grad_norm": 0.6875635385513306, "learning_rate": 1e-05, "loss": 0.4857, "step": 200 }, { "epoch": 1.2116402116402116, "grad_norm": 0.6133288741111755, "learning_rate": 1e-05, "loss": 0.4208, "step": 201 }, { "epoch": 1.217687074829932, "grad_norm": 0.5954310894012451, "learning_rate": 1e-05, "loss": 0.4655, "step": 202 }, { "epoch": 1.2237339380196524, "grad_norm": 0.6206385493278503, "learning_rate": 1e-05, "loss": 0.4813, "step": 203 }, { "epoch": 1.2297808012093727, "grad_norm": 0.6135386228561401, "learning_rate": 1e-05, "loss": 0.4851, "step": 204 }, { "epoch": 1.235827664399093, "grad_norm": 0.6469545960426331, "learning_rate": 1e-05, "loss": 0.4782, "step": 205 }, { "epoch": 1.2418745275888132, "grad_norm": 0.610987663269043, "learning_rate": 1e-05, "loss": 0.4636, "step": 206 }, { "epoch": 1.2479213907785336, "grad_norm": 0.694190263748169, "learning_rate": 1e-05, "loss": 0.4516, "step": 207 }, { "epoch": 1.253968253968254, "grad_norm": 0.6002838015556335, "learning_rate": 1e-05, "loss": 0.4659, "step": 208 }, { "epoch": 1.2600151171579743, "grad_norm": 0.669956624507904, "learning_rate": 1e-05, "loss": 0.4635, "step": 209 }, { "epoch": 1.2660619803476947, "grad_norm": 0.7150918245315552, "learning_rate": 1e-05, "loss": 0.4925, "step": 210 }, { "epoch": 1.272108843537415, "grad_norm": 0.613088846206665, "learning_rate": 1e-05, "loss": 0.4453, "step": 211 }, { "epoch": 1.2781557067271354, "grad_norm": 0.6749387383460999, "learning_rate": 1e-05, "loss": 0.476, "step": 212 }, { "epoch": 1.2842025699168556, "grad_norm": 0.8278664350509644, "learning_rate": 1e-05, "loss": 0.469, "step": 213 }, { "epoch": 1.290249433106576, "grad_norm": 0.6310950517654419, "learning_rate": 1e-05, "loss": 0.4256, "step": 214 }, { "epoch": 1.2962962962962963, "grad_norm": 0.6965487003326416, "learning_rate": 1e-05, "loss": 0.4532, "step": 215 }, { "epoch": 1.3023431594860166, "grad_norm": 0.6796351671218872, "learning_rate": 1e-05, "loss": 0.4954, "step": 216 }, { "epoch": 1.308390022675737, "grad_norm": 0.6296037435531616, "learning_rate": 1e-05, "loss": 0.4548, "step": 217 }, { "epoch": 1.3144368858654574, "grad_norm": 0.7185596227645874, "learning_rate": 1e-05, "loss": 0.4741, "step": 218 }, { "epoch": 1.3204837490551777, "grad_norm": 0.6666784882545471, "learning_rate": 1e-05, "loss": 0.4781, "step": 219 }, { "epoch": 1.3265306122448979, "grad_norm": 0.6976655125617981, "learning_rate": 1e-05, "loss": 0.4686, "step": 220 }, { "epoch": 1.3325774754346182, "grad_norm": 0.6550024747848511, "learning_rate": 1e-05, "loss": 0.4636, "step": 221 }, { "epoch": 1.3386243386243386, "grad_norm": 0.5954509377479553, "learning_rate": 1e-05, "loss": 0.4451, "step": 222 }, { "epoch": 1.344671201814059, "grad_norm": 0.6983951926231384, "learning_rate": 1e-05, "loss": 0.4661, "step": 223 }, { "epoch": 1.3507180650037793, "grad_norm": 0.6467093229293823, "learning_rate": 1e-05, "loss": 0.4381, "step": 224 }, { "epoch": 1.3567649281934997, "grad_norm": 0.6951491236686707, "learning_rate": 1e-05, "loss": 0.4652, "step": 225 }, { "epoch": 1.36281179138322, "grad_norm": 0.5601893067359924, "learning_rate": 1e-05, "loss": 0.4777, "step": 226 }, { "epoch": 1.3688586545729402, "grad_norm": 0.6614255309104919, "learning_rate": 1e-05, "loss": 0.4712, "step": 227 }, { "epoch": 1.3749055177626606, "grad_norm": 0.6624944806098938, "learning_rate": 1e-05, "loss": 0.4602, "step": 228 }, { "epoch": 1.380952380952381, "grad_norm": 0.6369934678077698, "learning_rate": 1e-05, "loss": 0.4892, "step": 229 }, { "epoch": 1.3869992441421013, "grad_norm": 0.6697752475738525, "learning_rate": 1e-05, "loss": 0.4825, "step": 230 }, { "epoch": 1.3930461073318217, "grad_norm": 0.6465206742286682, "learning_rate": 1e-05, "loss": 0.4924, "step": 231 }, { "epoch": 1.399092970521542, "grad_norm": 0.6809276938438416, "learning_rate": 1e-05, "loss": 0.453, "step": 232 }, { "epoch": 1.4051398337112624, "grad_norm": 0.6897946000099182, "learning_rate": 1e-05, "loss": 0.4253, "step": 233 }, { "epoch": 1.4111866969009825, "grad_norm": 0.6927686333656311, "learning_rate": 1e-05, "loss": 0.4667, "step": 234 }, { "epoch": 1.417233560090703, "grad_norm": 0.6365827918052673, "learning_rate": 1e-05, "loss": 0.4915, "step": 235 }, { "epoch": 1.4232804232804233, "grad_norm": 0.6107192039489746, "learning_rate": 1e-05, "loss": 0.4862, "step": 236 }, { "epoch": 1.4293272864701436, "grad_norm": 0.6444121599197388, "learning_rate": 1e-05, "loss": 0.4673, "step": 237 }, { "epoch": 1.435374149659864, "grad_norm": 0.5968717932701111, "learning_rate": 1e-05, "loss": 0.4471, "step": 238 }, { "epoch": 1.4414210128495843, "grad_norm": 0.7065095901489258, "learning_rate": 1e-05, "loss": 0.4492, "step": 239 }, { "epoch": 1.4474678760393047, "grad_norm": 0.5995264053344727, "learning_rate": 1e-05, "loss": 0.4574, "step": 240 }, { "epoch": 1.4535147392290249, "grad_norm": 0.6707980632781982, "learning_rate": 1e-05, "loss": 0.4685, "step": 241 }, { "epoch": 1.4595616024187452, "grad_norm": 0.6315703988075256, "learning_rate": 1e-05, "loss": 0.4433, "step": 242 }, { "epoch": 1.4656084656084656, "grad_norm": 0.6184569001197815, "learning_rate": 1e-05, "loss": 0.4384, "step": 243 }, { "epoch": 1.471655328798186, "grad_norm": 0.6238490343093872, "learning_rate": 1e-05, "loss": 0.4614, "step": 244 }, { "epoch": 1.4777021919879063, "grad_norm": 0.6371328830718994, "learning_rate": 1e-05, "loss": 0.4526, "step": 245 }, { "epoch": 1.4837490551776267, "grad_norm": 0.6465550065040588, "learning_rate": 1e-05, "loss": 0.4612, "step": 246 }, { "epoch": 1.489795918367347, "grad_norm": 0.6414604187011719, "learning_rate": 1e-05, "loss": 0.4416, "step": 247 }, { "epoch": 1.4958427815570672, "grad_norm": 0.7275083661079407, "learning_rate": 1e-05, "loss": 0.4443, "step": 248 }, { "epoch": 1.5018896447467878, "grad_norm": 0.6674472093582153, "learning_rate": 1e-05, "loss": 0.4536, "step": 249 }, { "epoch": 1.507936507936508, "grad_norm": 0.5826581716537476, "learning_rate": 1e-05, "loss": 0.4817, "step": 250 }, { "epoch": 1.5139833711262283, "grad_norm": 0.6314810514450073, "learning_rate": 1e-05, "loss": 0.467, "step": 251 }, { "epoch": 1.5200302343159486, "grad_norm": 0.637401282787323, "learning_rate": 1e-05, "loss": 0.4786, "step": 252 }, { "epoch": 1.5260770975056688, "grad_norm": 0.784031093120575, "learning_rate": 1e-05, "loss": 0.4889, "step": 253 }, { "epoch": 1.5321239606953894, "grad_norm": 0.6010222434997559, "learning_rate": 1e-05, "loss": 0.4796, "step": 254 }, { "epoch": 1.5381708238851095, "grad_norm": 0.6853978037834167, "learning_rate": 1e-05, "loss": 0.4435, "step": 255 }, { "epoch": 1.54421768707483, "grad_norm": 0.5840683579444885, "learning_rate": 1e-05, "loss": 0.4837, "step": 256 }, { "epoch": 1.5502645502645502, "grad_norm": 0.6400360465049744, "learning_rate": 1e-05, "loss": 0.4741, "step": 257 }, { "epoch": 1.5563114134542706, "grad_norm": 0.8613634705543518, "learning_rate": 1e-05, "loss": 0.4596, "step": 258 }, { "epoch": 1.562358276643991, "grad_norm": 0.7580351233482361, "learning_rate": 1e-05, "loss": 0.472, "step": 259 }, { "epoch": 1.568405139833711, "grad_norm": 0.6610977649688721, "learning_rate": 1e-05, "loss": 0.4358, "step": 260 }, { "epoch": 1.5744520030234317, "grad_norm": 0.6119037866592407, "learning_rate": 1e-05, "loss": 0.4407, "step": 261 }, { "epoch": 1.5804988662131518, "grad_norm": 0.7009762525558472, "learning_rate": 1e-05, "loss": 0.4688, "step": 262 }, { "epoch": 1.5865457294028724, "grad_norm": 0.6398757696151733, "learning_rate": 1e-05, "loss": 0.443, "step": 263 }, { "epoch": 1.5925925925925926, "grad_norm": 0.6141228675842285, "learning_rate": 1e-05, "loss": 0.4064, "step": 264 }, { "epoch": 1.598639455782313, "grad_norm": 0.6586297154426575, "learning_rate": 1e-05, "loss": 0.4766, "step": 265 }, { "epoch": 1.6046863189720333, "grad_norm": 0.6146064400672913, "learning_rate": 1e-05, "loss": 0.46, "step": 266 }, { "epoch": 1.6107331821617534, "grad_norm": 0.6430327296257019, "learning_rate": 1e-05, "loss": 0.4983, "step": 267 }, { "epoch": 1.616780045351474, "grad_norm": 0.6267253756523132, "learning_rate": 1e-05, "loss": 0.4336, "step": 268 }, { "epoch": 1.6228269085411942, "grad_norm": 0.6364631652832031, "learning_rate": 1e-05, "loss": 0.4418, "step": 269 }, { "epoch": 1.6288737717309147, "grad_norm": 0.5934423804283142, "learning_rate": 1e-05, "loss": 0.4715, "step": 270 }, { "epoch": 1.6349206349206349, "grad_norm": 0.7002220153808594, "learning_rate": 1e-05, "loss": 0.4205, "step": 271 }, { "epoch": 1.6409674981103552, "grad_norm": 0.5583924055099487, "learning_rate": 1e-05, "loss": 0.4425, "step": 272 }, { "epoch": 1.6470143613000756, "grad_norm": 0.8225222229957581, "learning_rate": 1e-05, "loss": 0.4572, "step": 273 }, { "epoch": 1.6530612244897958, "grad_norm": 0.7018880248069763, "learning_rate": 1e-05, "loss": 0.5016, "step": 274 }, { "epoch": 1.6591080876795163, "grad_norm": 0.600951611995697, "learning_rate": 1e-05, "loss": 0.4575, "step": 275 }, { "epoch": 1.6651549508692365, "grad_norm": 0.747036337852478, "learning_rate": 1e-05, "loss": 0.4591, "step": 276 }, { "epoch": 1.671201814058957, "grad_norm": 0.6101221442222595, "learning_rate": 1e-05, "loss": 0.4329, "step": 277 }, { "epoch": 1.6772486772486772, "grad_norm": 0.6129553914070129, "learning_rate": 1e-05, "loss": 0.4434, "step": 278 }, { "epoch": 1.6832955404383976, "grad_norm": 0.6353614926338196, "learning_rate": 1e-05, "loss": 0.4748, "step": 279 }, { "epoch": 1.689342403628118, "grad_norm": 0.6356557607650757, "learning_rate": 1e-05, "loss": 0.4964, "step": 280 }, { "epoch": 1.695389266817838, "grad_norm": 0.6503037810325623, "learning_rate": 1e-05, "loss": 0.4588, "step": 281 }, { "epoch": 1.7014361300075587, "grad_norm": 0.6293248534202576, "learning_rate": 1e-05, "loss": 0.4449, "step": 282 }, { "epoch": 1.7074829931972788, "grad_norm": 0.6629444360733032, "learning_rate": 1e-05, "loss": 0.458, "step": 283 }, { "epoch": 1.7135298563869994, "grad_norm": 0.7334532737731934, "learning_rate": 1e-05, "loss": 0.4624, "step": 284 }, { "epoch": 1.7195767195767195, "grad_norm": 0.8572254180908203, "learning_rate": 1e-05, "loss": 0.4794, "step": 285 }, { "epoch": 1.72562358276644, "grad_norm": 0.6750730276107788, "learning_rate": 1e-05, "loss": 0.4555, "step": 286 }, { "epoch": 1.7316704459561603, "grad_norm": 0.6547385454177856, "learning_rate": 1e-05, "loss": 0.4622, "step": 287 }, { "epoch": 1.7377173091458806, "grad_norm": 0.617925226688385, "learning_rate": 1e-05, "loss": 0.4513, "step": 288 }, { "epoch": 1.743764172335601, "grad_norm": 0.6336359977722168, "learning_rate": 1e-05, "loss": 0.444, "step": 289 }, { "epoch": 1.7498110355253211, "grad_norm": 0.6159520149230957, "learning_rate": 1e-05, "loss": 0.4538, "step": 290 }, { "epoch": 1.7558578987150417, "grad_norm": 0.6046229004859924, "learning_rate": 1e-05, "loss": 0.4569, "step": 291 }, { "epoch": 1.7619047619047619, "grad_norm": 0.6550801992416382, "learning_rate": 1e-05, "loss": 0.4893, "step": 292 }, { "epoch": 1.7679516250944822, "grad_norm": 0.6317894458770752, "learning_rate": 1e-05, "loss": 0.4193, "step": 293 }, { "epoch": 1.7739984882842026, "grad_norm": 0.5674409866333008, "learning_rate": 1e-05, "loss": 0.4593, "step": 294 }, { "epoch": 1.780045351473923, "grad_norm": 0.6055679321289062, "learning_rate": 1e-05, "loss": 0.45, "step": 295 }, { "epoch": 1.7860922146636433, "grad_norm": 0.6228786706924438, "learning_rate": 1e-05, "loss": 0.4514, "step": 296 }, { "epoch": 1.7921390778533635, "grad_norm": 0.6810594797134399, "learning_rate": 1e-05, "loss": 0.4839, "step": 297 }, { "epoch": 1.798185941043084, "grad_norm": 0.6592750549316406, "learning_rate": 1e-05, "loss": 0.4155, "step": 298 }, { "epoch": 1.8042328042328042, "grad_norm": 0.6659589409828186, "learning_rate": 1e-05, "loss": 0.4623, "step": 299 }, { "epoch": 1.8102796674225246, "grad_norm": 0.8698583245277405, "learning_rate": 1e-05, "loss": 0.4184, "step": 300 }, { "epoch": 1.816326530612245, "grad_norm": 0.602044939994812, "learning_rate": 1e-05, "loss": 0.4538, "step": 301 }, { "epoch": 1.8223733938019653, "grad_norm": 0.5736281871795654, "learning_rate": 1e-05, "loss": 0.4127, "step": 302 }, { "epoch": 1.8284202569916856, "grad_norm": 0.7261680364608765, "learning_rate": 1e-05, "loss": 0.4334, "step": 303 }, { "epoch": 1.8344671201814058, "grad_norm": 0.7073959112167358, "learning_rate": 1e-05, "loss": 0.4706, "step": 304 }, { "epoch": 1.8405139833711264, "grad_norm": 0.6551429629325867, "learning_rate": 1e-05, "loss": 0.4738, "step": 305 }, { "epoch": 1.8465608465608465, "grad_norm": 0.709190845489502, "learning_rate": 1e-05, "loss": 0.4581, "step": 306 }, { "epoch": 1.8526077097505669, "grad_norm": 0.6238887906074524, "learning_rate": 1e-05, "loss": 0.4315, "step": 307 }, { "epoch": 1.8586545729402872, "grad_norm": 0.6026177406311035, "learning_rate": 1e-05, "loss": 0.4291, "step": 308 }, { "epoch": 1.8647014361300076, "grad_norm": 0.6351094841957092, "learning_rate": 1e-05, "loss": 0.4703, "step": 309 }, { "epoch": 1.870748299319728, "grad_norm": 0.6246944069862366, "learning_rate": 1e-05, "loss": 0.443, "step": 310 }, { "epoch": 1.8767951625094481, "grad_norm": 0.6804561614990234, "learning_rate": 1e-05, "loss": 0.4862, "step": 311 }, { "epoch": 1.8828420256991687, "grad_norm": 0.6391934752464294, "learning_rate": 1e-05, "loss": 0.449, "step": 312 }, { "epoch": 1.8888888888888888, "grad_norm": 0.584983766078949, "learning_rate": 1e-05, "loss": 0.4169, "step": 313 }, { "epoch": 1.8949357520786092, "grad_norm": 0.6371360421180725, "learning_rate": 1e-05, "loss": 0.4312, "step": 314 }, { "epoch": 1.9009826152683296, "grad_norm": 0.5869805812835693, "learning_rate": 1e-05, "loss": 0.481, "step": 315 }, { "epoch": 1.90702947845805, "grad_norm": 0.6193313002586365, "learning_rate": 1e-05, "loss": 0.4369, "step": 316 }, { "epoch": 1.9130763416477703, "grad_norm": 0.6947498917579651, "learning_rate": 1e-05, "loss": 0.4493, "step": 317 }, { "epoch": 1.9191232048374904, "grad_norm": 0.5941237807273865, "learning_rate": 1e-05, "loss": 0.4427, "step": 318 }, { "epoch": 1.925170068027211, "grad_norm": 0.6937701106071472, "learning_rate": 1e-05, "loss": 0.4414, "step": 319 }, { "epoch": 1.9312169312169312, "grad_norm": 0.6981029510498047, "learning_rate": 1e-05, "loss": 0.4558, "step": 320 }, { "epoch": 1.9372637944066515, "grad_norm": 0.6035318970680237, "learning_rate": 1e-05, "loss": 0.4356, "step": 321 }, { "epoch": 1.943310657596372, "grad_norm": 0.650826096534729, "learning_rate": 1e-05, "loss": 0.4493, "step": 322 }, { "epoch": 1.9493575207860923, "grad_norm": 0.5956555604934692, "learning_rate": 1e-05, "loss": 0.4152, "step": 323 }, { "epoch": 1.9554043839758126, "grad_norm": 0.6644042730331421, "learning_rate": 1e-05, "loss": 0.4416, "step": 324 }, { "epoch": 1.9614512471655328, "grad_norm": 0.651853084564209, "learning_rate": 1e-05, "loss": 0.4393, "step": 325 }, { "epoch": 1.9674981103552533, "grad_norm": 0.755530059337616, "learning_rate": 1e-05, "loss": 0.4678, "step": 326 }, { "epoch": 1.9735449735449735, "grad_norm": 0.6087958216667175, "learning_rate": 1e-05, "loss": 0.4424, "step": 327 }, { "epoch": 1.9795918367346939, "grad_norm": 0.6306193470954895, "learning_rate": 1e-05, "loss": 0.4166, "step": 328 }, { "epoch": 1.9856386999244142, "grad_norm": 0.60347980260849, "learning_rate": 1e-05, "loss": 0.4267, "step": 329 }, { "epoch": 1.9916855631141346, "grad_norm": 0.5859919190406799, "learning_rate": 1e-05, "loss": 0.4289, "step": 330 }, { "epoch": 1.997732426303855, "grad_norm": 0.6933625936508179, "learning_rate": 1e-05, "loss": 0.4609, "step": 331 }, { "epoch": 2.0, "grad_norm": 0.6933625936508179, "learning_rate": 1e-05, "loss": 0.148, "step": 332 }, { "epoch": 2.00604686318972, "grad_norm": 0.6603690981864929, "learning_rate": 1e-05, "loss": 0.4293, "step": 333 }, { "epoch": 2.0120937263794407, "grad_norm": 0.6994916200637817, "learning_rate": 1e-05, "loss": 0.4412, "step": 334 }, { "epoch": 2.018140589569161, "grad_norm": 0.7604825496673584, "learning_rate": 1e-05, "loss": 0.4115, "step": 335 }, { "epoch": 2.0241874527588815, "grad_norm": 0.5603572130203247, "learning_rate": 1e-05, "loss": 0.4438, "step": 336 }, { "epoch": 2.0302343159486016, "grad_norm": 0.640121579170227, "learning_rate": 1e-05, "loss": 0.4357, "step": 337 }, { "epoch": 2.036281179138322, "grad_norm": 0.6164838671684265, "learning_rate": 1e-05, "loss": 0.4574, "step": 338 }, { "epoch": 2.0423280423280423, "grad_norm": 0.6057737469673157, "learning_rate": 1e-05, "loss": 0.4368, "step": 339 }, { "epoch": 2.0483749055177625, "grad_norm": 0.5864904522895813, "learning_rate": 1e-05, "loss": 0.4836, "step": 340 }, { "epoch": 2.054421768707483, "grad_norm": 0.7257908582687378, "learning_rate": 1e-05, "loss": 0.4471, "step": 341 }, { "epoch": 2.060468631897203, "grad_norm": 0.6308581829071045, "learning_rate": 1e-05, "loss": 0.4319, "step": 342 }, { "epoch": 2.066515495086924, "grad_norm": 0.603664219379425, "learning_rate": 1e-05, "loss": 0.4766, "step": 343 }, { "epoch": 2.072562358276644, "grad_norm": 0.5839913487434387, "learning_rate": 1e-05, "loss": 0.4376, "step": 344 }, { "epoch": 2.0786092214663645, "grad_norm": 0.5631858110427856, "learning_rate": 1e-05, "loss": 0.3978, "step": 345 }, { "epoch": 2.0846560846560847, "grad_norm": 0.5701799392700195, "learning_rate": 1e-05, "loss": 0.4085, "step": 346 }, { "epoch": 2.090702947845805, "grad_norm": 0.7251225709915161, "learning_rate": 1e-05, "loss": 0.4259, "step": 347 }, { "epoch": 2.0967498110355254, "grad_norm": 0.7313527464866638, "learning_rate": 1e-05, "loss": 0.4657, "step": 348 }, { "epoch": 2.1027966742252455, "grad_norm": 0.6027727723121643, "learning_rate": 1e-05, "loss": 0.459, "step": 349 }, { "epoch": 2.108843537414966, "grad_norm": 0.5810455083847046, "learning_rate": 1e-05, "loss": 0.4283, "step": 350 }, { "epoch": 2.1148904006046862, "grad_norm": 0.595798909664154, "learning_rate": 1e-05, "loss": 0.4369, "step": 351 }, { "epoch": 2.120937263794407, "grad_norm": 0.6135094165802002, "learning_rate": 1e-05, "loss": 0.4316, "step": 352 }, { "epoch": 2.126984126984127, "grad_norm": 0.6158870458602905, "learning_rate": 1e-05, "loss": 0.4352, "step": 353 }, { "epoch": 2.133030990173847, "grad_norm": 0.5907214879989624, "learning_rate": 1e-05, "loss": 0.4621, "step": 354 }, { "epoch": 2.1390778533635677, "grad_norm": 0.6108521819114685, "learning_rate": 1e-05, "loss": 0.4467, "step": 355 }, { "epoch": 2.145124716553288, "grad_norm": 0.6298407912254333, "learning_rate": 1e-05, "loss": 0.4198, "step": 356 }, { "epoch": 2.1511715797430084, "grad_norm": 0.5679177641868591, "learning_rate": 1e-05, "loss": 0.4366, "step": 357 }, { "epoch": 2.1572184429327286, "grad_norm": 0.6569507718086243, "learning_rate": 1e-05, "loss": 0.4157, "step": 358 }, { "epoch": 2.163265306122449, "grad_norm": 0.5669922232627869, "learning_rate": 1e-05, "loss": 0.4683, "step": 359 }, { "epoch": 2.1693121693121693, "grad_norm": 0.6744571924209595, "learning_rate": 1e-05, "loss": 0.4393, "step": 360 }, { "epoch": 2.1753590325018894, "grad_norm": 0.6491524577140808, "learning_rate": 1e-05, "loss": 0.43, "step": 361 }, { "epoch": 2.18140589569161, "grad_norm": 0.593529224395752, "learning_rate": 1e-05, "loss": 0.4522, "step": 362 }, { "epoch": 2.18745275888133, "grad_norm": 0.7529059648513794, "learning_rate": 1e-05, "loss": 0.4286, "step": 363 }, { "epoch": 2.1934996220710508, "grad_norm": 0.6293076276779175, "learning_rate": 1e-05, "loss": 0.405, "step": 364 }, { "epoch": 2.199546485260771, "grad_norm": 0.6186328530311584, "learning_rate": 1e-05, "loss": 0.4419, "step": 365 }, { "epoch": 2.2055933484504915, "grad_norm": 0.6195396780967712, "learning_rate": 1e-05, "loss": 0.4381, "step": 366 }, { "epoch": 2.2116402116402116, "grad_norm": 0.7332632541656494, "learning_rate": 1e-05, "loss": 0.4533, "step": 367 }, { "epoch": 2.2176870748299318, "grad_norm": 0.6982107758522034, "learning_rate": 1e-05, "loss": 0.428, "step": 368 }, { "epoch": 2.2237339380196524, "grad_norm": 0.697144627571106, "learning_rate": 1e-05, "loss": 0.4533, "step": 369 }, { "epoch": 2.2297808012093725, "grad_norm": 0.6097111105918884, "learning_rate": 1e-05, "loss": 0.4122, "step": 370 }, { "epoch": 2.235827664399093, "grad_norm": 0.6297764182090759, "learning_rate": 1e-05, "loss": 0.4413, "step": 371 }, { "epoch": 2.2418745275888132, "grad_norm": 0.625715970993042, "learning_rate": 1e-05, "loss": 0.4372, "step": 372 }, { "epoch": 2.247921390778534, "grad_norm": 0.6380645632743835, "learning_rate": 1e-05, "loss": 0.4059, "step": 373 }, { "epoch": 2.253968253968254, "grad_norm": 0.6337711215019226, "learning_rate": 1e-05, "loss": 0.4649, "step": 374 }, { "epoch": 2.260015117157974, "grad_norm": 0.614594578742981, "learning_rate": 1e-05, "loss": 0.4044, "step": 375 }, { "epoch": 2.2660619803476947, "grad_norm": 0.6339592337608337, "learning_rate": 1e-05, "loss": 0.4184, "step": 376 }, { "epoch": 2.272108843537415, "grad_norm": 0.6063976287841797, "learning_rate": 1e-05, "loss": 0.4254, "step": 377 }, { "epoch": 2.2781557067271354, "grad_norm": 0.6234972476959229, "learning_rate": 1e-05, "loss": 0.4507, "step": 378 }, { "epoch": 2.2842025699168556, "grad_norm": 0.687817394733429, "learning_rate": 1e-05, "loss": 0.4272, "step": 379 }, { "epoch": 2.290249433106576, "grad_norm": 0.7282426357269287, "learning_rate": 1e-05, "loss": 0.4364, "step": 380 }, { "epoch": 2.2962962962962963, "grad_norm": 0.7689639925956726, "learning_rate": 1e-05, "loss": 0.4672, "step": 381 }, { "epoch": 2.302343159486017, "grad_norm": 0.6412914991378784, "learning_rate": 1e-05, "loss": 0.4058, "step": 382 }, { "epoch": 2.308390022675737, "grad_norm": 0.6424279808998108, "learning_rate": 1e-05, "loss": 0.4449, "step": 383 }, { "epoch": 2.314436885865457, "grad_norm": 0.660686731338501, "learning_rate": 1e-05, "loss": 0.4181, "step": 384 }, { "epoch": 2.3204837490551777, "grad_norm": 0.6181021928787231, "learning_rate": 1e-05, "loss": 0.4603, "step": 385 }, { "epoch": 2.326530612244898, "grad_norm": 0.7471036314964294, "learning_rate": 1e-05, "loss": 0.4091, "step": 386 }, { "epoch": 2.3325774754346185, "grad_norm": 0.5858867168426514, "learning_rate": 1e-05, "loss": 0.4299, "step": 387 }, { "epoch": 2.3386243386243386, "grad_norm": 0.6382971405982971, "learning_rate": 1e-05, "loss": 0.4334, "step": 388 }, { "epoch": 2.3446712018140587, "grad_norm": 0.6103458404541016, "learning_rate": 1e-05, "loss": 0.4484, "step": 389 }, { "epoch": 2.3507180650037793, "grad_norm": 0.7106808423995972, "learning_rate": 1e-05, "loss": 0.4437, "step": 390 }, { "epoch": 2.3567649281934995, "grad_norm": 0.5801584124565125, "learning_rate": 1e-05, "loss": 0.4439, "step": 391 }, { "epoch": 2.36281179138322, "grad_norm": 0.6244776844978333, "learning_rate": 1e-05, "loss": 0.4714, "step": 392 }, { "epoch": 2.36885865457294, "grad_norm": 0.6998816132545471, "learning_rate": 1e-05, "loss": 0.4405, "step": 393 }, { "epoch": 2.374905517762661, "grad_norm": 0.6233670711517334, "learning_rate": 1e-05, "loss": 0.4596, "step": 394 }, { "epoch": 2.380952380952381, "grad_norm": 0.59064120054245, "learning_rate": 1e-05, "loss": 0.4378, "step": 395 }, { "epoch": 2.3869992441421015, "grad_norm": 0.6747873425483704, "learning_rate": 1e-05, "loss": 0.4364, "step": 396 }, { "epoch": 2.3930461073318217, "grad_norm": 0.6693813800811768, "learning_rate": 1e-05, "loss": 0.4275, "step": 397 }, { "epoch": 2.399092970521542, "grad_norm": 0.5732771158218384, "learning_rate": 1e-05, "loss": 0.4285, "step": 398 }, { "epoch": 2.4051398337112624, "grad_norm": 0.5857324004173279, "learning_rate": 1e-05, "loss": 0.4365, "step": 399 }, { "epoch": 2.4111866969009825, "grad_norm": 0.7219252586364746, "learning_rate": 1e-05, "loss": 0.4174, "step": 400 }, { "epoch": 2.417233560090703, "grad_norm": 0.6316457986831665, "learning_rate": 1e-05, "loss": 0.4297, "step": 401 }, { "epoch": 2.4232804232804233, "grad_norm": 0.7518462538719177, "learning_rate": 1e-05, "loss": 0.4179, "step": 402 }, { "epoch": 2.4293272864701434, "grad_norm": 0.662816047668457, "learning_rate": 1e-05, "loss": 0.4367, "step": 403 }, { "epoch": 2.435374149659864, "grad_norm": 0.5900476574897766, "learning_rate": 1e-05, "loss": 0.4411, "step": 404 }, { "epoch": 2.441421012849584, "grad_norm": 0.6242678165435791, "learning_rate": 1e-05, "loss": 0.4283, "step": 405 }, { "epoch": 2.4474678760393047, "grad_norm": 0.7597957253456116, "learning_rate": 1e-05, "loss": 0.4622, "step": 406 }, { "epoch": 2.453514739229025, "grad_norm": 0.6564098596572876, "learning_rate": 1e-05, "loss": 0.4275, "step": 407 }, { "epoch": 2.4595616024187454, "grad_norm": 0.6206280589103699, "learning_rate": 1e-05, "loss": 0.4576, "step": 408 }, { "epoch": 2.4656084656084656, "grad_norm": 0.6594191193580627, "learning_rate": 1e-05, "loss": 0.4546, "step": 409 }, { "epoch": 2.471655328798186, "grad_norm": 0.6617541909217834, "learning_rate": 1e-05, "loss": 0.4955, "step": 410 }, { "epoch": 2.4777021919879063, "grad_norm": 0.6559837460517883, "learning_rate": 1e-05, "loss": 0.4254, "step": 411 }, { "epoch": 2.4837490551776265, "grad_norm": 0.6617287397384644, "learning_rate": 1e-05, "loss": 0.4278, "step": 412 }, { "epoch": 2.489795918367347, "grad_norm": 0.6606351733207703, "learning_rate": 1e-05, "loss": 0.4492, "step": 413 }, { "epoch": 2.495842781557067, "grad_norm": 0.593546450138092, "learning_rate": 1e-05, "loss": 0.4567, "step": 414 }, { "epoch": 2.5018896447467878, "grad_norm": 0.6175308227539062, "learning_rate": 1e-05, "loss": 0.4194, "step": 415 }, { "epoch": 2.507936507936508, "grad_norm": 0.6103977560997009, "learning_rate": 1e-05, "loss": 0.4607, "step": 416 }, { "epoch": 2.513983371126228, "grad_norm": 0.6861665844917297, "learning_rate": 1e-05, "loss": 0.4346, "step": 417 }, { "epoch": 2.5200302343159486, "grad_norm": 0.6308091282844543, "learning_rate": 1e-05, "loss": 0.4374, "step": 418 }, { "epoch": 2.526077097505669, "grad_norm": 0.62021803855896, "learning_rate": 1e-05, "loss": 0.4368, "step": 419 }, { "epoch": 2.5321239606953894, "grad_norm": 0.5960768461227417, "learning_rate": 1e-05, "loss": 0.4272, "step": 420 }, { "epoch": 2.5381708238851095, "grad_norm": 0.6310625076293945, "learning_rate": 1e-05, "loss": 0.453, "step": 421 }, { "epoch": 2.54421768707483, "grad_norm": 0.6985680460929871, "learning_rate": 1e-05, "loss": 0.4525, "step": 422 }, { "epoch": 2.5502645502645502, "grad_norm": 0.6140491962432861, "learning_rate": 1e-05, "loss": 0.4185, "step": 423 }, { "epoch": 2.556311413454271, "grad_norm": 0.6356143355369568, "learning_rate": 1e-05, "loss": 0.4378, "step": 424 }, { "epoch": 2.562358276643991, "grad_norm": 0.6402518153190613, "learning_rate": 1e-05, "loss": 0.443, "step": 425 }, { "epoch": 2.568405139833711, "grad_norm": 0.5940840244293213, "learning_rate": 1e-05, "loss": 0.4286, "step": 426 }, { "epoch": 2.5744520030234317, "grad_norm": 0.6120002865791321, "learning_rate": 1e-05, "loss": 0.4128, "step": 427 }, { "epoch": 2.580498866213152, "grad_norm": 0.5649886131286621, "learning_rate": 1e-05, "loss": 0.4288, "step": 428 }, { "epoch": 2.5865457294028724, "grad_norm": 0.8437002301216125, "learning_rate": 1e-05, "loss": 0.449, "step": 429 }, { "epoch": 2.5925925925925926, "grad_norm": 0.5640137791633606, "learning_rate": 1e-05, "loss": 0.4499, "step": 430 }, { "epoch": 2.5986394557823127, "grad_norm": 0.630789041519165, "learning_rate": 1e-05, "loss": 0.4125, "step": 431 }, { "epoch": 2.6046863189720333, "grad_norm": 0.5941190719604492, "learning_rate": 1e-05, "loss": 0.3997, "step": 432 }, { "epoch": 2.6107331821617534, "grad_norm": 0.6079080700874329, "learning_rate": 1e-05, "loss": 0.4705, "step": 433 }, { "epoch": 2.616780045351474, "grad_norm": 0.6112462878227234, "learning_rate": 1e-05, "loss": 0.4209, "step": 434 }, { "epoch": 2.622826908541194, "grad_norm": 0.5916441082954407, "learning_rate": 1e-05, "loss": 0.413, "step": 435 }, { "epoch": 2.6288737717309147, "grad_norm": 0.7234858274459839, "learning_rate": 1e-05, "loss": 0.4416, "step": 436 }, { "epoch": 2.634920634920635, "grad_norm": 0.5952900052070618, "learning_rate": 1e-05, "loss": 0.4539, "step": 437 }, { "epoch": 2.6409674981103555, "grad_norm": 0.7268300652503967, "learning_rate": 1e-05, "loss": 0.4368, "step": 438 }, { "epoch": 2.6470143613000756, "grad_norm": 0.7060340046882629, "learning_rate": 1e-05, "loss": 0.4727, "step": 439 }, { "epoch": 2.6530612244897958, "grad_norm": 0.6274359226226807, "learning_rate": 1e-05, "loss": 0.4515, "step": 440 }, { "epoch": 2.6591080876795163, "grad_norm": 0.5820181369781494, "learning_rate": 1e-05, "loss": 0.4141, "step": 441 }, { "epoch": 2.6651549508692365, "grad_norm": 0.6407735347747803, "learning_rate": 1e-05, "loss": 0.4472, "step": 442 }, { "epoch": 2.671201814058957, "grad_norm": 0.6541780233383179, "learning_rate": 1e-05, "loss": 0.4614, "step": 443 }, { "epoch": 2.677248677248677, "grad_norm": 0.6327083706855774, "learning_rate": 1e-05, "loss": 0.4426, "step": 444 }, { "epoch": 2.6832955404383974, "grad_norm": 0.8332440257072449, "learning_rate": 1e-05, "loss": 0.4427, "step": 445 }, { "epoch": 2.689342403628118, "grad_norm": 0.6143855452537537, "learning_rate": 1e-05, "loss": 0.4289, "step": 446 }, { "epoch": 2.695389266817838, "grad_norm": 0.6880478262901306, "learning_rate": 1e-05, "loss": 0.4136, "step": 447 }, { "epoch": 2.7014361300075587, "grad_norm": 0.6012048721313477, "learning_rate": 1e-05, "loss": 0.4261, "step": 448 }, { "epoch": 2.707482993197279, "grad_norm": 0.5910801291465759, "learning_rate": 1e-05, "loss": 0.4588, "step": 449 }, { "epoch": 2.7135298563869994, "grad_norm": 0.7595794200897217, "learning_rate": 1e-05, "loss": 0.409, "step": 450 }, { "epoch": 2.7195767195767195, "grad_norm": 0.6292067170143127, "learning_rate": 1e-05, "loss": 0.4277, "step": 451 }, { "epoch": 2.72562358276644, "grad_norm": 0.6329100131988525, "learning_rate": 1e-05, "loss": 0.4468, "step": 452 }, { "epoch": 2.7316704459561603, "grad_norm": 0.6338825225830078, "learning_rate": 1e-05, "loss": 0.3999, "step": 453 }, { "epoch": 2.7377173091458804, "grad_norm": 0.673941433429718, "learning_rate": 1e-05, "loss": 0.432, "step": 454 }, { "epoch": 2.743764172335601, "grad_norm": 0.623754620552063, "learning_rate": 1e-05, "loss": 0.4163, "step": 455 }, { "epoch": 2.749811035525321, "grad_norm": 0.5795051455497742, "learning_rate": 1e-05, "loss": 0.4188, "step": 456 }, { "epoch": 2.7558578987150417, "grad_norm": 0.7059243321418762, "learning_rate": 1e-05, "loss": 0.4555, "step": 457 }, { "epoch": 2.761904761904762, "grad_norm": 0.6102821230888367, "learning_rate": 1e-05, "loss": 0.427, "step": 458 }, { "epoch": 2.767951625094482, "grad_norm": 0.6392788290977478, "learning_rate": 1e-05, "loss": 0.3798, "step": 459 }, { "epoch": 2.7739984882842026, "grad_norm": 0.6091472506523132, "learning_rate": 1e-05, "loss": 0.4238, "step": 460 }, { "epoch": 2.780045351473923, "grad_norm": 0.5836897492408752, "learning_rate": 1e-05, "loss": 0.4366, "step": 461 }, { "epoch": 2.7860922146636433, "grad_norm": 0.941039502620697, "learning_rate": 1e-05, "loss": 0.4046, "step": 462 }, { "epoch": 2.7921390778533635, "grad_norm": 0.5628529787063599, "learning_rate": 1e-05, "loss": 0.4208, "step": 463 }, { "epoch": 2.798185941043084, "grad_norm": 0.6621745228767395, "learning_rate": 1e-05, "loss": 0.4661, "step": 464 }, { "epoch": 2.804232804232804, "grad_norm": 0.6870080828666687, "learning_rate": 1e-05, "loss": 0.426, "step": 465 }, { "epoch": 2.8102796674225248, "grad_norm": 0.6435585618019104, "learning_rate": 1e-05, "loss": 0.4322, "step": 466 }, { "epoch": 2.816326530612245, "grad_norm": 0.6991519331932068, "learning_rate": 1e-05, "loss": 0.4353, "step": 467 }, { "epoch": 2.822373393801965, "grad_norm": 0.6273677945137024, "learning_rate": 1e-05, "loss": 0.4278, "step": 468 }, { "epoch": 2.8284202569916856, "grad_norm": 0.6428662538528442, "learning_rate": 1e-05, "loss": 0.4231, "step": 469 }, { "epoch": 2.834467120181406, "grad_norm": 0.6282612085342407, "learning_rate": 1e-05, "loss": 0.4293, "step": 470 }, { "epoch": 2.8405139833711264, "grad_norm": 0.5842074155807495, "learning_rate": 1e-05, "loss": 0.4042, "step": 471 }, { "epoch": 2.8465608465608465, "grad_norm": 0.5663440227508545, "learning_rate": 1e-05, "loss": 0.3981, "step": 472 }, { "epoch": 2.8526077097505667, "grad_norm": 0.6044777631759644, "learning_rate": 1e-05, "loss": 0.452, "step": 473 }, { "epoch": 2.8586545729402872, "grad_norm": 0.6199264526367188, "learning_rate": 1e-05, "loss": 0.4105, "step": 474 }, { "epoch": 2.864701436130008, "grad_norm": 0.7735686302185059, "learning_rate": 1e-05, "loss": 0.4152, "step": 475 }, { "epoch": 2.870748299319728, "grad_norm": 0.6115516424179077, "learning_rate": 1e-05, "loss": 0.4352, "step": 476 }, { "epoch": 2.876795162509448, "grad_norm": 0.6233912110328674, "learning_rate": 1e-05, "loss": 0.4364, "step": 477 }, { "epoch": 2.8828420256991687, "grad_norm": 0.6158405542373657, "learning_rate": 1e-05, "loss": 0.4125, "step": 478 }, { "epoch": 2.888888888888889, "grad_norm": 0.6264033317565918, "learning_rate": 1e-05, "loss": 0.3917, "step": 479 }, { "epoch": 2.8949357520786094, "grad_norm": 0.6061130166053772, "learning_rate": 1e-05, "loss": 0.4196, "step": 480 }, { "epoch": 2.9009826152683296, "grad_norm": 0.6103699207305908, "learning_rate": 1e-05, "loss": 0.419, "step": 481 }, { "epoch": 2.9070294784580497, "grad_norm": 0.5866860747337341, "learning_rate": 1e-05, "loss": 0.4061, "step": 482 }, { "epoch": 2.9130763416477703, "grad_norm": 0.6195345520973206, "learning_rate": 1e-05, "loss": 0.4366, "step": 483 }, { "epoch": 2.9191232048374904, "grad_norm": 0.5707806944847107, "learning_rate": 1e-05, "loss": 0.4078, "step": 484 }, { "epoch": 2.925170068027211, "grad_norm": 0.8734687566757202, "learning_rate": 1e-05, "loss": 0.4722, "step": 485 }, { "epoch": 2.931216931216931, "grad_norm": 0.8262805342674255, "learning_rate": 1e-05, "loss": 0.4092, "step": 486 }, { "epoch": 2.9372637944066513, "grad_norm": 0.6194629669189453, "learning_rate": 1e-05, "loss": 0.4468, "step": 487 }, { "epoch": 2.943310657596372, "grad_norm": 0.7542822360992432, "learning_rate": 1e-05, "loss": 0.4249, "step": 488 }, { "epoch": 2.9493575207860925, "grad_norm": 0.5416433215141296, "learning_rate": 1e-05, "loss": 0.4205, "step": 489 }, { "epoch": 2.9554043839758126, "grad_norm": 0.643470823764801, "learning_rate": 1e-05, "loss": 0.4317, "step": 490 }, { "epoch": 2.9614512471655328, "grad_norm": 0.6726208925247192, "learning_rate": 1e-05, "loss": 0.4134, "step": 491 }, { "epoch": 2.9674981103552533, "grad_norm": 0.6030561327934265, "learning_rate": 1e-05, "loss": 0.4592, "step": 492 }, { "epoch": 2.9735449735449735, "grad_norm": 0.7280339598655701, "learning_rate": 1e-05, "loss": 0.4823, "step": 493 }, { "epoch": 2.979591836734694, "grad_norm": 0.6342500448226929, "learning_rate": 1e-05, "loss": 0.427, "step": 494 }, { "epoch": 2.985638699924414, "grad_norm": 0.5872485041618347, "learning_rate": 1e-05, "loss": 0.437, "step": 495 }, { "epoch": 2.9916855631141344, "grad_norm": 0.6782248020172119, "learning_rate": 1e-05, "loss": 0.444, "step": 496 }, { "epoch": 2.997732426303855, "grad_norm": 0.6571045517921448, "learning_rate": 1e-05, "loss": 0.4124, "step": 497 }, { "epoch": 3.0, "grad_norm": 0.6176797151565552, "learning_rate": 1e-05, "loss": 0.1652, "step": 498 }, { "epoch": 3.00604686318972, "grad_norm": 0.7607374787330627, "learning_rate": 1e-05, "loss": 0.4272, "step": 499 }, { "epoch": 3.0120937263794407, "grad_norm": 0.6168681979179382, "learning_rate": 1e-05, "loss": 0.4137, "step": 500 }, { "epoch": 3.018140589569161, "grad_norm": 0.567806601524353, "learning_rate": 1e-05, "loss": 0.4076, "step": 501 }, { "epoch": 3.0241874527588815, "grad_norm": 0.6219955682754517, "learning_rate": 1e-05, "loss": 0.4316, "step": 502 }, { "epoch": 3.0302343159486016, "grad_norm": 0.6967265605926514, "learning_rate": 1e-05, "loss": 0.3997, "step": 503 }, { "epoch": 3.036281179138322, "grad_norm": 0.5977517366409302, "learning_rate": 1e-05, "loss": 0.3952, "step": 504 }, { "epoch": 3.0423280423280423, "grad_norm": 0.6792314052581787, "learning_rate": 1e-05, "loss": 0.3991, "step": 505 }, { "epoch": 3.0483749055177625, "grad_norm": 0.6674013733863831, "learning_rate": 1e-05, "loss": 0.4057, "step": 506 }, { "epoch": 3.054421768707483, "grad_norm": 0.5633118152618408, "learning_rate": 1e-05, "loss": 0.4253, "step": 507 }, { "epoch": 3.060468631897203, "grad_norm": 0.626979649066925, "learning_rate": 1e-05, "loss": 0.403, "step": 508 }, { "epoch": 3.066515495086924, "grad_norm": 0.6246337890625, "learning_rate": 1e-05, "loss": 0.4363, "step": 509 }, { "epoch": 3.072562358276644, "grad_norm": 0.6035377979278564, "learning_rate": 1e-05, "loss": 0.4018, "step": 510 }, { "epoch": 3.0786092214663645, "grad_norm": 0.600320041179657, "learning_rate": 1e-05, "loss": 0.4097, "step": 511 }, { "epoch": 3.0846560846560847, "grad_norm": 0.5939974784851074, "learning_rate": 1e-05, "loss": 0.4214, "step": 512 }, { "epoch": 3.090702947845805, "grad_norm": 0.5401872396469116, "learning_rate": 1e-05, "loss": 0.4239, "step": 513 }, { "epoch": 3.0967498110355254, "grad_norm": 0.683673083782196, "learning_rate": 1e-05, "loss": 0.4423, "step": 514 }, { "epoch": 3.1027966742252455, "grad_norm": 0.7111365795135498, "learning_rate": 1e-05, "loss": 0.3988, "step": 515 }, { "epoch": 3.108843537414966, "grad_norm": 0.6365870237350464, "learning_rate": 1e-05, "loss": 0.3994, "step": 516 }, { "epoch": 3.1148904006046862, "grad_norm": 0.763279378414154, "learning_rate": 1e-05, "loss": 0.4322, "step": 517 }, { "epoch": 3.120937263794407, "grad_norm": 0.7069273591041565, "learning_rate": 1e-05, "loss": 0.4419, "step": 518 }, { "epoch": 3.126984126984127, "grad_norm": 0.6043683290481567, "learning_rate": 1e-05, "loss": 0.428, "step": 519 }, { "epoch": 3.133030990173847, "grad_norm": 0.6091717481613159, "learning_rate": 1e-05, "loss": 0.4293, "step": 520 }, { "epoch": 3.1390778533635677, "grad_norm": 0.606554388999939, "learning_rate": 1e-05, "loss": 0.4245, "step": 521 }, { "epoch": 3.145124716553288, "grad_norm": 0.5879369974136353, "learning_rate": 1e-05, "loss": 0.4111, "step": 522 }, { "epoch": 3.1511715797430084, "grad_norm": 0.6110349893569946, "learning_rate": 1e-05, "loss": 0.412, "step": 523 }, { "epoch": 3.1572184429327286, "grad_norm": 0.5943782329559326, "learning_rate": 1e-05, "loss": 0.4107, "step": 524 }, { "epoch": 3.163265306122449, "grad_norm": 0.5901566743850708, "learning_rate": 1e-05, "loss": 0.4303, "step": 525 }, { "epoch": 3.1693121693121693, "grad_norm": 0.5924205183982849, "learning_rate": 1e-05, "loss": 0.4338, "step": 526 }, { "epoch": 3.1753590325018894, "grad_norm": 0.5981183648109436, "learning_rate": 1e-05, "loss": 0.4353, "step": 527 }, { "epoch": 3.18140589569161, "grad_norm": 0.5813708305358887, "learning_rate": 1e-05, "loss": 0.4409, "step": 528 }, { "epoch": 3.18745275888133, "grad_norm": 0.8089960813522339, "learning_rate": 1e-05, "loss": 0.4515, "step": 529 }, { "epoch": 3.1934996220710508, "grad_norm": 0.5774374008178711, "learning_rate": 1e-05, "loss": 0.3908, "step": 530 }, { "epoch": 3.199546485260771, "grad_norm": 0.6418164968490601, "learning_rate": 1e-05, "loss": 0.4445, "step": 531 }, { "epoch": 3.2055933484504915, "grad_norm": 0.6748641133308411, "learning_rate": 1e-05, "loss": 0.4272, "step": 532 }, { "epoch": 3.2116402116402116, "grad_norm": 0.6062082052230835, "learning_rate": 1e-05, "loss": 0.4271, "step": 533 }, { "epoch": 3.2176870748299318, "grad_norm": 0.7739962339401245, "learning_rate": 1e-05, "loss": 0.4334, "step": 534 }, { "epoch": 3.2237339380196524, "grad_norm": 0.5871445536613464, "learning_rate": 1e-05, "loss": 0.4188, "step": 535 }, { "epoch": 3.2297808012093725, "grad_norm": 0.565229594707489, "learning_rate": 1e-05, "loss": 0.4169, "step": 536 }, { "epoch": 3.235827664399093, "grad_norm": 0.6169576644897461, "learning_rate": 1e-05, "loss": 0.4031, "step": 537 }, { "epoch": 3.2418745275888132, "grad_norm": 0.7920488119125366, "learning_rate": 1e-05, "loss": 0.4568, "step": 538 }, { "epoch": 3.247921390778534, "grad_norm": 0.6632086038589478, "learning_rate": 1e-05, "loss": 0.4345, "step": 539 }, { "epoch": 3.253968253968254, "grad_norm": 0.6227561235427856, "learning_rate": 1e-05, "loss": 0.4205, "step": 540 }, { "epoch": 3.260015117157974, "grad_norm": 0.6597675085067749, "learning_rate": 1e-05, "loss": 0.4386, "step": 541 }, { "epoch": 3.2660619803476947, "grad_norm": 0.7808500528335571, "learning_rate": 1e-05, "loss": 0.4028, "step": 542 }, { "epoch": 3.272108843537415, "grad_norm": 0.5969167947769165, "learning_rate": 1e-05, "loss": 0.408, "step": 543 }, { "epoch": 3.2781557067271354, "grad_norm": 0.8019803166389465, "learning_rate": 1e-05, "loss": 0.4635, "step": 544 }, { "epoch": 3.2842025699168556, "grad_norm": 0.60172039270401, "learning_rate": 1e-05, "loss": 0.4076, "step": 545 }, { "epoch": 3.290249433106576, "grad_norm": 0.5975821018218994, "learning_rate": 1e-05, "loss": 0.4296, "step": 546 }, { "epoch": 3.2962962962962963, "grad_norm": 0.6326526403427124, "learning_rate": 1e-05, "loss": 0.4659, "step": 547 }, { "epoch": 3.302343159486017, "grad_norm": 0.6621552109718323, "learning_rate": 1e-05, "loss": 0.4115, "step": 548 }, { "epoch": 3.308390022675737, "grad_norm": 0.5504428148269653, "learning_rate": 1e-05, "loss": 0.4047, "step": 549 }, { "epoch": 3.314436885865457, "grad_norm": 0.6373119950294495, "learning_rate": 1e-05, "loss": 0.4365, "step": 550 }, { "epoch": 3.3204837490551777, "grad_norm": 0.654000997543335, "learning_rate": 1e-05, "loss": 0.4278, "step": 551 }, { "epoch": 3.326530612244898, "grad_norm": 0.6986036896705627, "learning_rate": 1e-05, "loss": 0.4188, "step": 552 }, { "epoch": 3.3325774754346185, "grad_norm": 0.5813942551612854, "learning_rate": 1e-05, "loss": 0.4034, "step": 553 }, { "epoch": 3.3386243386243386, "grad_norm": 0.6143907308578491, "learning_rate": 1e-05, "loss": 0.407, "step": 554 }, { "epoch": 3.3446712018140587, "grad_norm": 0.6602691411972046, "learning_rate": 1e-05, "loss": 0.4279, "step": 555 }, { "epoch": 3.3507180650037793, "grad_norm": 0.593880295753479, "learning_rate": 1e-05, "loss": 0.4002, "step": 556 }, { "epoch": 3.3567649281934995, "grad_norm": 0.5535792112350464, "learning_rate": 1e-05, "loss": 0.3858, "step": 557 }, { "epoch": 3.36281179138322, "grad_norm": 0.5866678953170776, "learning_rate": 1e-05, "loss": 0.3808, "step": 558 }, { "epoch": 3.36885865457294, "grad_norm": 0.6740050315856934, "learning_rate": 1e-05, "loss": 0.4206, "step": 559 }, { "epoch": 3.374905517762661, "grad_norm": 0.6608660221099854, "learning_rate": 1e-05, "loss": 0.4542, "step": 560 }, { "epoch": 3.380952380952381, "grad_norm": 0.6334772706031799, "learning_rate": 1e-05, "loss": 0.4257, "step": 561 }, { "epoch": 3.3869992441421015, "grad_norm": 0.5592483282089233, "learning_rate": 1e-05, "loss": 0.406, "step": 562 }, { "epoch": 3.3930461073318217, "grad_norm": 0.5821921825408936, "learning_rate": 1e-05, "loss": 0.4082, "step": 563 }, { "epoch": 3.399092970521542, "grad_norm": 0.635015606880188, "learning_rate": 1e-05, "loss": 0.4089, "step": 564 }, { "epoch": 3.4051398337112624, "grad_norm": 0.5655603408813477, "learning_rate": 1e-05, "loss": 0.3864, "step": 565 }, { "epoch": 3.4111866969009825, "grad_norm": 0.6066403985023499, "learning_rate": 1e-05, "loss": 0.4409, "step": 566 }, { "epoch": 3.417233560090703, "grad_norm": 0.73860764503479, "learning_rate": 1e-05, "loss": 0.4341, "step": 567 }, { "epoch": 3.4232804232804233, "grad_norm": 0.6196745038032532, "learning_rate": 1e-05, "loss": 0.4136, "step": 568 }, { "epoch": 3.4293272864701434, "grad_norm": 0.590583860874176, "learning_rate": 1e-05, "loss": 0.4046, "step": 569 }, { "epoch": 3.435374149659864, "grad_norm": 0.6130207180976868, "learning_rate": 1e-05, "loss": 0.4503, "step": 570 }, { "epoch": 3.441421012849584, "grad_norm": 0.6346332430839539, "learning_rate": 1e-05, "loss": 0.4137, "step": 571 }, { "epoch": 3.4474678760393047, "grad_norm": 0.7304969429969788, "learning_rate": 1e-05, "loss": 0.402, "step": 572 }, { "epoch": 3.453514739229025, "grad_norm": 0.7201123237609863, "learning_rate": 1e-05, "loss": 0.4201, "step": 573 }, { "epoch": 3.4595616024187454, "grad_norm": 0.6254672408103943, "learning_rate": 1e-05, "loss": 0.3997, "step": 574 }, { "epoch": 3.4656084656084656, "grad_norm": 0.5806301236152649, "learning_rate": 1e-05, "loss": 0.4037, "step": 575 }, { "epoch": 3.471655328798186, "grad_norm": 0.7682508826255798, "learning_rate": 1e-05, "loss": 0.4276, "step": 576 }, { "epoch": 3.4777021919879063, "grad_norm": 0.6343861818313599, "learning_rate": 1e-05, "loss": 0.4263, "step": 577 }, { "epoch": 3.4837490551776265, "grad_norm": 0.7493196725845337, "learning_rate": 1e-05, "loss": 0.4133, "step": 578 }, { "epoch": 3.489795918367347, "grad_norm": 0.5379742980003357, "learning_rate": 1e-05, "loss": 0.404, "step": 579 }, { "epoch": 3.495842781557067, "grad_norm": 0.6693356037139893, "learning_rate": 1e-05, "loss": 0.418, "step": 580 }, { "epoch": 3.5018896447467878, "grad_norm": 0.7691015005111694, "learning_rate": 1e-05, "loss": 0.4291, "step": 581 }, { "epoch": 3.507936507936508, "grad_norm": 0.6868825554847717, "learning_rate": 1e-05, "loss": 0.4184, "step": 582 }, { "epoch": 3.513983371126228, "grad_norm": 0.6583042740821838, "learning_rate": 1e-05, "loss": 0.4437, "step": 583 }, { "epoch": 3.5200302343159486, "grad_norm": 0.5697051882743835, "learning_rate": 1e-05, "loss": 0.4281, "step": 584 }, { "epoch": 3.526077097505669, "grad_norm": 0.6466888189315796, "learning_rate": 1e-05, "loss": 0.4274, "step": 585 }, { "epoch": 3.5321239606953894, "grad_norm": 0.5636702179908752, "learning_rate": 1e-05, "loss": 0.4258, "step": 586 }, { "epoch": 3.5381708238851095, "grad_norm": 0.6343228816986084, "learning_rate": 1e-05, "loss": 0.4168, "step": 587 }, { "epoch": 3.54421768707483, "grad_norm": 0.6140140295028687, "learning_rate": 1e-05, "loss": 0.4243, "step": 588 }, { "epoch": 3.5502645502645502, "grad_norm": 0.607572615146637, "learning_rate": 1e-05, "loss": 0.41, "step": 589 }, { "epoch": 3.556311413454271, "grad_norm": 0.5617547035217285, "learning_rate": 1e-05, "loss": 0.4069, "step": 590 }, { "epoch": 3.562358276643991, "grad_norm": 0.6026951670646667, "learning_rate": 1e-05, "loss": 0.4413, "step": 591 }, { "epoch": 3.568405139833711, "grad_norm": 0.6141639947891235, "learning_rate": 1e-05, "loss": 0.3904, "step": 592 }, { "epoch": 3.5744520030234317, "grad_norm": 0.586068868637085, "learning_rate": 1e-05, "loss": 0.4326, "step": 593 }, { "epoch": 3.580498866213152, "grad_norm": 0.5956772565841675, "learning_rate": 1e-05, "loss": 0.4322, "step": 594 }, { "epoch": 3.5865457294028724, "grad_norm": 0.6121401190757751, "learning_rate": 1e-05, "loss": 0.4138, "step": 595 }, { "epoch": 3.5925925925925926, "grad_norm": 0.5967589020729065, "learning_rate": 1e-05, "loss": 0.4007, "step": 596 }, { "epoch": 3.5986394557823127, "grad_norm": 0.6019216775894165, "learning_rate": 1e-05, "loss": 0.4289, "step": 597 }, { "epoch": 3.6046863189720333, "grad_norm": 0.6205087304115295, "learning_rate": 1e-05, "loss": 0.4053, "step": 598 }, { "epoch": 3.6107331821617534, "grad_norm": 0.6043209433555603, "learning_rate": 1e-05, "loss": 0.3946, "step": 599 }, { "epoch": 3.616780045351474, "grad_norm": 0.6003085970878601, "learning_rate": 1e-05, "loss": 0.4228, "step": 600 }, { "epoch": 3.622826908541194, "grad_norm": 0.5430454015731812, "learning_rate": 1e-05, "loss": 0.4159, "step": 601 }, { "epoch": 3.6288737717309147, "grad_norm": 0.6331672072410583, "learning_rate": 1e-05, "loss": 0.4396, "step": 602 }, { "epoch": 3.634920634920635, "grad_norm": 0.613858699798584, "learning_rate": 1e-05, "loss": 0.4219, "step": 603 }, { "epoch": 3.6409674981103555, "grad_norm": 0.6415857672691345, "learning_rate": 1e-05, "loss": 0.44, "step": 604 }, { "epoch": 3.6470143613000756, "grad_norm": 0.6424372792243958, "learning_rate": 1e-05, "loss": 0.4341, "step": 605 }, { "epoch": 3.6530612244897958, "grad_norm": 0.587757408618927, "learning_rate": 1e-05, "loss": 0.4081, "step": 606 }, { "epoch": 3.6591080876795163, "grad_norm": 0.696824848651886, "learning_rate": 1e-05, "loss": 0.416, "step": 607 }, { "epoch": 3.6651549508692365, "grad_norm": 0.6960793137550354, "learning_rate": 1e-05, "loss": 0.4465, "step": 608 }, { "epoch": 3.671201814058957, "grad_norm": 0.5955398678779602, "learning_rate": 1e-05, "loss": 0.4071, "step": 609 }, { "epoch": 3.677248677248677, "grad_norm": 0.6189931631088257, "learning_rate": 1e-05, "loss": 0.4291, "step": 610 }, { "epoch": 3.6832955404383974, "grad_norm": 0.682124137878418, "learning_rate": 1e-05, "loss": 0.4245, "step": 611 }, { "epoch": 3.689342403628118, "grad_norm": 0.5930956602096558, "learning_rate": 1e-05, "loss": 0.4261, "step": 612 }, { "epoch": 3.695389266817838, "grad_norm": 0.6003082394599915, "learning_rate": 1e-05, "loss": 0.4107, "step": 613 }, { "epoch": 3.7014361300075587, "grad_norm": 0.6489179134368896, "learning_rate": 1e-05, "loss": 0.4542, "step": 614 }, { "epoch": 3.707482993197279, "grad_norm": 0.7475566267967224, "learning_rate": 1e-05, "loss": 0.4709, "step": 615 }, { "epoch": 3.7135298563869994, "grad_norm": 0.5824201703071594, "learning_rate": 1e-05, "loss": 0.3928, "step": 616 }, { "epoch": 3.7195767195767195, "grad_norm": 0.584107518196106, "learning_rate": 1e-05, "loss": 0.4183, "step": 617 }, { "epoch": 3.72562358276644, "grad_norm": 0.5755361914634705, "learning_rate": 1e-05, "loss": 0.4064, "step": 618 }, { "epoch": 3.7316704459561603, "grad_norm": 0.5795572400093079, "learning_rate": 1e-05, "loss": 0.4163, "step": 619 }, { "epoch": 3.7377173091458804, "grad_norm": 0.6164476275444031, "learning_rate": 1e-05, "loss": 0.3858, "step": 620 }, { "epoch": 3.743764172335601, "grad_norm": 0.6104868650436401, "learning_rate": 1e-05, "loss": 0.3961, "step": 621 }, { "epoch": 3.749811035525321, "grad_norm": 0.6502463817596436, "learning_rate": 1e-05, "loss": 0.4331, "step": 622 }, { "epoch": 3.7558578987150417, "grad_norm": 0.696357786655426, "learning_rate": 1e-05, "loss": 0.4199, "step": 623 }, { "epoch": 3.761904761904762, "grad_norm": 0.6037179827690125, "learning_rate": 1e-05, "loss": 0.423, "step": 624 }, { "epoch": 3.767951625094482, "grad_norm": 0.5953124761581421, "learning_rate": 1e-05, "loss": 0.3989, "step": 625 }, { "epoch": 3.7739984882842026, "grad_norm": 0.6055779457092285, "learning_rate": 1e-05, "loss": 0.402, "step": 626 }, { "epoch": 3.780045351473923, "grad_norm": 0.5701073408126831, "learning_rate": 1e-05, "loss": 0.3961, "step": 627 }, { "epoch": 3.7860922146636433, "grad_norm": 0.6233461499214172, "learning_rate": 1e-05, "loss": 0.4369, "step": 628 }, { "epoch": 3.7921390778533635, "grad_norm": 0.6432021260261536, "learning_rate": 1e-05, "loss": 0.4354, "step": 629 }, { "epoch": 3.798185941043084, "grad_norm": 0.5663430094718933, "learning_rate": 1e-05, "loss": 0.4096, "step": 630 }, { "epoch": 3.804232804232804, "grad_norm": 0.6619196534156799, "learning_rate": 1e-05, "loss": 0.4566, "step": 631 }, { "epoch": 3.8102796674225248, "grad_norm": 0.5850746631622314, "learning_rate": 1e-05, "loss": 0.4264, "step": 632 }, { "epoch": 3.816326530612245, "grad_norm": 0.6600084900856018, "learning_rate": 1e-05, "loss": 0.4434, "step": 633 }, { "epoch": 3.822373393801965, "grad_norm": 0.789292573928833, "learning_rate": 1e-05, "loss": 0.4206, "step": 634 }, { "epoch": 3.8284202569916856, "grad_norm": 0.6029325723648071, "learning_rate": 1e-05, "loss": 0.4122, "step": 635 }, { "epoch": 3.834467120181406, "grad_norm": 0.6007590293884277, "learning_rate": 1e-05, "loss": 0.418, "step": 636 }, { "epoch": 3.8405139833711264, "grad_norm": 0.6150406002998352, "learning_rate": 1e-05, "loss": 0.4161, "step": 637 }, { "epoch": 3.8465608465608465, "grad_norm": 0.5676735639572144, "learning_rate": 1e-05, "loss": 0.3971, "step": 638 }, { "epoch": 3.8526077097505667, "grad_norm": 0.7282530665397644, "learning_rate": 1e-05, "loss": 0.4172, "step": 639 }, { "epoch": 3.8586545729402872, "grad_norm": 0.5913415551185608, "learning_rate": 1e-05, "loss": 0.4367, "step": 640 }, { "epoch": 3.864701436130008, "grad_norm": 0.6281553506851196, "learning_rate": 1e-05, "loss": 0.4065, "step": 641 }, { "epoch": 3.870748299319728, "grad_norm": 0.6268578767776489, "learning_rate": 1e-05, "loss": 0.4314, "step": 642 }, { "epoch": 3.876795162509448, "grad_norm": 0.5916160345077515, "learning_rate": 1e-05, "loss": 0.4319, "step": 643 }, { "epoch": 3.8828420256991687, "grad_norm": 0.5768805742263794, "learning_rate": 1e-05, "loss": 0.4175, "step": 644 }, { "epoch": 3.888888888888889, "grad_norm": 0.6338863968849182, "learning_rate": 1e-05, "loss": 0.4041, "step": 645 }, { "epoch": 3.8949357520786094, "grad_norm": 0.6433572769165039, "learning_rate": 1e-05, "loss": 0.4349, "step": 646 }, { "epoch": 3.9009826152683296, "grad_norm": 0.6549029350280762, "learning_rate": 1e-05, "loss": 0.4177, "step": 647 }, { "epoch": 3.9070294784580497, "grad_norm": 0.619968831539154, "learning_rate": 1e-05, "loss": 0.4017, "step": 648 }, { "epoch": 3.9130763416477703, "grad_norm": 0.6248337030410767, "learning_rate": 1e-05, "loss": 0.3765, "step": 649 }, { "epoch": 3.9191232048374904, "grad_norm": 0.6068743467330933, "learning_rate": 1e-05, "loss": 0.4239, "step": 650 }, { "epoch": 3.925170068027211, "grad_norm": 0.6443238258361816, "learning_rate": 1e-05, "loss": 0.4259, "step": 651 }, { "epoch": 3.931216931216931, "grad_norm": 0.6279610395431519, "learning_rate": 1e-05, "loss": 0.4186, "step": 652 }, { "epoch": 3.9372637944066513, "grad_norm": 0.6126477122306824, "learning_rate": 1e-05, "loss": 0.4185, "step": 653 }, { "epoch": 3.943310657596372, "grad_norm": 0.5992019772529602, "learning_rate": 1e-05, "loss": 0.424, "step": 654 }, { "epoch": 3.9493575207860925, "grad_norm": 0.6540263295173645, "learning_rate": 1e-05, "loss": 0.4158, "step": 655 }, { "epoch": 3.9554043839758126, "grad_norm": 0.635087788105011, "learning_rate": 1e-05, "loss": 0.4123, "step": 656 }, { "epoch": 3.9614512471655328, "grad_norm": 0.7051719427108765, "learning_rate": 1e-05, "loss": 0.4263, "step": 657 }, { "epoch": 3.9674981103552533, "grad_norm": 0.6558767557144165, "learning_rate": 1e-05, "loss": 0.4503, "step": 658 }, { "epoch": 3.9735449735449735, "grad_norm": 0.7922623157501221, "learning_rate": 1e-05, "loss": 0.4316, "step": 659 }, { "epoch": 3.979591836734694, "grad_norm": 0.6541545391082764, "learning_rate": 1e-05, "loss": 0.3954, "step": 660 }, { "epoch": 3.985638699924414, "grad_norm": 0.5941789150238037, "learning_rate": 1e-05, "loss": 0.4252, "step": 661 }, { "epoch": 3.9916855631141344, "grad_norm": 0.6134931445121765, "learning_rate": 1e-05, "loss": 0.416, "step": 662 }, { "epoch": 3.997732426303855, "grad_norm": 0.6220409274101257, "learning_rate": 1e-05, "loss": 0.4352, "step": 663 }, { "epoch": 4.0, "grad_norm": 0.6220409274101257, "learning_rate": 1e-05, "loss": 0.1538, "step": 664 }, { "epoch": 4.00604686318972, "grad_norm": 0.8188051581382751, "learning_rate": 1e-05, "loss": 0.4282, "step": 665 }, { "epoch": 4.01209372637944, "grad_norm": 0.6495438814163208, "learning_rate": 1e-05, "loss": 0.4127, "step": 666 }, { "epoch": 4.018140589569161, "grad_norm": 0.6934894919395447, "learning_rate": 1e-05, "loss": 0.4031, "step": 667 }, { "epoch": 4.0241874527588815, "grad_norm": 0.5819240212440491, "learning_rate": 1e-05, "loss": 0.4029, "step": 668 }, { "epoch": 4.030234315948602, "grad_norm": 0.6669180393218994, "learning_rate": 1e-05, "loss": 0.4291, "step": 669 }, { "epoch": 4.036281179138322, "grad_norm": 0.6001705527305603, "learning_rate": 1e-05, "loss": 0.3958, "step": 670 }, { "epoch": 4.042328042328043, "grad_norm": 0.6176862120628357, "learning_rate": 1e-05, "loss": 0.4049, "step": 671 }, { "epoch": 4.048374905517763, "grad_norm": 0.6462976932525635, "learning_rate": 1e-05, "loss": 0.3935, "step": 672 }, { "epoch": 4.054421768707483, "grad_norm": 0.5445351004600525, "learning_rate": 1e-05, "loss": 0.4166, "step": 673 }, { "epoch": 4.060468631897203, "grad_norm": 0.5651801228523254, "learning_rate": 1e-05, "loss": 0.3931, "step": 674 }, { "epoch": 4.066515495086923, "grad_norm": 0.6144750118255615, "learning_rate": 1e-05, "loss": 0.4088, "step": 675 }, { "epoch": 4.072562358276644, "grad_norm": 0.5723121166229248, "learning_rate": 1e-05, "loss": 0.4167, "step": 676 }, { "epoch": 4.0786092214663645, "grad_norm": 0.5866492986679077, "learning_rate": 1e-05, "loss": 0.3948, "step": 677 }, { "epoch": 4.084656084656085, "grad_norm": 0.6020256876945496, "learning_rate": 1e-05, "loss": 0.3814, "step": 678 }, { "epoch": 4.090702947845805, "grad_norm": 0.5681604146957397, "learning_rate": 1e-05, "loss": 0.3954, "step": 679 }, { "epoch": 4.096749811035525, "grad_norm": 0.6793861985206604, "learning_rate": 1e-05, "loss": 0.4192, "step": 680 }, { "epoch": 4.102796674225246, "grad_norm": 0.6970557570457458, "learning_rate": 1e-05, "loss": 0.4399, "step": 681 }, { "epoch": 4.108843537414966, "grad_norm": 0.6030147075653076, "learning_rate": 1e-05, "loss": 0.3996, "step": 682 }, { "epoch": 4.114890400604686, "grad_norm": 0.6158145666122437, "learning_rate": 1e-05, "loss": 0.4184, "step": 683 }, { "epoch": 4.120937263794406, "grad_norm": 0.5950566530227661, "learning_rate": 1e-05, "loss": 0.3796, "step": 684 }, { "epoch": 4.1269841269841265, "grad_norm": 0.6540164351463318, "learning_rate": 1e-05, "loss": 0.4148, "step": 685 }, { "epoch": 4.133030990173848, "grad_norm": 0.5669929385185242, "learning_rate": 1e-05, "loss": 0.4066, "step": 686 }, { "epoch": 4.139077853363568, "grad_norm": 0.6367812752723694, "learning_rate": 1e-05, "loss": 0.4128, "step": 687 }, { "epoch": 4.145124716553288, "grad_norm": 0.6450753808021545, "learning_rate": 1e-05, "loss": 0.4131, "step": 688 }, { "epoch": 4.151171579743008, "grad_norm": 0.6737693548202515, "learning_rate": 1e-05, "loss": 0.4166, "step": 689 }, { "epoch": 4.157218442932729, "grad_norm": 0.6024237275123596, "learning_rate": 1e-05, "loss": 0.4011, "step": 690 }, { "epoch": 4.163265306122449, "grad_norm": 0.6005855202674866, "learning_rate": 1e-05, "loss": 0.4088, "step": 691 }, { "epoch": 4.169312169312169, "grad_norm": 0.5795685052871704, "learning_rate": 1e-05, "loss": 0.436, "step": 692 }, { "epoch": 4.1753590325018894, "grad_norm": 0.6175339818000793, "learning_rate": 1e-05, "loss": 0.4077, "step": 693 }, { "epoch": 4.18140589569161, "grad_norm": 0.6956577301025391, "learning_rate": 1e-05, "loss": 0.4356, "step": 694 }, { "epoch": 4.187452758881331, "grad_norm": 0.6014675498008728, "learning_rate": 1e-05, "loss": 0.3906, "step": 695 }, { "epoch": 4.193499622071051, "grad_norm": 0.5440590977668762, "learning_rate": 1e-05, "loss": 0.3954, "step": 696 }, { "epoch": 4.199546485260771, "grad_norm": 0.5932565331459045, "learning_rate": 1e-05, "loss": 0.4047, "step": 697 }, { "epoch": 4.205593348450491, "grad_norm": 0.6294877529144287, "learning_rate": 1e-05, "loss": 0.4207, "step": 698 }, { "epoch": 4.211640211640212, "grad_norm": 0.6184750199317932, "learning_rate": 1e-05, "loss": 0.403, "step": 699 }, { "epoch": 4.217687074829932, "grad_norm": 0.6708357334136963, "learning_rate": 1e-05, "loss": 0.4313, "step": 700 }, { "epoch": 4.223733938019652, "grad_norm": 0.7957239151000977, "learning_rate": 1e-05, "loss": 0.3983, "step": 701 }, { "epoch": 4.2297808012093725, "grad_norm": 0.6050925850868225, "learning_rate": 1e-05, "loss": 0.4267, "step": 702 }, { "epoch": 4.235827664399093, "grad_norm": 0.8065488338470459, "learning_rate": 1e-05, "loss": 0.4271, "step": 703 }, { "epoch": 4.241874527588814, "grad_norm": 0.7302738428115845, "learning_rate": 1e-05, "loss": 0.4266, "step": 704 }, { "epoch": 4.247921390778534, "grad_norm": 0.590469479560852, "learning_rate": 1e-05, "loss": 0.4031, "step": 705 }, { "epoch": 4.253968253968254, "grad_norm": 0.5590572953224182, "learning_rate": 1e-05, "loss": 0.3743, "step": 706 }, { "epoch": 4.260015117157974, "grad_norm": 0.6213292479515076, "learning_rate": 1e-05, "loss": 0.424, "step": 707 }, { "epoch": 4.266061980347694, "grad_norm": 0.6214923858642578, "learning_rate": 1e-05, "loss": 0.4005, "step": 708 }, { "epoch": 4.272108843537415, "grad_norm": 0.758690595626831, "learning_rate": 1e-05, "loss": 0.4001, "step": 709 }, { "epoch": 4.278155706727135, "grad_norm": 0.6100674867630005, "learning_rate": 1e-05, "loss": 0.419, "step": 710 }, { "epoch": 4.2842025699168556, "grad_norm": 0.605494499206543, "learning_rate": 1e-05, "loss": 0.3776, "step": 711 }, { "epoch": 4.290249433106576, "grad_norm": 0.5499102473258972, "learning_rate": 1e-05, "loss": 0.4153, "step": 712 }, { "epoch": 4.296296296296296, "grad_norm": 0.6226376891136169, "learning_rate": 1e-05, "loss": 0.4133, "step": 713 }, { "epoch": 4.302343159486017, "grad_norm": 0.6211143732070923, "learning_rate": 1e-05, "loss": 0.4022, "step": 714 }, { "epoch": 4.308390022675737, "grad_norm": 0.5623895525932312, "learning_rate": 1e-05, "loss": 0.4062, "step": 715 }, { "epoch": 4.314436885865457, "grad_norm": 0.6161671876907349, "learning_rate": 1e-05, "loss": 0.4026, "step": 716 }, { "epoch": 4.320483749055177, "grad_norm": 0.5893692970275879, "learning_rate": 1e-05, "loss": 0.409, "step": 717 }, { "epoch": 4.326530612244898, "grad_norm": 0.5655657649040222, "learning_rate": 1e-05, "loss": 0.3956, "step": 718 }, { "epoch": 4.3325774754346185, "grad_norm": 0.7444940805435181, "learning_rate": 1e-05, "loss": 0.3797, "step": 719 }, { "epoch": 4.338624338624339, "grad_norm": 0.5602829456329346, "learning_rate": 1e-05, "loss": 0.3996, "step": 720 }, { "epoch": 4.344671201814059, "grad_norm": 0.5943112969398499, "learning_rate": 1e-05, "loss": 0.4277, "step": 721 }, { "epoch": 4.350718065003779, "grad_norm": 0.5991412997245789, "learning_rate": 1e-05, "loss": 0.4101, "step": 722 }, { "epoch": 4.3567649281935, "grad_norm": 0.6174283623695374, "learning_rate": 1e-05, "loss": 0.4248, "step": 723 }, { "epoch": 4.36281179138322, "grad_norm": 0.6071466207504272, "learning_rate": 1e-05, "loss": 0.3971, "step": 724 }, { "epoch": 4.36885865457294, "grad_norm": 0.5959432721138, "learning_rate": 1e-05, "loss": 0.4167, "step": 725 }, { "epoch": 4.37490551776266, "grad_norm": 0.5944430828094482, "learning_rate": 1e-05, "loss": 0.4203, "step": 726 }, { "epoch": 4.380952380952381, "grad_norm": 0.557955801486969, "learning_rate": 1e-05, "loss": 0.4217, "step": 727 }, { "epoch": 4.3869992441421015, "grad_norm": 0.6216171383857727, "learning_rate": 1e-05, "loss": 0.405, "step": 728 }, { "epoch": 4.393046107331822, "grad_norm": 0.5890682935714722, "learning_rate": 1e-05, "loss": 0.4256, "step": 729 }, { "epoch": 4.399092970521542, "grad_norm": 0.6060634255409241, "learning_rate": 1e-05, "loss": 0.4121, "step": 730 }, { "epoch": 4.405139833711262, "grad_norm": 0.5492538809776306, "learning_rate": 1e-05, "loss": 0.3851, "step": 731 }, { "epoch": 4.411186696900983, "grad_norm": 0.5372482538223267, "learning_rate": 1e-05, "loss": 0.3787, "step": 732 }, { "epoch": 4.417233560090703, "grad_norm": 0.5772688388824463, "learning_rate": 1e-05, "loss": 0.4291, "step": 733 }, { "epoch": 4.423280423280423, "grad_norm": 0.5881582498550415, "learning_rate": 1e-05, "loss": 0.4151, "step": 734 }, { "epoch": 4.429327286470143, "grad_norm": 0.5601872205734253, "learning_rate": 1e-05, "loss": 0.3884, "step": 735 }, { "epoch": 4.4353741496598635, "grad_norm": 0.6866946220397949, "learning_rate": 1e-05, "loss": 0.4298, "step": 736 }, { "epoch": 4.441421012849585, "grad_norm": 0.5926361680030823, "learning_rate": 1e-05, "loss": 0.397, "step": 737 }, { "epoch": 4.447467876039305, "grad_norm": 0.5584942102432251, "learning_rate": 1e-05, "loss": 0.3888, "step": 738 }, { "epoch": 4.453514739229025, "grad_norm": 0.5899773836135864, "learning_rate": 1e-05, "loss": 0.4017, "step": 739 }, { "epoch": 4.459561602418745, "grad_norm": 0.6079036593437195, "learning_rate": 1e-05, "loss": 0.4164, "step": 740 }, { "epoch": 4.465608465608465, "grad_norm": 0.6349128484725952, "learning_rate": 1e-05, "loss": 0.3995, "step": 741 }, { "epoch": 4.471655328798186, "grad_norm": 0.6118564009666443, "learning_rate": 1e-05, "loss": 0.4312, "step": 742 }, { "epoch": 4.477702191987906, "grad_norm": 0.6407651305198669, "learning_rate": 1e-05, "loss": 0.3964, "step": 743 }, { "epoch": 4.4837490551776265, "grad_norm": 0.6188228130340576, "learning_rate": 1e-05, "loss": 0.3766, "step": 744 }, { "epoch": 4.489795918367347, "grad_norm": 0.5605167746543884, "learning_rate": 1e-05, "loss": 0.3955, "step": 745 }, { "epoch": 4.495842781557068, "grad_norm": 0.5918389558792114, "learning_rate": 1e-05, "loss": 0.3855, "step": 746 }, { "epoch": 4.501889644746788, "grad_norm": 0.5756183862686157, "learning_rate": 1e-05, "loss": 0.411, "step": 747 }, { "epoch": 4.507936507936508, "grad_norm": 0.784270703792572, "learning_rate": 1e-05, "loss": 0.4183, "step": 748 }, { "epoch": 4.513983371126228, "grad_norm": 0.6550467014312744, "learning_rate": 1e-05, "loss": 0.3907, "step": 749 }, { "epoch": 4.520030234315948, "grad_norm": 0.5685514211654663, "learning_rate": 1e-05, "loss": 0.3843, "step": 750 }, { "epoch": 4.526077097505669, "grad_norm": 0.7023878693580627, "learning_rate": 1e-05, "loss": 0.4104, "step": 751 }, { "epoch": 4.532123960695389, "grad_norm": 0.6227160692214966, "learning_rate": 1e-05, "loss": 0.3987, "step": 752 }, { "epoch": 4.5381708238851095, "grad_norm": 0.6170135736465454, "learning_rate": 1e-05, "loss": 0.4161, "step": 753 }, { "epoch": 4.54421768707483, "grad_norm": 0.6479912996292114, "learning_rate": 1e-05, "loss": 0.4097, "step": 754 }, { "epoch": 4.550264550264551, "grad_norm": 0.7180340886116028, "learning_rate": 1e-05, "loss": 0.4456, "step": 755 }, { "epoch": 4.556311413454271, "grad_norm": 0.6167806386947632, "learning_rate": 1e-05, "loss": 0.4129, "step": 756 }, { "epoch": 4.562358276643991, "grad_norm": 0.6317205429077148, "learning_rate": 1e-05, "loss": 0.3984, "step": 757 }, { "epoch": 4.568405139833711, "grad_norm": 0.5577670335769653, "learning_rate": 1e-05, "loss": 0.4042, "step": 758 }, { "epoch": 4.574452003023431, "grad_norm": 0.6305320262908936, "learning_rate": 1e-05, "loss": 0.417, "step": 759 }, { "epoch": 4.580498866213152, "grad_norm": 0.5920323729515076, "learning_rate": 1e-05, "loss": 0.4123, "step": 760 }, { "epoch": 4.586545729402872, "grad_norm": 0.6055198311805725, "learning_rate": 1e-05, "loss": 0.3766, "step": 761 }, { "epoch": 4.592592592592593, "grad_norm": 0.5790096521377563, "learning_rate": 1e-05, "loss": 0.392, "step": 762 }, { "epoch": 4.598639455782313, "grad_norm": 0.6098610162734985, "learning_rate": 1e-05, "loss": 0.4021, "step": 763 }, { "epoch": 4.604686318972034, "grad_norm": 0.5918252468109131, "learning_rate": 1e-05, "loss": 0.3949, "step": 764 }, { "epoch": 4.610733182161754, "grad_norm": 0.7322283387184143, "learning_rate": 1e-05, "loss": 0.388, "step": 765 }, { "epoch": 4.616780045351474, "grad_norm": 0.6204307079315186, "learning_rate": 1e-05, "loss": 0.3953, "step": 766 }, { "epoch": 4.622826908541194, "grad_norm": 0.6196629405021667, "learning_rate": 1e-05, "loss": 0.426, "step": 767 }, { "epoch": 4.628873771730914, "grad_norm": 0.5979423522949219, "learning_rate": 1e-05, "loss": 0.4033, "step": 768 }, { "epoch": 4.634920634920634, "grad_norm": 0.6183368563652039, "learning_rate": 1e-05, "loss": 0.3807, "step": 769 }, { "epoch": 4.6409674981103555, "grad_norm": 0.5708688497543335, "learning_rate": 1e-05, "loss": 0.3905, "step": 770 }, { "epoch": 4.647014361300076, "grad_norm": 0.7033713459968567, "learning_rate": 1e-05, "loss": 0.4224, "step": 771 }, { "epoch": 4.653061224489796, "grad_norm": 0.6814243197441101, "learning_rate": 1e-05, "loss": 0.4306, "step": 772 }, { "epoch": 4.659108087679516, "grad_norm": 0.6684714555740356, "learning_rate": 1e-05, "loss": 0.405, "step": 773 }, { "epoch": 4.665154950869237, "grad_norm": 0.7152442932128906, "learning_rate": 1e-05, "loss": 0.3969, "step": 774 }, { "epoch": 4.671201814058957, "grad_norm": 0.5884862542152405, "learning_rate": 1e-05, "loss": 0.4189, "step": 775 }, { "epoch": 4.677248677248677, "grad_norm": 0.5732600092887878, "learning_rate": 1e-05, "loss": 0.3967, "step": 776 }, { "epoch": 4.683295540438397, "grad_norm": 0.589526355266571, "learning_rate": 1e-05, "loss": 0.4079, "step": 777 }, { "epoch": 4.6893424036281175, "grad_norm": 0.6696410179138184, "learning_rate": 1e-05, "loss": 0.4141, "step": 778 }, { "epoch": 4.6953892668178385, "grad_norm": 0.5552047491073608, "learning_rate": 1e-05, "loss": 0.4089, "step": 779 }, { "epoch": 4.701436130007559, "grad_norm": 0.6207208037376404, "learning_rate": 1e-05, "loss": 0.4143, "step": 780 }, { "epoch": 4.707482993197279, "grad_norm": 0.6729224324226379, "learning_rate": 1e-05, "loss": 0.409, "step": 781 }, { "epoch": 4.713529856386999, "grad_norm": 0.599002480506897, "learning_rate": 1e-05, "loss": 0.4238, "step": 782 }, { "epoch": 4.71957671957672, "grad_norm": 0.589688777923584, "learning_rate": 1e-05, "loss": 0.407, "step": 783 }, { "epoch": 4.72562358276644, "grad_norm": 0.6342570185661316, "learning_rate": 1e-05, "loss": 0.4017, "step": 784 }, { "epoch": 4.73167044595616, "grad_norm": 0.59300297498703, "learning_rate": 1e-05, "loss": 0.4174, "step": 785 }, { "epoch": 4.73771730914588, "grad_norm": 0.6077319979667664, "learning_rate": 1e-05, "loss": 0.4059, "step": 786 }, { "epoch": 4.7437641723356005, "grad_norm": 0.6765458583831787, "learning_rate": 1e-05, "loss": 0.4251, "step": 787 }, { "epoch": 4.749811035525322, "grad_norm": 0.685330331325531, "learning_rate": 1e-05, "loss": 0.4224, "step": 788 }, { "epoch": 4.755857898715042, "grad_norm": 0.6128014922142029, "learning_rate": 1e-05, "loss": 0.408, "step": 789 }, { "epoch": 4.761904761904762, "grad_norm": 0.6340499520301819, "learning_rate": 1e-05, "loss": 0.3809, "step": 790 }, { "epoch": 4.767951625094482, "grad_norm": 0.6031668186187744, "learning_rate": 1e-05, "loss": 0.3903, "step": 791 }, { "epoch": 4.773998488284203, "grad_norm": 0.5895487666130066, "learning_rate": 1e-05, "loss": 0.399, "step": 792 }, { "epoch": 4.780045351473923, "grad_norm": 0.6340817213058472, "learning_rate": 1e-05, "loss": 0.3962, "step": 793 }, { "epoch": 4.786092214663643, "grad_norm": 0.6243227124214172, "learning_rate": 1e-05, "loss": 0.4328, "step": 794 }, { "epoch": 4.7921390778533635, "grad_norm": 0.5987825393676758, "learning_rate": 1e-05, "loss": 0.4015, "step": 795 }, { "epoch": 4.798185941043084, "grad_norm": 0.6007119417190552, "learning_rate": 1e-05, "loss": 0.4086, "step": 796 }, { "epoch": 4.804232804232804, "grad_norm": 0.6575354933738708, "learning_rate": 1e-05, "loss": 0.4044, "step": 797 }, { "epoch": 4.810279667422525, "grad_norm": 0.6065220832824707, "learning_rate": 1e-05, "loss": 0.4114, "step": 798 }, { "epoch": 4.816326530612245, "grad_norm": 0.6646922826766968, "learning_rate": 1e-05, "loss": 0.3936, "step": 799 }, { "epoch": 4.822373393801965, "grad_norm": 0.5684565901756287, "learning_rate": 1e-05, "loss": 0.4034, "step": 800 }, { "epoch": 4.828420256991685, "grad_norm": 0.5750718712806702, "learning_rate": 1e-05, "loss": 0.3987, "step": 801 }, { "epoch": 4.834467120181406, "grad_norm": 0.5854185223579407, "learning_rate": 1e-05, "loss": 0.4265, "step": 802 }, { "epoch": 4.840513983371126, "grad_norm": 0.5679122805595398, "learning_rate": 1e-05, "loss": 0.4112, "step": 803 }, { "epoch": 4.8465608465608465, "grad_norm": 0.6520095467567444, "learning_rate": 1e-05, "loss": 0.411, "step": 804 }, { "epoch": 4.852607709750567, "grad_norm": 0.6008126139640808, "learning_rate": 1e-05, "loss": 0.3972, "step": 805 }, { "epoch": 4.858654572940287, "grad_norm": 0.5682318210601807, "learning_rate": 1e-05, "loss": 0.3942, "step": 806 }, { "epoch": 4.864701436130008, "grad_norm": 0.5535310506820679, "learning_rate": 1e-05, "loss": 0.4134, "step": 807 }, { "epoch": 4.870748299319728, "grad_norm": 0.6449189782142639, "learning_rate": 1e-05, "loss": 0.4269, "step": 808 }, { "epoch": 4.876795162509448, "grad_norm": 0.6849431991577148, "learning_rate": 1e-05, "loss": 0.3448, "step": 809 }, { "epoch": 4.882842025699168, "grad_norm": 0.6312978267669678, "learning_rate": 1e-05, "loss": 0.4205, "step": 810 }, { "epoch": 4.888888888888889, "grad_norm": 0.6130386590957642, "learning_rate": 1e-05, "loss": 0.4036, "step": 811 }, { "epoch": 4.894935752078609, "grad_norm": 0.5687529444694519, "learning_rate": 1e-05, "loss": 0.4096, "step": 812 }, { "epoch": 4.90098261526833, "grad_norm": 0.7488480806350708, "learning_rate": 1e-05, "loss": 0.4183, "step": 813 }, { "epoch": 4.90702947845805, "grad_norm": 0.5877987742424011, "learning_rate": 1e-05, "loss": 0.4033, "step": 814 }, { "epoch": 4.91307634164777, "grad_norm": 0.6138003468513489, "learning_rate": 1e-05, "loss": 0.3753, "step": 815 }, { "epoch": 4.919123204837491, "grad_norm": 0.5748922824859619, "learning_rate": 1e-05, "loss": 0.4131, "step": 816 }, { "epoch": 4.925170068027211, "grad_norm": 0.6724855899810791, "learning_rate": 1e-05, "loss": 0.4218, "step": 817 }, { "epoch": 4.931216931216931, "grad_norm": 0.6609808206558228, "learning_rate": 1e-05, "loss": 0.3956, "step": 818 }, { "epoch": 4.937263794406651, "grad_norm": 0.5820748209953308, "learning_rate": 1e-05, "loss": 0.4221, "step": 819 }, { "epoch": 4.943310657596372, "grad_norm": 0.6472249627113342, "learning_rate": 1e-05, "loss": 0.445, "step": 820 }, { "epoch": 4.9493575207860925, "grad_norm": 0.6081069111824036, "learning_rate": 1e-05, "loss": 0.4416, "step": 821 }, { "epoch": 4.955404383975813, "grad_norm": 0.6290565729141235, "learning_rate": 1e-05, "loss": 0.4036, "step": 822 }, { "epoch": 4.961451247165533, "grad_norm": 0.7588478326797485, "learning_rate": 1e-05, "loss": 0.4216, "step": 823 }, { "epoch": 4.967498110355253, "grad_norm": 0.6369971036911011, "learning_rate": 1e-05, "loss": 0.41, "step": 824 }, { "epoch": 4.973544973544973, "grad_norm": 0.6309062838554382, "learning_rate": 1e-05, "loss": 0.4169, "step": 825 }, { "epoch": 4.979591836734694, "grad_norm": 0.6134672164916992, "learning_rate": 1e-05, "loss": 0.4463, "step": 826 }, { "epoch": 4.985638699924414, "grad_norm": 0.6943141222000122, "learning_rate": 1e-05, "loss": 0.4177, "step": 827 }, { "epoch": 4.991685563114134, "grad_norm": 0.6027072072029114, "learning_rate": 1e-05, "loss": 0.3984, "step": 828 }, { "epoch": 4.9977324263038545, "grad_norm": 0.6190260052680969, "learning_rate": 1e-05, "loss": 0.3847, "step": 829 }, { "epoch": 5.0, "grad_norm": 0.6190260052680969, "learning_rate": 1e-05, "loss": 0.1478, "step": 830 }, { "epoch": 5.00604686318972, "grad_norm": 0.5644193291664124, "learning_rate": 1e-05, "loss": 0.3883, "step": 831 }, { "epoch": 5.01209372637944, "grad_norm": 0.6146500706672668, "learning_rate": 1e-05, "loss": 0.3931, "step": 832 }, { "epoch": 5.018140589569161, "grad_norm": 0.573191225528717, "learning_rate": 1e-05, "loss": 0.4119, "step": 833 }, { "epoch": 5.0241874527588815, "grad_norm": 0.6214166283607483, "learning_rate": 1e-05, "loss": 0.4029, "step": 834 }, { "epoch": 5.030234315948602, "grad_norm": 0.637066125869751, "learning_rate": 1e-05, "loss": 0.3842, "step": 835 }, { "epoch": 5.036281179138322, "grad_norm": 0.5641190409660339, "learning_rate": 1e-05, "loss": 0.4011, "step": 836 }, { "epoch": 5.042328042328043, "grad_norm": 0.5529543161392212, "learning_rate": 1e-05, "loss": 0.4039, "step": 837 }, { "epoch": 5.048374905517763, "grad_norm": 0.5620729923248291, "learning_rate": 1e-05, "loss": 0.4125, "step": 838 }, { "epoch": 5.054421768707483, "grad_norm": 0.5791226625442505, "learning_rate": 1e-05, "loss": 0.3799, "step": 839 }, { "epoch": 5.060468631897203, "grad_norm": 0.6750614047050476, "learning_rate": 1e-05, "loss": 0.403, "step": 840 }, { "epoch": 5.066515495086923, "grad_norm": 0.6042314767837524, "learning_rate": 1e-05, "loss": 0.3858, "step": 841 }, { "epoch": 5.072562358276644, "grad_norm": 0.6166208386421204, "learning_rate": 1e-05, "loss": 0.3801, "step": 842 }, { "epoch": 5.0786092214663645, "grad_norm": 0.5962995290756226, "learning_rate": 1e-05, "loss": 0.4024, "step": 843 }, { "epoch": 5.084656084656085, "grad_norm": 0.6499397158622742, "learning_rate": 1e-05, "loss": 0.4208, "step": 844 }, { "epoch": 5.090702947845805, "grad_norm": 0.6491777896881104, "learning_rate": 1e-05, "loss": 0.3942, "step": 845 }, { "epoch": 5.096749811035525, "grad_norm": 0.5822063684463501, "learning_rate": 1e-05, "loss": 0.3976, "step": 846 }, { "epoch": 5.102796674225246, "grad_norm": 0.5677438974380493, "learning_rate": 1e-05, "loss": 0.3963, "step": 847 }, { "epoch": 5.108843537414966, "grad_norm": 0.5805613398551941, "learning_rate": 1e-05, "loss": 0.3568, "step": 848 }, { "epoch": 5.114890400604686, "grad_norm": 0.5924104452133179, "learning_rate": 1e-05, "loss": 0.4089, "step": 849 }, { "epoch": 5.120937263794406, "grad_norm": 0.6130611896514893, "learning_rate": 1e-05, "loss": 0.4017, "step": 850 }, { "epoch": 5.1269841269841265, "grad_norm": 0.6010926365852356, "learning_rate": 1e-05, "loss": 0.3855, "step": 851 }, { "epoch": 5.133030990173848, "grad_norm": 0.5658302307128906, "learning_rate": 1e-05, "loss": 0.3844, "step": 852 }, { "epoch": 5.139077853363568, "grad_norm": 0.5884436368942261, "learning_rate": 1e-05, "loss": 0.3799, "step": 853 }, { "epoch": 5.145124716553288, "grad_norm": 0.5914259552955627, "learning_rate": 1e-05, "loss": 0.4207, "step": 854 }, { "epoch": 5.151171579743008, "grad_norm": 0.6282844543457031, "learning_rate": 1e-05, "loss": 0.3956, "step": 855 }, { "epoch": 5.157218442932729, "grad_norm": 0.7018958926200867, "learning_rate": 1e-05, "loss": 0.3932, "step": 856 }, { "epoch": 5.163265306122449, "grad_norm": 0.6689984202384949, "learning_rate": 1e-05, "loss": 0.3634, "step": 857 }, { "epoch": 5.169312169312169, "grad_norm": 0.5609104037284851, "learning_rate": 1e-05, "loss": 0.3916, "step": 858 }, { "epoch": 5.1753590325018894, "grad_norm": 0.773385226726532, "learning_rate": 1e-05, "loss": 0.3805, "step": 859 }, { "epoch": 5.18140589569161, "grad_norm": 0.5585230588912964, "learning_rate": 1e-05, "loss": 0.3626, "step": 860 }, { "epoch": 5.187452758881331, "grad_norm": 0.6529350876808167, "learning_rate": 1e-05, "loss": 0.3778, "step": 861 }, { "epoch": 5.193499622071051, "grad_norm": 0.5960966944694519, "learning_rate": 1e-05, "loss": 0.3811, "step": 862 }, { "epoch": 5.199546485260771, "grad_norm": 0.6667266488075256, "learning_rate": 1e-05, "loss": 0.3873, "step": 863 }, { "epoch": 5.205593348450491, "grad_norm": 0.606666624546051, "learning_rate": 1e-05, "loss": 0.3996, "step": 864 }, { "epoch": 5.211640211640212, "grad_norm": 0.5707694292068481, "learning_rate": 1e-05, "loss": 0.3701, "step": 865 }, { "epoch": 5.217687074829932, "grad_norm": 0.5388896465301514, "learning_rate": 1e-05, "loss": 0.4094, "step": 866 }, { "epoch": 5.223733938019652, "grad_norm": 0.6056711673736572, "learning_rate": 1e-05, "loss": 0.4065, "step": 867 }, { "epoch": 5.2297808012093725, "grad_norm": 0.5956007838249207, "learning_rate": 1e-05, "loss": 0.3947, "step": 868 }, { "epoch": 5.235827664399093, "grad_norm": 0.6871543526649475, "learning_rate": 1e-05, "loss": 0.4237, "step": 869 }, { "epoch": 5.241874527588814, "grad_norm": 0.5842042565345764, "learning_rate": 1e-05, "loss": 0.4178, "step": 870 }, { "epoch": 5.247921390778534, "grad_norm": 0.720669686794281, "learning_rate": 1e-05, "loss": 0.395, "step": 871 }, { "epoch": 5.253968253968254, "grad_norm": 0.5799580812454224, "learning_rate": 1e-05, "loss": 0.3793, "step": 872 }, { "epoch": 5.260015117157974, "grad_norm": 0.5723053216934204, "learning_rate": 1e-05, "loss": 0.3958, "step": 873 }, { "epoch": 5.266061980347694, "grad_norm": 0.5999190211296082, "learning_rate": 1e-05, "loss": 0.4031, "step": 874 }, { "epoch": 5.272108843537415, "grad_norm": 0.610127329826355, "learning_rate": 1e-05, "loss": 0.3741, "step": 875 }, { "epoch": 5.278155706727135, "grad_norm": 0.5861444473266602, "learning_rate": 1e-05, "loss": 0.3977, "step": 876 }, { "epoch": 5.2842025699168556, "grad_norm": 0.6783025860786438, "learning_rate": 1e-05, "loss": 0.4223, "step": 877 }, { "epoch": 5.290249433106576, "grad_norm": 0.5829923748970032, "learning_rate": 1e-05, "loss": 0.4065, "step": 878 }, { "epoch": 5.296296296296296, "grad_norm": 0.5837451219558716, "learning_rate": 1e-05, "loss": 0.4271, "step": 879 }, { "epoch": 5.302343159486017, "grad_norm": 0.6096411347389221, "learning_rate": 1e-05, "loss": 0.4269, "step": 880 }, { "epoch": 5.308390022675737, "grad_norm": 0.5435876250267029, "learning_rate": 1e-05, "loss": 0.4035, "step": 881 }, { "epoch": 5.314436885865457, "grad_norm": 0.6891360878944397, "learning_rate": 1e-05, "loss": 0.393, "step": 882 }, { "epoch": 5.320483749055177, "grad_norm": 0.6976891756057739, "learning_rate": 1e-05, "loss": 0.3688, "step": 883 }, { "epoch": 5.326530612244898, "grad_norm": 0.5449498891830444, "learning_rate": 1e-05, "loss": 0.3924, "step": 884 }, { "epoch": 5.3325774754346185, "grad_norm": 0.5757130980491638, "learning_rate": 1e-05, "loss": 0.4167, "step": 885 }, { "epoch": 5.338624338624339, "grad_norm": 0.7904279828071594, "learning_rate": 1e-05, "loss": 0.3869, "step": 886 }, { "epoch": 5.344671201814059, "grad_norm": 0.5850111246109009, "learning_rate": 1e-05, "loss": 0.3957, "step": 887 }, { "epoch": 5.350718065003779, "grad_norm": 0.5646911859512329, "learning_rate": 1e-05, "loss": 0.403, "step": 888 }, { "epoch": 5.3567649281935, "grad_norm": 0.6256917715072632, "learning_rate": 1e-05, "loss": 0.4394, "step": 889 }, { "epoch": 5.36281179138322, "grad_norm": 0.7096668481826782, "learning_rate": 1e-05, "loss": 0.4493, "step": 890 }, { "epoch": 5.36885865457294, "grad_norm": 0.7172689437866211, "learning_rate": 1e-05, "loss": 0.3457, "step": 891 }, { "epoch": 5.37490551776266, "grad_norm": 0.5623851418495178, "learning_rate": 1e-05, "loss": 0.4091, "step": 892 }, { "epoch": 5.380952380952381, "grad_norm": 0.5638351440429688, "learning_rate": 1e-05, "loss": 0.4308, "step": 893 }, { "epoch": 5.3869992441421015, "grad_norm": 0.6328375339508057, "learning_rate": 1e-05, "loss": 0.3943, "step": 894 }, { "epoch": 5.393046107331822, "grad_norm": 0.6492130756378174, "learning_rate": 1e-05, "loss": 0.3814, "step": 895 }, { "epoch": 5.399092970521542, "grad_norm": 0.6095450520515442, "learning_rate": 1e-05, "loss": 0.3669, "step": 896 }, { "epoch": 5.405139833711262, "grad_norm": 0.5913347601890564, "learning_rate": 1e-05, "loss": 0.3854, "step": 897 }, { "epoch": 5.411186696900983, "grad_norm": 0.6443659067153931, "learning_rate": 1e-05, "loss": 0.4079, "step": 898 }, { "epoch": 5.417233560090703, "grad_norm": 0.624110758304596, "learning_rate": 1e-05, "loss": 0.3459, "step": 899 }, { "epoch": 5.423280423280423, "grad_norm": 0.5518099069595337, "learning_rate": 1e-05, "loss": 0.4175, "step": 900 }, { "epoch": 5.429327286470143, "grad_norm": 0.7107487320899963, "learning_rate": 1e-05, "loss": 0.3901, "step": 901 }, { "epoch": 5.4353741496598635, "grad_norm": 0.6260417103767395, "learning_rate": 1e-05, "loss": 0.3746, "step": 902 }, { "epoch": 5.441421012849585, "grad_norm": 0.5767214298248291, "learning_rate": 1e-05, "loss": 0.3958, "step": 903 }, { "epoch": 5.447467876039305, "grad_norm": 0.5957703590393066, "learning_rate": 1e-05, "loss": 0.4269, "step": 904 }, { "epoch": 5.453514739229025, "grad_norm": 0.5688199996948242, "learning_rate": 1e-05, "loss": 0.3506, "step": 905 }, { "epoch": 5.459561602418745, "grad_norm": 0.5642909407615662, "learning_rate": 1e-05, "loss": 0.4068, "step": 906 }, { "epoch": 5.465608465608465, "grad_norm": 0.63872891664505, "learning_rate": 1e-05, "loss": 0.4199, "step": 907 }, { "epoch": 5.471655328798186, "grad_norm": 0.6136561036109924, "learning_rate": 1e-05, "loss": 0.4044, "step": 908 }, { "epoch": 5.477702191987906, "grad_norm": 0.5827731490135193, "learning_rate": 1e-05, "loss": 0.4123, "step": 909 }, { "epoch": 5.4837490551776265, "grad_norm": 0.5805728435516357, "learning_rate": 1e-05, "loss": 0.4049, "step": 910 }, { "epoch": 5.489795918367347, "grad_norm": 0.5923486948013306, "learning_rate": 1e-05, "loss": 0.408, "step": 911 }, { "epoch": 5.495842781557068, "grad_norm": 0.6073300838470459, "learning_rate": 1e-05, "loss": 0.394, "step": 912 }, { "epoch": 5.501889644746788, "grad_norm": 0.5872564315795898, "learning_rate": 1e-05, "loss": 0.417, "step": 913 }, { "epoch": 5.507936507936508, "grad_norm": 0.5996981859207153, "learning_rate": 1e-05, "loss": 0.4037, "step": 914 }, { "epoch": 5.513983371126228, "grad_norm": 0.6282681822776794, "learning_rate": 1e-05, "loss": 0.3889, "step": 915 }, { "epoch": 5.520030234315948, "grad_norm": 0.6104469895362854, "learning_rate": 1e-05, "loss": 0.3771, "step": 916 }, { "epoch": 5.526077097505669, "grad_norm": 0.5976306200027466, "learning_rate": 1e-05, "loss": 0.3948, "step": 917 }, { "epoch": 5.532123960695389, "grad_norm": 0.6166002154350281, "learning_rate": 1e-05, "loss": 0.4081, "step": 918 }, { "epoch": 5.5381708238851095, "grad_norm": 0.6089850664138794, "learning_rate": 1e-05, "loss": 0.3705, "step": 919 }, { "epoch": 5.54421768707483, "grad_norm": 0.5671292543411255, "learning_rate": 1e-05, "loss": 0.3995, "step": 920 }, { "epoch": 5.550264550264551, "grad_norm": 0.564931333065033, "learning_rate": 1e-05, "loss": 0.4052, "step": 921 }, { "epoch": 5.556311413454271, "grad_norm": 0.5451776385307312, "learning_rate": 1e-05, "loss": 0.3877, "step": 922 }, { "epoch": 5.562358276643991, "grad_norm": 0.5680835843086243, "learning_rate": 1e-05, "loss": 0.41, "step": 923 }, { "epoch": 5.568405139833711, "grad_norm": 0.6245042085647583, "learning_rate": 1e-05, "loss": 0.4112, "step": 924 }, { "epoch": 5.574452003023431, "grad_norm": 0.572341799736023, "learning_rate": 1e-05, "loss": 0.385, "step": 925 }, { "epoch": 5.580498866213152, "grad_norm": 0.5925994515419006, "learning_rate": 1e-05, "loss": 0.4166, "step": 926 }, { "epoch": 5.586545729402872, "grad_norm": 0.610834538936615, "learning_rate": 1e-05, "loss": 0.378, "step": 927 }, { "epoch": 5.592592592592593, "grad_norm": 0.6370876431465149, "learning_rate": 1e-05, "loss": 0.3933, "step": 928 }, { "epoch": 5.598639455782313, "grad_norm": 0.5732808709144592, "learning_rate": 1e-05, "loss": 0.4175, "step": 929 }, { "epoch": 5.604686318972034, "grad_norm": 0.5638501048088074, "learning_rate": 1e-05, "loss": 0.3749, "step": 930 }, { "epoch": 5.610733182161754, "grad_norm": 0.5768354535102844, "learning_rate": 1e-05, "loss": 0.3989, "step": 931 }, { "epoch": 5.616780045351474, "grad_norm": 0.6740099787712097, "learning_rate": 1e-05, "loss": 0.3968, "step": 932 }, { "epoch": 5.622826908541194, "grad_norm": 0.5936634540557861, "learning_rate": 1e-05, "loss": 0.4047, "step": 933 }, { "epoch": 5.628873771730914, "grad_norm": 0.5882788300514221, "learning_rate": 1e-05, "loss": 0.4128, "step": 934 }, { "epoch": 5.634920634920634, "grad_norm": 0.621610701084137, "learning_rate": 1e-05, "loss": 0.4241, "step": 935 }, { "epoch": 5.6409674981103555, "grad_norm": 0.6270392537117004, "learning_rate": 1e-05, "loss": 0.4124, "step": 936 }, { "epoch": 5.647014361300076, "grad_norm": 0.5942815542221069, "learning_rate": 1e-05, "loss": 0.4079, "step": 937 }, { "epoch": 5.653061224489796, "grad_norm": 0.6260045170783997, "learning_rate": 1e-05, "loss": 0.4332, "step": 938 }, { "epoch": 5.659108087679516, "grad_norm": 0.61594557762146, "learning_rate": 1e-05, "loss": 0.4223, "step": 939 }, { "epoch": 5.665154950869237, "grad_norm": 0.5981598496437073, "learning_rate": 1e-05, "loss": 0.385, "step": 940 }, { "epoch": 5.671201814058957, "grad_norm": 0.6147893667221069, "learning_rate": 1e-05, "loss": 0.4107, "step": 941 }, { "epoch": 5.677248677248677, "grad_norm": 0.559853732585907, "learning_rate": 1e-05, "loss": 0.3838, "step": 942 }, { "epoch": 5.683295540438397, "grad_norm": 0.6140523552894592, "learning_rate": 1e-05, "loss": 0.3735, "step": 943 }, { "epoch": 5.6893424036281175, "grad_norm": 0.5514016151428223, "learning_rate": 1e-05, "loss": 0.4096, "step": 944 }, { "epoch": 5.6953892668178385, "grad_norm": 0.6033036112785339, "learning_rate": 1e-05, "loss": 0.3979, "step": 945 }, { "epoch": 5.701436130007559, "grad_norm": 0.58734530210495, "learning_rate": 1e-05, "loss": 0.375, "step": 946 }, { "epoch": 5.707482993197279, "grad_norm": 0.60529625415802, "learning_rate": 1e-05, "loss": 0.397, "step": 947 }, { "epoch": 5.713529856386999, "grad_norm": 0.7469598650932312, "learning_rate": 1e-05, "loss": 0.3889, "step": 948 }, { "epoch": 5.71957671957672, "grad_norm": 0.6164667010307312, "learning_rate": 1e-05, "loss": 0.3839, "step": 949 }, { "epoch": 5.72562358276644, "grad_norm": 0.7155433297157288, "learning_rate": 1e-05, "loss": 0.4118, "step": 950 }, { "epoch": 5.73167044595616, "grad_norm": 0.5666788816452026, "learning_rate": 1e-05, "loss": 0.3938, "step": 951 }, { "epoch": 5.73771730914588, "grad_norm": 0.5971105694770813, "learning_rate": 1e-05, "loss": 0.3936, "step": 952 }, { "epoch": 5.7437641723356005, "grad_norm": 0.6156784296035767, "learning_rate": 1e-05, "loss": 0.3914, "step": 953 }, { "epoch": 5.749811035525322, "grad_norm": 0.5921567678451538, "learning_rate": 1e-05, "loss": 0.3638, "step": 954 }, { "epoch": 5.755857898715042, "grad_norm": 0.5251359939575195, "learning_rate": 1e-05, "loss": 0.3943, "step": 955 }, { "epoch": 5.761904761904762, "grad_norm": 0.7488421201705933, "learning_rate": 1e-05, "loss": 0.4003, "step": 956 }, { "epoch": 5.767951625094482, "grad_norm": 0.731593668460846, "learning_rate": 1e-05, "loss": 0.4208, "step": 957 }, { "epoch": 5.773998488284203, "grad_norm": 0.7554077506065369, "learning_rate": 1e-05, "loss": 0.4212, "step": 958 }, { "epoch": 5.780045351473923, "grad_norm": 0.5795413255691528, "learning_rate": 1e-05, "loss": 0.3742, "step": 959 }, { "epoch": 5.786092214663643, "grad_norm": 0.5424965620040894, "learning_rate": 1e-05, "loss": 0.3962, "step": 960 }, { "epoch": 5.7921390778533635, "grad_norm": 0.5732012391090393, "learning_rate": 1e-05, "loss": 0.4199, "step": 961 }, { "epoch": 5.798185941043084, "grad_norm": 0.6036521196365356, "learning_rate": 1e-05, "loss": 0.3948, "step": 962 }, { "epoch": 5.804232804232804, "grad_norm": 0.6323441863059998, "learning_rate": 1e-05, "loss": 0.3859, "step": 963 }, { "epoch": 5.810279667422525, "grad_norm": 0.6568187475204468, "learning_rate": 1e-05, "loss": 0.3786, "step": 964 }, { "epoch": 5.816326530612245, "grad_norm": 0.5969505906105042, "learning_rate": 1e-05, "loss": 0.4168, "step": 965 }, { "epoch": 5.822373393801965, "grad_norm": 0.59014892578125, "learning_rate": 1e-05, "loss": 0.3786, "step": 966 }, { "epoch": 5.828420256991685, "grad_norm": 0.5850319862365723, "learning_rate": 1e-05, "loss": 0.4142, "step": 967 }, { "epoch": 5.834467120181406, "grad_norm": 0.6419547200202942, "learning_rate": 1e-05, "loss": 0.4044, "step": 968 }, { "epoch": 5.840513983371126, "grad_norm": 0.6006651520729065, "learning_rate": 1e-05, "loss": 0.4095, "step": 969 }, { "epoch": 5.8465608465608465, "grad_norm": 0.6315831542015076, "learning_rate": 1e-05, "loss": 0.4233, "step": 970 }, { "epoch": 5.852607709750567, "grad_norm": 0.6401822566986084, "learning_rate": 1e-05, "loss": 0.4129, "step": 971 }, { "epoch": 5.858654572940287, "grad_norm": 0.6018669605255127, "learning_rate": 1e-05, "loss": 0.4324, "step": 972 }, { "epoch": 5.864701436130008, "grad_norm": 0.7159103155136108, "learning_rate": 1e-05, "loss": 0.4018, "step": 973 }, { "epoch": 5.870748299319728, "grad_norm": 0.6411283612251282, "learning_rate": 1e-05, "loss": 0.3873, "step": 974 }, { "epoch": 5.876795162509448, "grad_norm": 0.6044118404388428, "learning_rate": 1e-05, "loss": 0.3752, "step": 975 }, { "epoch": 5.882842025699168, "grad_norm": 0.5958818197250366, "learning_rate": 1e-05, "loss": 0.395, "step": 976 }, { "epoch": 5.888888888888889, "grad_norm": 0.5821768641471863, "learning_rate": 1e-05, "loss": 0.3708, "step": 977 }, { "epoch": 5.894935752078609, "grad_norm": 0.5968974232673645, "learning_rate": 1e-05, "loss": 0.3981, "step": 978 }, { "epoch": 5.90098261526833, "grad_norm": 0.6307818293571472, "learning_rate": 1e-05, "loss": 0.3961, "step": 979 }, { "epoch": 5.90702947845805, "grad_norm": 0.5695503354072571, "learning_rate": 1e-05, "loss": 0.4096, "step": 980 }, { "epoch": 5.91307634164777, "grad_norm": 0.5654571652412415, "learning_rate": 1e-05, "loss": 0.4058, "step": 981 }, { "epoch": 5.919123204837491, "grad_norm": 0.6613664627075195, "learning_rate": 1e-05, "loss": 0.3939, "step": 982 }, { "epoch": 5.925170068027211, "grad_norm": 0.6234195232391357, "learning_rate": 1e-05, "loss": 0.3856, "step": 983 }, { "epoch": 5.931216931216931, "grad_norm": 0.6724370718002319, "learning_rate": 1e-05, "loss": 0.4009, "step": 984 }, { "epoch": 5.937263794406651, "grad_norm": 0.5534976124763489, "learning_rate": 1e-05, "loss": 0.3864, "step": 985 }, { "epoch": 5.943310657596372, "grad_norm": 0.5684236288070679, "learning_rate": 1e-05, "loss": 0.4045, "step": 986 }, { "epoch": 5.9493575207860925, "grad_norm": 0.5994831919670105, "learning_rate": 1e-05, "loss": 0.3794, "step": 987 }, { "epoch": 5.955404383975813, "grad_norm": 0.6198112368583679, "learning_rate": 1e-05, "loss": 0.4043, "step": 988 }, { "epoch": 5.961451247165533, "grad_norm": 0.7067509889602661, "learning_rate": 1e-05, "loss": 0.4246, "step": 989 }, { "epoch": 5.967498110355253, "grad_norm": 0.5762273669242859, "learning_rate": 1e-05, "loss": 0.4125, "step": 990 }, { "epoch": 5.973544973544973, "grad_norm": 0.5969759225845337, "learning_rate": 1e-05, "loss": 0.3983, "step": 991 }, { "epoch": 5.979591836734694, "grad_norm": 0.6566991806030273, "learning_rate": 1e-05, "loss": 0.3972, "step": 992 }, { "epoch": 5.985638699924414, "grad_norm": 0.5882044434547424, "learning_rate": 1e-05, "loss": 0.3983, "step": 993 }, { "epoch": 5.991685563114134, "grad_norm": 0.5768678784370422, "learning_rate": 1e-05, "loss": 0.3931, "step": 994 }, { "epoch": 5.9977324263038545, "grad_norm": 0.6125848293304443, "learning_rate": 1e-05, "loss": 0.4236, "step": 995 }, { "epoch": 6.0, "grad_norm": 0.6704882383346558, "learning_rate": 1e-05, "loss": 0.1546, "step": 996 }, { "epoch": 6.00604686318972, "grad_norm": 0.626146674156189, "learning_rate": 1e-05, "loss": 0.39, "step": 997 }, { "epoch": 6.01209372637944, "grad_norm": 0.5594955086708069, "learning_rate": 1e-05, "loss": 0.382, "step": 998 }, { "epoch": 6.018140589569161, "grad_norm": 0.6357175707817078, "learning_rate": 1e-05, "loss": 0.3907, "step": 999 }, { "epoch": 6.0241874527588815, "grad_norm": 0.5786148905754089, "learning_rate": 1e-05, "loss": 0.3663, "step": 1000 }, { "epoch": 6.030234315948602, "grad_norm": 0.6609814167022705, "learning_rate": 1e-05, "loss": 0.4152, "step": 1001 }, { "epoch": 6.036281179138322, "grad_norm": 0.5704321265220642, "learning_rate": 1e-05, "loss": 0.3884, "step": 1002 }, { "epoch": 6.042328042328043, "grad_norm": 0.5733149647712708, "learning_rate": 1e-05, "loss": 0.3905, "step": 1003 }, { "epoch": 6.048374905517763, "grad_norm": 0.5637567639350891, "learning_rate": 1e-05, "loss": 0.3929, "step": 1004 }, { "epoch": 6.054421768707483, "grad_norm": 0.5884572267532349, "learning_rate": 1e-05, "loss": 0.3972, "step": 1005 }, { "epoch": 6.060468631897203, "grad_norm": 0.5700960755348206, "learning_rate": 1e-05, "loss": 0.3914, "step": 1006 }, { "epoch": 6.066515495086923, "grad_norm": 0.6400357484817505, "learning_rate": 1e-05, "loss": 0.4231, "step": 1007 }, { "epoch": 6.072562358276644, "grad_norm": 0.6089173555374146, "learning_rate": 1e-05, "loss": 0.4007, "step": 1008 }, { "epoch": 6.0786092214663645, "grad_norm": 0.6267716288566589, "learning_rate": 1e-05, "loss": 0.4092, "step": 1009 }, { "epoch": 6.084656084656085, "grad_norm": 0.5675733685493469, "learning_rate": 1e-05, "loss": 0.3954, "step": 1010 }, { "epoch": 6.090702947845805, "grad_norm": 0.6339293718338013, "learning_rate": 1e-05, "loss": 0.3967, "step": 1011 }, { "epoch": 6.096749811035525, "grad_norm": 0.6186630129814148, "learning_rate": 1e-05, "loss": 0.3904, "step": 1012 }, { "epoch": 6.102796674225246, "grad_norm": 0.6005612015724182, "learning_rate": 1e-05, "loss": 0.3717, "step": 1013 }, { "epoch": 6.108843537414966, "grad_norm": 0.6890124082565308, "learning_rate": 1e-05, "loss": 0.4033, "step": 1014 }, { "epoch": 6.114890400604686, "grad_norm": 0.5295209288597107, "learning_rate": 1e-05, "loss": 0.4035, "step": 1015 }, { "epoch": 6.120937263794406, "grad_norm": 0.597955048084259, "learning_rate": 1e-05, "loss": 0.3713, "step": 1016 }, { "epoch": 6.1269841269841265, "grad_norm": 0.560454785823822, "learning_rate": 1e-05, "loss": 0.3649, "step": 1017 }, { "epoch": 6.133030990173848, "grad_norm": 0.63104647397995, "learning_rate": 1e-05, "loss": 0.4013, "step": 1018 }, { "epoch": 6.139077853363568, "grad_norm": 0.6220104694366455, "learning_rate": 1e-05, "loss": 0.3839, "step": 1019 }, { "epoch": 6.145124716553288, "grad_norm": 0.6061534285545349, "learning_rate": 1e-05, "loss": 0.3957, "step": 1020 }, { "epoch": 6.151171579743008, "grad_norm": 0.572428822517395, "learning_rate": 1e-05, "loss": 0.3815, "step": 1021 }, { "epoch": 6.157218442932729, "grad_norm": 0.582599937915802, "learning_rate": 1e-05, "loss": 0.401, "step": 1022 }, { "epoch": 6.163265306122449, "grad_norm": 0.6033139824867249, "learning_rate": 1e-05, "loss": 0.3802, "step": 1023 }, { "epoch": 6.169312169312169, "grad_norm": 0.5503534078598022, "learning_rate": 1e-05, "loss": 0.3921, "step": 1024 }, { "epoch": 6.1753590325018894, "grad_norm": 0.5670724511146545, "learning_rate": 1e-05, "loss": 0.4048, "step": 1025 }, { "epoch": 6.18140589569161, "grad_norm": 0.5847631096839905, "learning_rate": 1e-05, "loss": 0.3703, "step": 1026 }, { "epoch": 6.187452758881331, "grad_norm": 0.5787602663040161, "learning_rate": 1e-05, "loss": 0.3689, "step": 1027 }, { "epoch": 6.193499622071051, "grad_norm": 0.5777236223220825, "learning_rate": 1e-05, "loss": 0.3975, "step": 1028 }, { "epoch": 6.199546485260771, "grad_norm": 0.6359131336212158, "learning_rate": 1e-05, "loss": 0.4379, "step": 1029 }, { "epoch": 6.205593348450491, "grad_norm": 0.6322382688522339, "learning_rate": 1e-05, "loss": 0.3973, "step": 1030 }, { "epoch": 6.211640211640212, "grad_norm": 0.5580959320068359, "learning_rate": 1e-05, "loss": 0.3993, "step": 1031 }, { "epoch": 6.217687074829932, "grad_norm": 0.6174151301383972, "learning_rate": 1e-05, "loss": 0.3827, "step": 1032 }, { "epoch": 6.223733938019652, "grad_norm": 0.6057486534118652, "learning_rate": 1e-05, "loss": 0.4113, "step": 1033 }, { "epoch": 6.2297808012093725, "grad_norm": 0.5489740371704102, "learning_rate": 1e-05, "loss": 0.3838, "step": 1034 }, { "epoch": 6.235827664399093, "grad_norm": 0.625714123249054, "learning_rate": 1e-05, "loss": 0.3957, "step": 1035 }, { "epoch": 6.241874527588814, "grad_norm": 0.632999837398529, "learning_rate": 1e-05, "loss": 0.3899, "step": 1036 }, { "epoch": 6.247921390778534, "grad_norm": 0.6939270496368408, "learning_rate": 1e-05, "loss": 0.3861, "step": 1037 }, { "epoch": 6.253968253968254, "grad_norm": 0.6037151217460632, "learning_rate": 1e-05, "loss": 0.3705, "step": 1038 }, { "epoch": 6.260015117157974, "grad_norm": 0.6195087432861328, "learning_rate": 1e-05, "loss": 0.4448, "step": 1039 }, { "epoch": 6.266061980347694, "grad_norm": 0.5904221534729004, "learning_rate": 1e-05, "loss": 0.3953, "step": 1040 }, { "epoch": 6.272108843537415, "grad_norm": 0.5576536655426025, "learning_rate": 1e-05, "loss": 0.3824, "step": 1041 }, { "epoch": 6.278155706727135, "grad_norm": 0.553329348564148, "learning_rate": 1e-05, "loss": 0.3622, "step": 1042 }, { "epoch": 6.2842025699168556, "grad_norm": 0.5733786821365356, "learning_rate": 1e-05, "loss": 0.39, "step": 1043 }, { "epoch": 6.290249433106576, "grad_norm": 0.5372657179832458, "learning_rate": 1e-05, "loss": 0.4047, "step": 1044 }, { "epoch": 6.296296296296296, "grad_norm": 0.5832306742668152, "learning_rate": 1e-05, "loss": 0.3705, "step": 1045 }, { "epoch": 6.302343159486017, "grad_norm": 0.692181408405304, "learning_rate": 1e-05, "loss": 0.3614, "step": 1046 }, { "epoch": 6.308390022675737, "grad_norm": 0.616969108581543, "learning_rate": 1e-05, "loss": 0.3877, "step": 1047 }, { "epoch": 6.314436885865457, "grad_norm": 0.6414992809295654, "learning_rate": 1e-05, "loss": 0.3922, "step": 1048 }, { "epoch": 6.320483749055177, "grad_norm": 0.5722635388374329, "learning_rate": 1e-05, "loss": 0.36, "step": 1049 }, { "epoch": 6.326530612244898, "grad_norm": 0.598700761795044, "learning_rate": 1e-05, "loss": 0.3802, "step": 1050 }, { "epoch": 6.3325774754346185, "grad_norm": 0.6156331300735474, "learning_rate": 1e-05, "loss": 0.3843, "step": 1051 }, { "epoch": 6.338624338624339, "grad_norm": 0.5832606554031372, "learning_rate": 1e-05, "loss": 0.3806, "step": 1052 }, { "epoch": 6.344671201814059, "grad_norm": 0.6689149141311646, "learning_rate": 1e-05, "loss": 0.369, "step": 1053 }, { "epoch": 6.350718065003779, "grad_norm": 0.5656020641326904, "learning_rate": 1e-05, "loss": 0.3854, "step": 1054 }, { "epoch": 6.3567649281935, "grad_norm": 0.5650486946105957, "learning_rate": 1e-05, "loss": 0.3777, "step": 1055 }, { "epoch": 6.36281179138322, "grad_norm": 0.5749934315681458, "learning_rate": 1e-05, "loss": 0.357, "step": 1056 }, { "epoch": 6.36885865457294, "grad_norm": 0.6060725450515747, "learning_rate": 1e-05, "loss": 0.4092, "step": 1057 }, { "epoch": 6.37490551776266, "grad_norm": 0.6173314452171326, "learning_rate": 1e-05, "loss": 0.3916, "step": 1058 }, { "epoch": 6.380952380952381, "grad_norm": 0.6772918701171875, "learning_rate": 1e-05, "loss": 0.3941, "step": 1059 }, { "epoch": 6.3869992441421015, "grad_norm": 0.6189582943916321, "learning_rate": 1e-05, "loss": 0.3985, "step": 1060 }, { "epoch": 6.393046107331822, "grad_norm": 0.6059213280677795, "learning_rate": 1e-05, "loss": 0.3817, "step": 1061 }, { "epoch": 6.399092970521542, "grad_norm": 0.583820104598999, "learning_rate": 1e-05, "loss": 0.3791, "step": 1062 }, { "epoch": 6.405139833711262, "grad_norm": 0.5963466763496399, "learning_rate": 1e-05, "loss": 0.4106, "step": 1063 }, { "epoch": 6.411186696900983, "grad_norm": 0.5991074442863464, "learning_rate": 1e-05, "loss": 0.3802, "step": 1064 }, { "epoch": 6.417233560090703, "grad_norm": 0.6934362649917603, "learning_rate": 1e-05, "loss": 0.4128, "step": 1065 }, { "epoch": 6.423280423280423, "grad_norm": 0.702696681022644, "learning_rate": 1e-05, "loss": 0.3762, "step": 1066 }, { "epoch": 6.429327286470143, "grad_norm": 0.5832969546318054, "learning_rate": 1e-05, "loss": 0.3953, "step": 1067 }, { "epoch": 6.4353741496598635, "grad_norm": 0.6187334656715393, "learning_rate": 1e-05, "loss": 0.3929, "step": 1068 }, { "epoch": 6.441421012849585, "grad_norm": 0.5777467489242554, "learning_rate": 1e-05, "loss": 0.3941, "step": 1069 }, { "epoch": 6.447467876039305, "grad_norm": 0.5246186256408691, "learning_rate": 1e-05, "loss": 0.3712, "step": 1070 }, { "epoch": 6.453514739229025, "grad_norm": 0.584656298160553, "learning_rate": 1e-05, "loss": 0.3893, "step": 1071 }, { "epoch": 6.459561602418745, "grad_norm": 0.5781977772712708, "learning_rate": 1e-05, "loss": 0.3684, "step": 1072 }, { "epoch": 6.465608465608465, "grad_norm": 0.5531890392303467, "learning_rate": 1e-05, "loss": 0.3917, "step": 1073 }, { "epoch": 6.471655328798186, "grad_norm": 0.5769007802009583, "learning_rate": 1e-05, "loss": 0.3875, "step": 1074 }, { "epoch": 6.477702191987906, "grad_norm": 0.5969364643096924, "learning_rate": 1e-05, "loss": 0.3824, "step": 1075 }, { "epoch": 6.4837490551776265, "grad_norm": 0.5764490365982056, "learning_rate": 1e-05, "loss": 0.3621, "step": 1076 }, { "epoch": 6.489795918367347, "grad_norm": 0.6414207816123962, "learning_rate": 1e-05, "loss": 0.383, "step": 1077 }, { "epoch": 6.495842781557068, "grad_norm": 0.5860772132873535, "learning_rate": 1e-05, "loss": 0.391, "step": 1078 }, { "epoch": 6.501889644746788, "grad_norm": 0.7548459768295288, "learning_rate": 1e-05, "loss": 0.4154, "step": 1079 }, { "epoch": 6.507936507936508, "grad_norm": 0.5952665209770203, "learning_rate": 1e-05, "loss": 0.3982, "step": 1080 }, { "epoch": 6.513983371126228, "grad_norm": 0.6825617551803589, "learning_rate": 1e-05, "loss": 0.3984, "step": 1081 }, { "epoch": 6.520030234315948, "grad_norm": 0.6093886494636536, "learning_rate": 1e-05, "loss": 0.3656, "step": 1082 }, { "epoch": 6.526077097505669, "grad_norm": 0.6860135197639465, "learning_rate": 1e-05, "loss": 0.398, "step": 1083 }, { "epoch": 6.532123960695389, "grad_norm": 0.5913981199264526, "learning_rate": 1e-05, "loss": 0.3999, "step": 1084 }, { "epoch": 6.5381708238851095, "grad_norm": 0.54076087474823, "learning_rate": 1e-05, "loss": 0.3741, "step": 1085 }, { "epoch": 6.54421768707483, "grad_norm": 0.5773366093635559, "learning_rate": 1e-05, "loss": 0.3761, "step": 1086 }, { "epoch": 6.550264550264551, "grad_norm": 0.5920423269271851, "learning_rate": 1e-05, "loss": 0.376, "step": 1087 }, { "epoch": 6.556311413454271, "grad_norm": 0.6666449308395386, "learning_rate": 1e-05, "loss": 0.3616, "step": 1088 }, { "epoch": 6.562358276643991, "grad_norm": 0.7087470889091492, "learning_rate": 1e-05, "loss": 0.3976, "step": 1089 }, { "epoch": 6.568405139833711, "grad_norm": 0.6392657160758972, "learning_rate": 1e-05, "loss": 0.4238, "step": 1090 }, { "epoch": 6.574452003023431, "grad_norm": 0.6267151236534119, "learning_rate": 1e-05, "loss": 0.3803, "step": 1091 }, { "epoch": 6.580498866213152, "grad_norm": 0.5592185854911804, "learning_rate": 1e-05, "loss": 0.3657, "step": 1092 }, { "epoch": 6.586545729402872, "grad_norm": 0.5780152082443237, "learning_rate": 1e-05, "loss": 0.3728, "step": 1093 }, { "epoch": 6.592592592592593, "grad_norm": 0.5514151453971863, "learning_rate": 1e-05, "loss": 0.3846, "step": 1094 }, { "epoch": 6.598639455782313, "grad_norm": 0.6035811305046082, "learning_rate": 1e-05, "loss": 0.3991, "step": 1095 }, { "epoch": 6.604686318972034, "grad_norm": 0.6005744338035583, "learning_rate": 1e-05, "loss": 0.3869, "step": 1096 }, { "epoch": 6.610733182161754, "grad_norm": 0.5692220330238342, "learning_rate": 1e-05, "loss": 0.3838, "step": 1097 }, { "epoch": 6.616780045351474, "grad_norm": 0.7577038407325745, "learning_rate": 1e-05, "loss": 0.4094, "step": 1098 }, { "epoch": 6.622826908541194, "grad_norm": 0.5737401843070984, "learning_rate": 1e-05, "loss": 0.3775, "step": 1099 }, { "epoch": 6.628873771730914, "grad_norm": 0.587811291217804, "learning_rate": 1e-05, "loss": 0.3893, "step": 1100 }, { "epoch": 6.634920634920634, "grad_norm": 0.6433628797531128, "learning_rate": 1e-05, "loss": 0.3956, "step": 1101 }, { "epoch": 6.6409674981103555, "grad_norm": 0.5342737436294556, "learning_rate": 1e-05, "loss": 0.3942, "step": 1102 }, { "epoch": 6.647014361300076, "grad_norm": 0.5537495017051697, "learning_rate": 1e-05, "loss": 0.3973, "step": 1103 }, { "epoch": 6.653061224489796, "grad_norm": 0.6303655505180359, "learning_rate": 1e-05, "loss": 0.407, "step": 1104 }, { "epoch": 6.659108087679516, "grad_norm": 0.6136273145675659, "learning_rate": 1e-05, "loss": 0.3997, "step": 1105 }, { "epoch": 6.665154950869237, "grad_norm": 0.6944162249565125, "learning_rate": 1e-05, "loss": 0.4083, "step": 1106 }, { "epoch": 6.671201814058957, "grad_norm": 0.581649124622345, "learning_rate": 1e-05, "loss": 0.397, "step": 1107 }, { "epoch": 6.677248677248677, "grad_norm": 0.5745250582695007, "learning_rate": 1e-05, "loss": 0.3701, "step": 1108 }, { "epoch": 6.683295540438397, "grad_norm": 0.5984821319580078, "learning_rate": 1e-05, "loss": 0.3897, "step": 1109 }, { "epoch": 6.6893424036281175, "grad_norm": 0.7539675235748291, "learning_rate": 1e-05, "loss": 0.3825, "step": 1110 }, { "epoch": 6.6953892668178385, "grad_norm": 0.5996633768081665, "learning_rate": 1e-05, "loss": 0.4012, "step": 1111 }, { "epoch": 6.701436130007559, "grad_norm": 0.5695548057556152, "learning_rate": 1e-05, "loss": 0.3991, "step": 1112 }, { "epoch": 6.707482993197279, "grad_norm": 0.6170526742935181, "learning_rate": 1e-05, "loss": 0.4011, "step": 1113 }, { "epoch": 6.713529856386999, "grad_norm": 0.5623385310173035, "learning_rate": 1e-05, "loss": 0.3938, "step": 1114 }, { "epoch": 6.71957671957672, "grad_norm": 0.6712462902069092, "learning_rate": 1e-05, "loss": 0.3894, "step": 1115 }, { "epoch": 6.72562358276644, "grad_norm": 0.6125419735908508, "learning_rate": 1e-05, "loss": 0.429, "step": 1116 }, { "epoch": 6.73167044595616, "grad_norm": 0.601152241230011, "learning_rate": 1e-05, "loss": 0.3945, "step": 1117 }, { "epoch": 6.73771730914588, "grad_norm": 0.5922526717185974, "learning_rate": 1e-05, "loss": 0.3821, "step": 1118 }, { "epoch": 6.7437641723356005, "grad_norm": 0.5829781889915466, "learning_rate": 1e-05, "loss": 0.3524, "step": 1119 }, { "epoch": 6.749811035525322, "grad_norm": 0.5353279113769531, "learning_rate": 1e-05, "loss": 0.3802, "step": 1120 }, { "epoch": 6.755857898715042, "grad_norm": 0.6298103928565979, "learning_rate": 1e-05, "loss": 0.3829, "step": 1121 }, { "epoch": 6.761904761904762, "grad_norm": 0.6201866269111633, "learning_rate": 1e-05, "loss": 0.3794, "step": 1122 }, { "epoch": 6.767951625094482, "grad_norm": 0.5725922584533691, "learning_rate": 1e-05, "loss": 0.395, "step": 1123 }, { "epoch": 6.773998488284203, "grad_norm": 0.6192656755447388, "learning_rate": 1e-05, "loss": 0.4021, "step": 1124 }, { "epoch": 6.780045351473923, "grad_norm": 0.6989803910255432, "learning_rate": 1e-05, "loss": 0.4133, "step": 1125 }, { "epoch": 6.786092214663643, "grad_norm": 0.6113109588623047, "learning_rate": 1e-05, "loss": 0.3868, "step": 1126 }, { "epoch": 6.7921390778533635, "grad_norm": 0.5841162204742432, "learning_rate": 1e-05, "loss": 0.3844, "step": 1127 }, { "epoch": 6.798185941043084, "grad_norm": 0.5753156542778015, "learning_rate": 1e-05, "loss": 0.4127, "step": 1128 }, { "epoch": 6.804232804232804, "grad_norm": 0.6350629329681396, "learning_rate": 1e-05, "loss": 0.3946, "step": 1129 }, { "epoch": 6.810279667422525, "grad_norm": 0.57342529296875, "learning_rate": 1e-05, "loss": 0.3933, "step": 1130 }, { "epoch": 6.816326530612245, "grad_norm": 0.6518730521202087, "learning_rate": 1e-05, "loss": 0.3773, "step": 1131 }, { "epoch": 6.822373393801965, "grad_norm": 0.5810918211936951, "learning_rate": 1e-05, "loss": 0.3896, "step": 1132 }, { "epoch": 6.828420256991685, "grad_norm": 0.5996600985527039, "learning_rate": 1e-05, "loss": 0.3766, "step": 1133 }, { "epoch": 6.834467120181406, "grad_norm": 0.5950771570205688, "learning_rate": 1e-05, "loss": 0.3935, "step": 1134 }, { "epoch": 6.840513983371126, "grad_norm": 0.5845481753349304, "learning_rate": 1e-05, "loss": 0.3748, "step": 1135 }, { "epoch": 6.8465608465608465, "grad_norm": 0.5470344424247742, "learning_rate": 1e-05, "loss": 0.376, "step": 1136 }, { "epoch": 6.852607709750567, "grad_norm": 0.6088722348213196, "learning_rate": 1e-05, "loss": 0.3764, "step": 1137 }, { "epoch": 6.858654572940287, "grad_norm": 0.5826227068901062, "learning_rate": 1e-05, "loss": 0.4259, "step": 1138 }, { "epoch": 6.864701436130008, "grad_norm": 0.6227845549583435, "learning_rate": 1e-05, "loss": 0.3998, "step": 1139 }, { "epoch": 6.870748299319728, "grad_norm": 0.5977078676223755, "learning_rate": 1e-05, "loss": 0.3881, "step": 1140 }, { "epoch": 6.876795162509448, "grad_norm": 0.5821226239204407, "learning_rate": 1e-05, "loss": 0.381, "step": 1141 }, { "epoch": 6.882842025699168, "grad_norm": 0.7231923937797546, "learning_rate": 1e-05, "loss": 0.409, "step": 1142 }, { "epoch": 6.888888888888889, "grad_norm": 0.5804541110992432, "learning_rate": 1e-05, "loss": 0.3889, "step": 1143 }, { "epoch": 6.894935752078609, "grad_norm": 0.5993728637695312, "learning_rate": 1e-05, "loss": 0.3804, "step": 1144 }, { "epoch": 6.90098261526833, "grad_norm": 0.6269215941429138, "learning_rate": 1e-05, "loss": 0.3821, "step": 1145 }, { "epoch": 6.90702947845805, "grad_norm": 0.5805836319923401, "learning_rate": 1e-05, "loss": 0.4183, "step": 1146 }, { "epoch": 6.91307634164777, "grad_norm": 0.5604917407035828, "learning_rate": 1e-05, "loss": 0.3833, "step": 1147 }, { "epoch": 6.919123204837491, "grad_norm": 0.6047624945640564, "learning_rate": 1e-05, "loss": 0.3974, "step": 1148 }, { "epoch": 6.925170068027211, "grad_norm": 0.743976354598999, "learning_rate": 1e-05, "loss": 0.3923, "step": 1149 }, { "epoch": 6.931216931216931, "grad_norm": 0.5439193844795227, "learning_rate": 1e-05, "loss": 0.4065, "step": 1150 }, { "epoch": 6.937263794406651, "grad_norm": 0.617384672164917, "learning_rate": 1e-05, "loss": 0.3876, "step": 1151 }, { "epoch": 6.943310657596372, "grad_norm": 0.6834210157394409, "learning_rate": 1e-05, "loss": 0.3818, "step": 1152 }, { "epoch": 6.9493575207860925, "grad_norm": 0.5699087381362915, "learning_rate": 1e-05, "loss": 0.4014, "step": 1153 }, { "epoch": 6.955404383975813, "grad_norm": 0.5869408249855042, "learning_rate": 1e-05, "loss": 0.4098, "step": 1154 }, { "epoch": 6.961451247165533, "grad_norm": 0.6371724009513855, "learning_rate": 1e-05, "loss": 0.4033, "step": 1155 }, { "epoch": 6.961451247165533, "step": 1155, "total_flos": 7.579944923502635e+19, "train_loss": 0.4424313517359944, "train_runtime": 87942.4205, "train_samples_per_second": 1.685, "train_steps_per_second": 0.013 } ], "logging_steps": 1.0, "max_steps": 1155, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.579944923502635e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }