{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9962264150943394, "eval_steps": 500, "global_step": 1986, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015094339622641509, "grad_norm": 42.84351638901055, "learning_rate": 2.5125628140703517e-07, "loss": 11.7003, "step": 1 }, { "epoch": 0.0030188679245283017, "grad_norm": 40.58796594063912, "learning_rate": 5.025125628140703e-07, "loss": 11.756, "step": 2 }, { "epoch": 0.004528301886792453, "grad_norm": 42.144032983881885, "learning_rate": 7.537688442211055e-07, "loss": 11.7907, "step": 3 }, { "epoch": 0.0060377358490566035, "grad_norm": 45.72964350820413, "learning_rate": 1.0050251256281407e-06, "loss": 11.6821, "step": 4 }, { "epoch": 0.007547169811320755, "grad_norm": 40.81047058078221, "learning_rate": 1.256281407035176e-06, "loss": 11.8394, "step": 5 }, { "epoch": 0.009056603773584906, "grad_norm": 42.899640519128475, "learning_rate": 1.507537688442211e-06, "loss": 11.6455, "step": 6 }, { "epoch": 0.010566037735849057, "grad_norm": 40.231481376946206, "learning_rate": 1.7587939698492463e-06, "loss": 11.7821, "step": 7 }, { "epoch": 0.012075471698113207, "grad_norm": 43.18420575417782, "learning_rate": 2.0100502512562813e-06, "loss": 11.51, "step": 8 }, { "epoch": 0.013584905660377358, "grad_norm": 45.40072785894473, "learning_rate": 2.261306532663317e-06, "loss": 11.4937, "step": 9 }, { "epoch": 0.01509433962264151, "grad_norm": 60.478523691312404, "learning_rate": 2.512562814070352e-06, "loss": 10.4386, "step": 10 }, { "epoch": 0.01660377358490566, "grad_norm": 55.94613742853918, "learning_rate": 2.7638190954773874e-06, "loss": 10.2241, "step": 11 }, { "epoch": 0.018113207547169812, "grad_norm": 54.818513224626024, "learning_rate": 3.015075376884422e-06, "loss": 10.6026, "step": 12 }, { "epoch": 0.019622641509433963, "grad_norm": 84.73466989691137, "learning_rate": 3.2663316582914575e-06, "loss": 6.2547, "step": 13 }, { "epoch": 0.021132075471698115, "grad_norm": 76.688984609838, "learning_rate": 3.5175879396984926e-06, "loss": 5.6541, "step": 14 }, { "epoch": 0.022641509433962263, "grad_norm": 76.71708348082493, "learning_rate": 3.7688442211055276e-06, "loss": 5.5968, "step": 15 }, { "epoch": 0.024150943396226414, "grad_norm": 57.53302315459275, "learning_rate": 4.020100502512563e-06, "loss": 4.4551, "step": 16 }, { "epoch": 0.025660377358490565, "grad_norm": 20.381484799022463, "learning_rate": 4.271356783919598e-06, "loss": 2.6246, "step": 17 }, { "epoch": 0.027169811320754716, "grad_norm": 7.4120018391528975, "learning_rate": 4.522613065326634e-06, "loss": 1.9439, "step": 18 }, { "epoch": 0.028679245283018868, "grad_norm": 7.246473146417547, "learning_rate": 4.773869346733668e-06, "loss": 2.0671, "step": 19 }, { "epoch": 0.03018867924528302, "grad_norm": 5.317897222021412, "learning_rate": 5.025125628140704e-06, "loss": 1.8909, "step": 20 }, { "epoch": 0.03169811320754717, "grad_norm": 4.371583867174312, "learning_rate": 5.276381909547739e-06, "loss": 1.706, "step": 21 }, { "epoch": 0.03320754716981132, "grad_norm": 3.7247484208320563, "learning_rate": 5.527638190954775e-06, "loss": 1.6542, "step": 22 }, { "epoch": 0.03471698113207547, "grad_norm": 3.5432475784664277, "learning_rate": 5.778894472361809e-06, "loss": 1.8566, "step": 23 }, { "epoch": 0.036226415094339624, "grad_norm": 2.2793446224909015, "learning_rate": 6.030150753768844e-06, "loss": 1.6225, "step": 24 }, { "epoch": 0.03773584905660377, "grad_norm": 2.976629270506673, "learning_rate": 6.2814070351758795e-06, "loss": 1.8027, "step": 25 }, { "epoch": 0.03924528301886793, "grad_norm": 1.9292710465468252, "learning_rate": 6.532663316582915e-06, "loss": 1.6694, "step": 26 }, { "epoch": 0.040754716981132075, "grad_norm": 1.8235949344084434, "learning_rate": 6.7839195979899505e-06, "loss": 1.3449, "step": 27 }, { "epoch": 0.04226415094339623, "grad_norm": 1.2509325845880814, "learning_rate": 7.035175879396985e-06, "loss": 1.4428, "step": 28 }, { "epoch": 0.04377358490566038, "grad_norm": 1.1446202521586268, "learning_rate": 7.28643216080402e-06, "loss": 1.2865, "step": 29 }, { "epoch": 0.045283018867924525, "grad_norm": 1.036741546377771, "learning_rate": 7.537688442211055e-06, "loss": 1.4755, "step": 30 }, { "epoch": 0.04679245283018868, "grad_norm": 1.0321415589665388, "learning_rate": 7.788944723618092e-06, "loss": 1.5658, "step": 31 }, { "epoch": 0.04830188679245283, "grad_norm": 0.8390420469610276, "learning_rate": 8.040201005025125e-06, "loss": 1.2749, "step": 32 }, { "epoch": 0.04981132075471698, "grad_norm": 0.8773961154600026, "learning_rate": 8.291457286432161e-06, "loss": 1.3381, "step": 33 }, { "epoch": 0.05132075471698113, "grad_norm": 0.688788854923463, "learning_rate": 8.542713567839196e-06, "loss": 1.2844, "step": 34 }, { "epoch": 0.052830188679245285, "grad_norm": 0.697731142261381, "learning_rate": 8.793969849246232e-06, "loss": 1.0224, "step": 35 }, { "epoch": 0.05433962264150943, "grad_norm": 0.7481698939939284, "learning_rate": 9.045226130653267e-06, "loss": 1.3223, "step": 36 }, { "epoch": 0.05584905660377359, "grad_norm": 0.713480143085657, "learning_rate": 9.296482412060301e-06, "loss": 1.3433, "step": 37 }, { "epoch": 0.057358490566037736, "grad_norm": 0.5516147418737927, "learning_rate": 9.547738693467337e-06, "loss": 1.1849, "step": 38 }, { "epoch": 0.05886792452830188, "grad_norm": 0.5489764999546655, "learning_rate": 9.798994974874372e-06, "loss": 1.221, "step": 39 }, { "epoch": 0.06037735849056604, "grad_norm": 0.5976907585489283, "learning_rate": 1.0050251256281408e-05, "loss": 1.1651, "step": 40 }, { "epoch": 0.061886792452830186, "grad_norm": 0.5729987457779835, "learning_rate": 1.0301507537688443e-05, "loss": 1.2038, "step": 41 }, { "epoch": 0.06339622641509433, "grad_norm": 0.849789141484363, "learning_rate": 1.0552763819095479e-05, "loss": 0.975, "step": 42 }, { "epoch": 0.0649056603773585, "grad_norm": 0.5378335024110182, "learning_rate": 1.0804020100502512e-05, "loss": 1.1693, "step": 43 }, { "epoch": 0.06641509433962264, "grad_norm": 0.4605169598599595, "learning_rate": 1.105527638190955e-05, "loss": 1.0198, "step": 44 }, { "epoch": 0.06792452830188679, "grad_norm": 0.4716228872243182, "learning_rate": 1.1306532663316583e-05, "loss": 1.0925, "step": 45 }, { "epoch": 0.06943396226415094, "grad_norm": 0.4669934577396619, "learning_rate": 1.1557788944723619e-05, "loss": 1.1274, "step": 46 }, { "epoch": 0.0709433962264151, "grad_norm": 0.4621587559628666, "learning_rate": 1.1809045226130654e-05, "loss": 1.1123, "step": 47 }, { "epoch": 0.07245283018867925, "grad_norm": 0.7302537885626854, "learning_rate": 1.2060301507537688e-05, "loss": 1.0177, "step": 48 }, { "epoch": 0.0739622641509434, "grad_norm": 0.4731493256116344, "learning_rate": 1.2311557788944725e-05, "loss": 1.0597, "step": 49 }, { "epoch": 0.07547169811320754, "grad_norm": 0.44703159598258635, "learning_rate": 1.2562814070351759e-05, "loss": 1.1184, "step": 50 }, { "epoch": 0.07698113207547169, "grad_norm": 0.37947111567149666, "learning_rate": 1.2814070351758795e-05, "loss": 0.9152, "step": 51 }, { "epoch": 0.07849056603773585, "grad_norm": 0.4138677678629633, "learning_rate": 1.306532663316583e-05, "loss": 0.9698, "step": 52 }, { "epoch": 0.08, "grad_norm": 0.4210863662592338, "learning_rate": 1.3316582914572864e-05, "loss": 1.0264, "step": 53 }, { "epoch": 0.08150943396226415, "grad_norm": 0.4657161680493391, "learning_rate": 1.3567839195979901e-05, "loss": 1.2491, "step": 54 }, { "epoch": 0.0830188679245283, "grad_norm": 0.4322144912028561, "learning_rate": 1.3819095477386935e-05, "loss": 1.0837, "step": 55 }, { "epoch": 0.08452830188679246, "grad_norm": 0.4895899695520348, "learning_rate": 1.407035175879397e-05, "loss": 0.9843, "step": 56 }, { "epoch": 0.0860377358490566, "grad_norm": 0.38329697027276266, "learning_rate": 1.4321608040201007e-05, "loss": 0.9874, "step": 57 }, { "epoch": 0.08754716981132075, "grad_norm": 0.3728506554128249, "learning_rate": 1.457286432160804e-05, "loss": 1.0505, "step": 58 }, { "epoch": 0.0890566037735849, "grad_norm": 0.42582396506199066, "learning_rate": 1.4824120603015077e-05, "loss": 1.0177, "step": 59 }, { "epoch": 0.09056603773584905, "grad_norm": 0.4039105071166847, "learning_rate": 1.507537688442211e-05, "loss": 1.0522, "step": 60 }, { "epoch": 0.09207547169811321, "grad_norm": 0.39048018099504467, "learning_rate": 1.5326633165829146e-05, "loss": 1.1106, "step": 61 }, { "epoch": 0.09358490566037736, "grad_norm": 0.3598639795497603, "learning_rate": 1.5577889447236183e-05, "loss": 0.9756, "step": 62 }, { "epoch": 0.09509433962264151, "grad_norm": 0.33978834353660775, "learning_rate": 1.5829145728643217e-05, "loss": 0.9874, "step": 63 }, { "epoch": 0.09660377358490566, "grad_norm": 0.3571676859401004, "learning_rate": 1.608040201005025e-05, "loss": 0.9647, "step": 64 }, { "epoch": 0.09811320754716982, "grad_norm": 0.3713953220516257, "learning_rate": 1.6331658291457288e-05, "loss": 1.0445, "step": 65 }, { "epoch": 0.09962264150943397, "grad_norm": 0.3708364360600743, "learning_rate": 1.6582914572864322e-05, "loss": 1.0168, "step": 66 }, { "epoch": 0.10113207547169811, "grad_norm": 0.3673356575030503, "learning_rate": 1.683417085427136e-05, "loss": 1.0713, "step": 67 }, { "epoch": 0.10264150943396226, "grad_norm": 0.3817704093554328, "learning_rate": 1.7085427135678393e-05, "loss": 1.1859, "step": 68 }, { "epoch": 0.10415094339622641, "grad_norm": 0.35553437445641883, "learning_rate": 1.7336683417085427e-05, "loss": 0.9106, "step": 69 }, { "epoch": 0.10566037735849057, "grad_norm": 0.3549271646935127, "learning_rate": 1.7587939698492464e-05, "loss": 0.9792, "step": 70 }, { "epoch": 0.10716981132075472, "grad_norm": 0.34909558103973126, "learning_rate": 1.7839195979899497e-05, "loss": 0.9646, "step": 71 }, { "epoch": 0.10867924528301887, "grad_norm": 0.3777705845997939, "learning_rate": 1.8090452261306535e-05, "loss": 1.0047, "step": 72 }, { "epoch": 0.11018867924528301, "grad_norm": 0.34807538051544334, "learning_rate": 1.834170854271357e-05, "loss": 1.0855, "step": 73 }, { "epoch": 0.11169811320754718, "grad_norm": 0.3253777447991679, "learning_rate": 1.8592964824120602e-05, "loss": 0.9121, "step": 74 }, { "epoch": 0.11320754716981132, "grad_norm": 0.3734270413143541, "learning_rate": 1.884422110552764e-05, "loss": 1.0115, "step": 75 }, { "epoch": 0.11471698113207547, "grad_norm": 0.3746609650254723, "learning_rate": 1.9095477386934673e-05, "loss": 1.0999, "step": 76 }, { "epoch": 0.11622641509433962, "grad_norm": 0.3218308743491144, "learning_rate": 1.934673366834171e-05, "loss": 0.9628, "step": 77 }, { "epoch": 0.11773584905660377, "grad_norm": 0.378328515287124, "learning_rate": 1.9597989949748744e-05, "loss": 0.8928, "step": 78 }, { "epoch": 0.11924528301886793, "grad_norm": 0.385332164332375, "learning_rate": 1.984924623115578e-05, "loss": 1.11, "step": 79 }, { "epoch": 0.12075471698113208, "grad_norm": 0.36828879580340973, "learning_rate": 2.0100502512562815e-05, "loss": 0.9687, "step": 80 }, { "epoch": 0.12226415094339622, "grad_norm": 0.37180628983371017, "learning_rate": 2.035175879396985e-05, "loss": 0.9488, "step": 81 }, { "epoch": 0.12377358490566037, "grad_norm": 0.3655463598030299, "learning_rate": 2.0603015075376886e-05, "loss": 0.9635, "step": 82 }, { "epoch": 0.12528301886792453, "grad_norm": 0.3973839411165058, "learning_rate": 2.085427135678392e-05, "loss": 0.9539, "step": 83 }, { "epoch": 0.12679245283018867, "grad_norm": 0.8889038300232109, "learning_rate": 2.1105527638190957e-05, "loss": 0.9591, "step": 84 }, { "epoch": 0.12830188679245283, "grad_norm": 0.4159535642199707, "learning_rate": 2.135678391959799e-05, "loss": 0.9782, "step": 85 }, { "epoch": 0.129811320754717, "grad_norm": 0.4055951184040692, "learning_rate": 2.1608040201005025e-05, "loss": 0.9885, "step": 86 }, { "epoch": 0.13132075471698113, "grad_norm": 0.3740604222184423, "learning_rate": 2.1859296482412062e-05, "loss": 0.9783, "step": 87 }, { "epoch": 0.1328301886792453, "grad_norm": 0.41043743276277883, "learning_rate": 2.21105527638191e-05, "loss": 0.9789, "step": 88 }, { "epoch": 0.13433962264150942, "grad_norm": 0.3744677884389674, "learning_rate": 2.2361809045226133e-05, "loss": 0.8614, "step": 89 }, { "epoch": 0.13584905660377358, "grad_norm": 0.3674741281297594, "learning_rate": 2.2613065326633167e-05, "loss": 0.8971, "step": 90 }, { "epoch": 0.13735849056603774, "grad_norm": 0.41628170180596963, "learning_rate": 2.28643216080402e-05, "loss": 0.8112, "step": 91 }, { "epoch": 0.13886792452830188, "grad_norm": 0.4508017917622187, "learning_rate": 2.3115577889447238e-05, "loss": 1.0037, "step": 92 }, { "epoch": 0.14037735849056604, "grad_norm": 0.5170078422537966, "learning_rate": 2.3366834170854275e-05, "loss": 1.0547, "step": 93 }, { "epoch": 0.1418867924528302, "grad_norm": 0.5212510340969881, "learning_rate": 2.361809045226131e-05, "loss": 1.0543, "step": 94 }, { "epoch": 0.14339622641509434, "grad_norm": 0.6121067181622158, "learning_rate": 2.3869346733668342e-05, "loss": 0.9061, "step": 95 }, { "epoch": 0.1449056603773585, "grad_norm": 0.5238865642185518, "learning_rate": 2.4120603015075376e-05, "loss": 0.9103, "step": 96 }, { "epoch": 0.14641509433962263, "grad_norm": 0.4171766577781957, "learning_rate": 2.4371859296482413e-05, "loss": 0.8452, "step": 97 }, { "epoch": 0.1479245283018868, "grad_norm": 0.5448433488826973, "learning_rate": 2.462311557788945e-05, "loss": 1.0022, "step": 98 }, { "epoch": 0.14943396226415095, "grad_norm": 1.0734992777741617, "learning_rate": 2.4874371859296484e-05, "loss": 0.943, "step": 99 }, { "epoch": 0.1509433962264151, "grad_norm": 0.49449168180642084, "learning_rate": 2.5125628140703518e-05, "loss": 0.9584, "step": 100 }, { "epoch": 0.15245283018867925, "grad_norm": 0.5088881724419568, "learning_rate": 2.5376884422110552e-05, "loss": 0.8866, "step": 101 }, { "epoch": 0.15396226415094338, "grad_norm": 0.508102690727942, "learning_rate": 2.562814070351759e-05, "loss": 0.9326, "step": 102 }, { "epoch": 0.15547169811320755, "grad_norm": 0.4653650117166445, "learning_rate": 2.5879396984924626e-05, "loss": 0.953, "step": 103 }, { "epoch": 0.1569811320754717, "grad_norm": 0.46750559798962293, "learning_rate": 2.613065326633166e-05, "loss": 0.8609, "step": 104 }, { "epoch": 0.15849056603773584, "grad_norm": 0.6835796303360562, "learning_rate": 2.6381909547738694e-05, "loss": 0.8787, "step": 105 }, { "epoch": 0.16, "grad_norm": 0.5141911707155195, "learning_rate": 2.6633165829145728e-05, "loss": 0.9585, "step": 106 }, { "epoch": 0.16150943396226414, "grad_norm": 0.4883615708108046, "learning_rate": 2.6884422110552765e-05, "loss": 0.9979, "step": 107 }, { "epoch": 0.1630188679245283, "grad_norm": 0.5190216260067821, "learning_rate": 2.7135678391959802e-05, "loss": 0.865, "step": 108 }, { "epoch": 0.16452830188679246, "grad_norm": 0.408410057009239, "learning_rate": 2.738693467336684e-05, "loss": 0.9051, "step": 109 }, { "epoch": 0.1660377358490566, "grad_norm": 0.4906888730107437, "learning_rate": 2.763819095477387e-05, "loss": 0.8904, "step": 110 }, { "epoch": 0.16754716981132076, "grad_norm": 0.42357291845895245, "learning_rate": 2.7889447236180903e-05, "loss": 0.8314, "step": 111 }, { "epoch": 0.16905660377358492, "grad_norm": 1.440741980111514, "learning_rate": 2.814070351758794e-05, "loss": 1.0156, "step": 112 }, { "epoch": 0.17056603773584905, "grad_norm": 0.5124296806329448, "learning_rate": 2.8391959798994978e-05, "loss": 0.8618, "step": 113 }, { "epoch": 0.1720754716981132, "grad_norm": 0.5963788620163426, "learning_rate": 2.8643216080402015e-05, "loss": 0.964, "step": 114 }, { "epoch": 0.17358490566037735, "grad_norm": 0.45527147444306076, "learning_rate": 2.8894472361809045e-05, "loss": 0.8331, "step": 115 }, { "epoch": 0.1750943396226415, "grad_norm": 0.5115219921915511, "learning_rate": 2.914572864321608e-05, "loss": 0.9331, "step": 116 }, { "epoch": 0.17660377358490567, "grad_norm": 4.556437650890275, "learning_rate": 2.9396984924623116e-05, "loss": 0.962, "step": 117 }, { "epoch": 0.1781132075471698, "grad_norm": 0.5312376654687548, "learning_rate": 2.9648241206030153e-05, "loss": 0.8834, "step": 118 }, { "epoch": 0.17962264150943397, "grad_norm": 0.45200310784615344, "learning_rate": 2.989949748743719e-05, "loss": 0.9321, "step": 119 }, { "epoch": 0.1811320754716981, "grad_norm": 0.49479911246054076, "learning_rate": 3.015075376884422e-05, "loss": 0.8612, "step": 120 }, { "epoch": 0.18264150943396226, "grad_norm": 0.5356223053603791, "learning_rate": 3.0402010050251255e-05, "loss": 0.9651, "step": 121 }, { "epoch": 0.18415094339622642, "grad_norm": 0.4775869273954035, "learning_rate": 3.065326633165829e-05, "loss": 0.9167, "step": 122 }, { "epoch": 0.18566037735849056, "grad_norm": 0.38759830910589615, "learning_rate": 3.0904522613065326e-05, "loss": 0.7914, "step": 123 }, { "epoch": 0.18716981132075472, "grad_norm": 0.505778754703735, "learning_rate": 3.1155778894472366e-05, "loss": 0.9004, "step": 124 }, { "epoch": 0.18867924528301888, "grad_norm": 1.5610813600582665, "learning_rate": 3.14070351758794e-05, "loss": 0.823, "step": 125 }, { "epoch": 0.19018867924528302, "grad_norm": 0.5033816078558214, "learning_rate": 3.1658291457286434e-05, "loss": 0.9111, "step": 126 }, { "epoch": 0.19169811320754718, "grad_norm": 0.5411937400048007, "learning_rate": 3.190954773869347e-05, "loss": 0.8375, "step": 127 }, { "epoch": 0.1932075471698113, "grad_norm": 0.4287407951685938, "learning_rate": 3.21608040201005e-05, "loss": 0.8805, "step": 128 }, { "epoch": 0.19471698113207547, "grad_norm": 0.5109121770551573, "learning_rate": 3.241206030150754e-05, "loss": 0.9597, "step": 129 }, { "epoch": 0.19622641509433963, "grad_norm": 0.4260153095022451, "learning_rate": 3.2663316582914576e-05, "loss": 0.998, "step": 130 }, { "epoch": 0.19773584905660377, "grad_norm": 0.44701240738286824, "learning_rate": 3.291457286432161e-05, "loss": 0.8782, "step": 131 }, { "epoch": 0.19924528301886793, "grad_norm": 0.44157622535461144, "learning_rate": 3.3165829145728643e-05, "loss": 0.8841, "step": 132 }, { "epoch": 0.20075471698113206, "grad_norm": 0.4801398431121367, "learning_rate": 3.341708542713568e-05, "loss": 0.8644, "step": 133 }, { "epoch": 0.20226415094339623, "grad_norm": 0.36530196405233833, "learning_rate": 3.366834170854272e-05, "loss": 0.8119, "step": 134 }, { "epoch": 0.2037735849056604, "grad_norm": 0.5570490282592016, "learning_rate": 3.391959798994975e-05, "loss": 0.8855, "step": 135 }, { "epoch": 0.20528301886792452, "grad_norm": 0.3986151374909903, "learning_rate": 3.4170854271356785e-05, "loss": 0.8876, "step": 136 }, { "epoch": 0.20679245283018868, "grad_norm": 0.43926099950410996, "learning_rate": 3.442211055276382e-05, "loss": 0.8575, "step": 137 }, { "epoch": 0.20830188679245282, "grad_norm": 0.45592611275226874, "learning_rate": 3.467336683417085e-05, "loss": 0.8576, "step": 138 }, { "epoch": 0.20981132075471698, "grad_norm": 0.5049788202212411, "learning_rate": 3.4924623115577894e-05, "loss": 0.9399, "step": 139 }, { "epoch": 0.21132075471698114, "grad_norm": 0.3968817831624458, "learning_rate": 3.517587939698493e-05, "loss": 0.884, "step": 140 }, { "epoch": 0.21283018867924527, "grad_norm": 0.5034047348280332, "learning_rate": 3.542713567839196e-05, "loss": 0.9453, "step": 141 }, { "epoch": 0.21433962264150944, "grad_norm": 0.38193650268073553, "learning_rate": 3.5678391959798995e-05, "loss": 0.8446, "step": 142 }, { "epoch": 0.2158490566037736, "grad_norm": 0.47293401837423554, "learning_rate": 3.592964824120603e-05, "loss": 0.8357, "step": 143 }, { "epoch": 0.21735849056603773, "grad_norm": 0.5128727207732378, "learning_rate": 3.618090452261307e-05, "loss": 0.8878, "step": 144 }, { "epoch": 0.2188679245283019, "grad_norm": 0.4490745977510919, "learning_rate": 3.64321608040201e-05, "loss": 0.7525, "step": 145 }, { "epoch": 0.22037735849056603, "grad_norm": 0.5285464668249072, "learning_rate": 3.668341708542714e-05, "loss": 0.8317, "step": 146 }, { "epoch": 0.2218867924528302, "grad_norm": 0.4684615864196027, "learning_rate": 3.693467336683417e-05, "loss": 0.8679, "step": 147 }, { "epoch": 0.22339622641509435, "grad_norm": 0.4384754233570824, "learning_rate": 3.7185929648241204e-05, "loss": 0.9187, "step": 148 }, { "epoch": 0.22490566037735849, "grad_norm": 0.4959217477046417, "learning_rate": 3.7437185929648245e-05, "loss": 0.9176, "step": 149 }, { "epoch": 0.22641509433962265, "grad_norm": 0.513351882366079, "learning_rate": 3.768844221105528e-05, "loss": 0.9131, "step": 150 }, { "epoch": 0.22792452830188678, "grad_norm": 0.46013492291199676, "learning_rate": 3.793969849246231e-05, "loss": 0.8977, "step": 151 }, { "epoch": 0.22943396226415094, "grad_norm": 0.4228108610077172, "learning_rate": 3.8190954773869346e-05, "loss": 0.8378, "step": 152 }, { "epoch": 0.2309433962264151, "grad_norm": 0.5211386844607362, "learning_rate": 3.844221105527639e-05, "loss": 0.8609, "step": 153 }, { "epoch": 0.23245283018867924, "grad_norm": 0.42815407260277805, "learning_rate": 3.869346733668342e-05, "loss": 0.797, "step": 154 }, { "epoch": 0.2339622641509434, "grad_norm": 0.44022812612987366, "learning_rate": 3.8944723618090455e-05, "loss": 0.8717, "step": 155 }, { "epoch": 0.23547169811320753, "grad_norm": 0.4131388371605724, "learning_rate": 3.919597989949749e-05, "loss": 0.812, "step": 156 }, { "epoch": 0.2369811320754717, "grad_norm": 0.6710269473568021, "learning_rate": 3.944723618090452e-05, "loss": 0.8191, "step": 157 }, { "epoch": 0.23849056603773586, "grad_norm": 0.4897114594610496, "learning_rate": 3.969849246231156e-05, "loss": 0.7659, "step": 158 }, { "epoch": 0.24, "grad_norm": 0.5108095802087971, "learning_rate": 3.9949748743718597e-05, "loss": 0.8281, "step": 159 }, { "epoch": 0.24150943396226415, "grad_norm": 0.6108421680937045, "learning_rate": 4.020100502512563e-05, "loss": 0.8847, "step": 160 }, { "epoch": 0.24301886792452831, "grad_norm": 0.5616258226397572, "learning_rate": 4.0452261306532664e-05, "loss": 0.8054, "step": 161 }, { "epoch": 0.24452830188679245, "grad_norm": 0.45309534243186583, "learning_rate": 4.07035175879397e-05, "loss": 0.8975, "step": 162 }, { "epoch": 0.2460377358490566, "grad_norm": 0.5773610924314223, "learning_rate": 4.095477386934674e-05, "loss": 0.9968, "step": 163 }, { "epoch": 0.24754716981132074, "grad_norm": 0.4469837688813638, "learning_rate": 4.120603015075377e-05, "loss": 0.8012, "step": 164 }, { "epoch": 0.2490566037735849, "grad_norm": 2.054275267813225, "learning_rate": 4.1457286432160806e-05, "loss": 0.8895, "step": 165 }, { "epoch": 0.25056603773584907, "grad_norm": 0.5614011880192892, "learning_rate": 4.170854271356784e-05, "loss": 0.9091, "step": 166 }, { "epoch": 0.25207547169811323, "grad_norm": 0.5803759031771593, "learning_rate": 4.1959798994974874e-05, "loss": 0.9152, "step": 167 }, { "epoch": 0.25358490566037734, "grad_norm": 0.6748698350397639, "learning_rate": 4.2211055276381914e-05, "loss": 0.8804, "step": 168 }, { "epoch": 0.2550943396226415, "grad_norm": 0.5901722233952649, "learning_rate": 4.246231155778895e-05, "loss": 0.8415, "step": 169 }, { "epoch": 0.25660377358490566, "grad_norm": 0.6343703843442664, "learning_rate": 4.271356783919598e-05, "loss": 0.8981, "step": 170 }, { "epoch": 0.2581132075471698, "grad_norm": 0.5181992447670495, "learning_rate": 4.2964824120603016e-05, "loss": 0.8191, "step": 171 }, { "epoch": 0.259622641509434, "grad_norm": 0.4542387485161782, "learning_rate": 4.321608040201005e-05, "loss": 0.8212, "step": 172 }, { "epoch": 0.2611320754716981, "grad_norm": 0.5151193905845751, "learning_rate": 4.346733668341709e-05, "loss": 0.8486, "step": 173 }, { "epoch": 0.26264150943396225, "grad_norm": 1.037000010215159, "learning_rate": 4.3718592964824124e-05, "loss": 0.813, "step": 174 }, { "epoch": 0.2641509433962264, "grad_norm": 0.4911515809403663, "learning_rate": 4.396984924623116e-05, "loss": 0.8018, "step": 175 }, { "epoch": 0.2656603773584906, "grad_norm": 0.4701828499446655, "learning_rate": 4.42211055276382e-05, "loss": 0.8715, "step": 176 }, { "epoch": 0.26716981132075474, "grad_norm": 0.4950297728241695, "learning_rate": 4.4472361809045225e-05, "loss": 0.8019, "step": 177 }, { "epoch": 0.26867924528301884, "grad_norm": 0.5589767820377305, "learning_rate": 4.4723618090452266e-05, "loss": 0.9524, "step": 178 }, { "epoch": 0.270188679245283, "grad_norm": 0.571633016201735, "learning_rate": 4.49748743718593e-05, "loss": 0.8792, "step": 179 }, { "epoch": 0.27169811320754716, "grad_norm": 0.476203248798549, "learning_rate": 4.522613065326633e-05, "loss": 0.8768, "step": 180 }, { "epoch": 0.2732075471698113, "grad_norm": 0.48088169221041444, "learning_rate": 4.5477386934673374e-05, "loss": 0.7918, "step": 181 }, { "epoch": 0.2747169811320755, "grad_norm": 0.5571799591230107, "learning_rate": 4.57286432160804e-05, "loss": 0.8038, "step": 182 }, { "epoch": 0.27622641509433965, "grad_norm": 0.5527228247900622, "learning_rate": 4.597989949748744e-05, "loss": 0.8643, "step": 183 }, { "epoch": 0.27773584905660376, "grad_norm": 0.5211423771128341, "learning_rate": 4.6231155778894475e-05, "loss": 0.8878, "step": 184 }, { "epoch": 0.2792452830188679, "grad_norm": 0.6572932038519691, "learning_rate": 4.648241206030151e-05, "loss": 0.8722, "step": 185 }, { "epoch": 0.2807547169811321, "grad_norm": 0.7101877955695006, "learning_rate": 4.673366834170855e-05, "loss": 0.725, "step": 186 }, { "epoch": 0.28226415094339624, "grad_norm": 0.49463109890029217, "learning_rate": 4.6984924623115577e-05, "loss": 0.8044, "step": 187 }, { "epoch": 0.2837735849056604, "grad_norm": 0.6761004422971036, "learning_rate": 4.723618090452262e-05, "loss": 0.8928, "step": 188 }, { "epoch": 0.2852830188679245, "grad_norm": 0.7308836814149202, "learning_rate": 4.748743718592965e-05, "loss": 0.7881, "step": 189 }, { "epoch": 0.28679245283018867, "grad_norm": 0.5752705782759212, "learning_rate": 4.7738693467336685e-05, "loss": 0.7849, "step": 190 }, { "epoch": 0.28830188679245283, "grad_norm": 0.6124086779896891, "learning_rate": 4.7989949748743725e-05, "loss": 0.8595, "step": 191 }, { "epoch": 0.289811320754717, "grad_norm": 0.9703844768396788, "learning_rate": 4.824120603015075e-05, "loss": 0.8965, "step": 192 }, { "epoch": 0.29132075471698116, "grad_norm": 0.920385457687424, "learning_rate": 4.849246231155779e-05, "loss": 0.8885, "step": 193 }, { "epoch": 0.29283018867924526, "grad_norm": 0.5451187115485865, "learning_rate": 4.874371859296483e-05, "loss": 0.8876, "step": 194 }, { "epoch": 0.2943396226415094, "grad_norm": 0.8924359703337917, "learning_rate": 4.899497487437186e-05, "loss": 0.7749, "step": 195 }, { "epoch": 0.2958490566037736, "grad_norm": 0.6987574115677371, "learning_rate": 4.92462311557789e-05, "loss": 0.8024, "step": 196 }, { "epoch": 0.29735849056603775, "grad_norm": 1.0879879193939195, "learning_rate": 4.949748743718593e-05, "loss": 0.908, "step": 197 }, { "epoch": 0.2988679245283019, "grad_norm": 0.7248543407154597, "learning_rate": 4.974874371859297e-05, "loss": 0.8631, "step": 198 }, { "epoch": 0.300377358490566, "grad_norm": 0.8236665003667092, "learning_rate": 5e-05, "loss": 0.7335, "step": 199 }, { "epoch": 0.3018867924528302, "grad_norm": 0.6243498075203732, "learning_rate": 4.9972020145495246e-05, "loss": 0.8225, "step": 200 }, { "epoch": 0.30339622641509434, "grad_norm": 0.9010469648278001, "learning_rate": 4.994404029099049e-05, "loss": 0.8684, "step": 201 }, { "epoch": 0.3049056603773585, "grad_norm": 0.8500224224154146, "learning_rate": 4.991606043648573e-05, "loss": 0.7941, "step": 202 }, { "epoch": 0.30641509433962266, "grad_norm": 0.5361097858505313, "learning_rate": 4.9888080581980976e-05, "loss": 0.8696, "step": 203 }, { "epoch": 0.30792452830188677, "grad_norm": 0.9288603951781687, "learning_rate": 4.986010072747622e-05, "loss": 0.9553, "step": 204 }, { "epoch": 0.30943396226415093, "grad_norm": 0.7165689385459199, "learning_rate": 4.983212087297146e-05, "loss": 0.7807, "step": 205 }, { "epoch": 0.3109433962264151, "grad_norm": 0.6324302878103103, "learning_rate": 4.9804141018466706e-05, "loss": 0.826, "step": 206 }, { "epoch": 0.31245283018867925, "grad_norm": 1.0280953874798517, "learning_rate": 4.977616116396195e-05, "loss": 0.8772, "step": 207 }, { "epoch": 0.3139622641509434, "grad_norm": 0.6365774522631158, "learning_rate": 4.974818130945719e-05, "loss": 0.7882, "step": 208 }, { "epoch": 0.3154716981132075, "grad_norm": 0.7866464392944168, "learning_rate": 4.9720201454952436e-05, "loss": 0.8591, "step": 209 }, { "epoch": 0.3169811320754717, "grad_norm": 0.9900793181165777, "learning_rate": 4.969222160044768e-05, "loss": 0.9166, "step": 210 }, { "epoch": 0.31849056603773584, "grad_norm": 0.5027834608849903, "learning_rate": 4.966424174594292e-05, "loss": 0.8475, "step": 211 }, { "epoch": 0.32, "grad_norm": 0.8149475773947513, "learning_rate": 4.9636261891438166e-05, "loss": 0.8877, "step": 212 }, { "epoch": 0.32150943396226417, "grad_norm": 0.7936117976140553, "learning_rate": 4.960828203693341e-05, "loss": 0.7943, "step": 213 }, { "epoch": 0.3230188679245283, "grad_norm": 0.6047908551122195, "learning_rate": 4.958030218242865e-05, "loss": 0.7487, "step": 214 }, { "epoch": 0.32452830188679244, "grad_norm": 0.8210359919252752, "learning_rate": 4.9552322327923896e-05, "loss": 0.8052, "step": 215 }, { "epoch": 0.3260377358490566, "grad_norm": 0.5010377931393522, "learning_rate": 4.952434247341914e-05, "loss": 0.8029, "step": 216 }, { "epoch": 0.32754716981132076, "grad_norm": 0.6159395499082359, "learning_rate": 4.949636261891438e-05, "loss": 0.7816, "step": 217 }, { "epoch": 0.3290566037735849, "grad_norm": 0.6021660774660482, "learning_rate": 4.9468382764409626e-05, "loss": 0.8034, "step": 218 }, { "epoch": 0.3305660377358491, "grad_norm": 0.48862257577764984, "learning_rate": 4.944040290990487e-05, "loss": 0.7641, "step": 219 }, { "epoch": 0.3320754716981132, "grad_norm": 0.7291022201204262, "learning_rate": 4.941242305540011e-05, "loss": 0.7622, "step": 220 }, { "epoch": 0.33358490566037735, "grad_norm": 0.5680273830366244, "learning_rate": 4.9384443200895356e-05, "loss": 0.8032, "step": 221 }, { "epoch": 0.3350943396226415, "grad_norm": 0.7388526494706458, "learning_rate": 4.93564633463906e-05, "loss": 0.7876, "step": 222 }, { "epoch": 0.3366037735849057, "grad_norm": 0.8366607526648661, "learning_rate": 4.932848349188584e-05, "loss": 0.838, "step": 223 }, { "epoch": 0.33811320754716984, "grad_norm": 0.5362665065766964, "learning_rate": 4.930050363738109e-05, "loss": 0.8709, "step": 224 }, { "epoch": 0.33962264150943394, "grad_norm": 0.7046748659670787, "learning_rate": 4.927252378287633e-05, "loss": 0.8295, "step": 225 }, { "epoch": 0.3411320754716981, "grad_norm": 0.5709018100667398, "learning_rate": 4.924454392837157e-05, "loss": 0.7931, "step": 226 }, { "epoch": 0.34264150943396227, "grad_norm": 1.5265206975752903, "learning_rate": 4.9216564073866817e-05, "loss": 0.7451, "step": 227 }, { "epoch": 0.3441509433962264, "grad_norm": 0.6830742695717387, "learning_rate": 4.918858421936206e-05, "loss": 0.8248, "step": 228 }, { "epoch": 0.3456603773584906, "grad_norm": 0.7664933367072962, "learning_rate": 4.91606043648573e-05, "loss": 0.8626, "step": 229 }, { "epoch": 0.3471698113207547, "grad_norm": 0.4891691600978454, "learning_rate": 4.913262451035255e-05, "loss": 0.8749, "step": 230 }, { "epoch": 0.34867924528301886, "grad_norm": 1.1932498298234255, "learning_rate": 4.910464465584779e-05, "loss": 0.7708, "step": 231 }, { "epoch": 0.350188679245283, "grad_norm": 0.6049277141915559, "learning_rate": 4.907666480134303e-05, "loss": 0.7572, "step": 232 }, { "epoch": 0.3516981132075472, "grad_norm": 0.6315896027669823, "learning_rate": 4.9048684946838284e-05, "loss": 0.8363, "step": 233 }, { "epoch": 0.35320754716981134, "grad_norm": 0.688434404877305, "learning_rate": 4.902070509233353e-05, "loss": 0.8554, "step": 234 }, { "epoch": 0.35471698113207545, "grad_norm": 0.5422581659132807, "learning_rate": 4.899272523782877e-05, "loss": 0.8488, "step": 235 }, { "epoch": 0.3562264150943396, "grad_norm": 0.591458349973424, "learning_rate": 4.896474538332401e-05, "loss": 0.8144, "step": 236 }, { "epoch": 0.35773584905660377, "grad_norm": 0.5604699909131293, "learning_rate": 4.893676552881925e-05, "loss": 0.7237, "step": 237 }, { "epoch": 0.35924528301886793, "grad_norm": 0.4673344116856225, "learning_rate": 4.8908785674314494e-05, "loss": 0.7766, "step": 238 }, { "epoch": 0.3607547169811321, "grad_norm": 0.5436449382903117, "learning_rate": 4.888080581980974e-05, "loss": 0.8197, "step": 239 }, { "epoch": 0.3622641509433962, "grad_norm": 0.479468226600264, "learning_rate": 4.885282596530498e-05, "loss": 0.7939, "step": 240 }, { "epoch": 0.36377358490566036, "grad_norm": 0.6602471005804883, "learning_rate": 4.8824846110800224e-05, "loss": 0.8495, "step": 241 }, { "epoch": 0.3652830188679245, "grad_norm": 0.4571917624290203, "learning_rate": 4.879686625629547e-05, "loss": 0.7913, "step": 242 }, { "epoch": 0.3667924528301887, "grad_norm": 0.6232210769659097, "learning_rate": 4.876888640179072e-05, "loss": 0.7552, "step": 243 }, { "epoch": 0.36830188679245285, "grad_norm": 0.5204436625742763, "learning_rate": 4.874090654728596e-05, "loss": 0.7701, "step": 244 }, { "epoch": 0.36981132075471695, "grad_norm": 0.6612096193726886, "learning_rate": 4.8712926692781204e-05, "loss": 0.7847, "step": 245 }, { "epoch": 0.3713207547169811, "grad_norm": 0.45379455740824437, "learning_rate": 4.868494683827644e-05, "loss": 0.8053, "step": 246 }, { "epoch": 0.3728301886792453, "grad_norm": 0.5973811927067406, "learning_rate": 4.8656966983771684e-05, "loss": 0.8246, "step": 247 }, { "epoch": 0.37433962264150944, "grad_norm": 0.5220120154434879, "learning_rate": 4.862898712926693e-05, "loss": 0.8486, "step": 248 }, { "epoch": 0.3758490566037736, "grad_norm": 0.5431811554175676, "learning_rate": 4.860100727476217e-05, "loss": 0.8141, "step": 249 }, { "epoch": 0.37735849056603776, "grad_norm": 0.5029866404820176, "learning_rate": 4.8573027420257414e-05, "loss": 0.7952, "step": 250 }, { "epoch": 0.37886792452830187, "grad_norm": 0.5898909870291161, "learning_rate": 4.854504756575266e-05, "loss": 0.8547, "step": 251 }, { "epoch": 0.38037735849056603, "grad_norm": 0.6238187627468014, "learning_rate": 4.851706771124791e-05, "loss": 0.7726, "step": 252 }, { "epoch": 0.3818867924528302, "grad_norm": 0.42756782085548817, "learning_rate": 4.848908785674315e-05, "loss": 0.7988, "step": 253 }, { "epoch": 0.38339622641509435, "grad_norm": 0.5943288699361647, "learning_rate": 4.8461108002238394e-05, "loss": 0.8483, "step": 254 }, { "epoch": 0.3849056603773585, "grad_norm": 0.44627827351325955, "learning_rate": 4.843312814773364e-05, "loss": 0.7907, "step": 255 }, { "epoch": 0.3864150943396226, "grad_norm": 0.6245548346580109, "learning_rate": 4.840514829322888e-05, "loss": 0.7789, "step": 256 }, { "epoch": 0.3879245283018868, "grad_norm": 0.5052966445102851, "learning_rate": 4.837716843872412e-05, "loss": 0.8078, "step": 257 }, { "epoch": 0.38943396226415095, "grad_norm": 0.5086194272663704, "learning_rate": 4.834918858421936e-05, "loss": 0.797, "step": 258 }, { "epoch": 0.3909433962264151, "grad_norm": 0.4917833870150589, "learning_rate": 4.8321208729714604e-05, "loss": 0.7691, "step": 259 }, { "epoch": 0.39245283018867927, "grad_norm": 0.5338995503228233, "learning_rate": 4.829322887520985e-05, "loss": 0.7981, "step": 260 }, { "epoch": 0.3939622641509434, "grad_norm": 0.5412599367495976, "learning_rate": 4.82652490207051e-05, "loss": 0.8443, "step": 261 }, { "epoch": 0.39547169811320754, "grad_norm": 0.580624564144757, "learning_rate": 4.823726916620034e-05, "loss": 0.7907, "step": 262 }, { "epoch": 0.3969811320754717, "grad_norm": 0.48369587610402004, "learning_rate": 4.8209289311695584e-05, "loss": 0.794, "step": 263 }, { "epoch": 0.39849056603773586, "grad_norm": 0.49030096209333235, "learning_rate": 4.818130945719083e-05, "loss": 0.7584, "step": 264 }, { "epoch": 0.4, "grad_norm": 0.5061198889128261, "learning_rate": 4.815332960268607e-05, "loss": 0.7352, "step": 265 }, { "epoch": 0.40150943396226413, "grad_norm": 0.44245029843710665, "learning_rate": 4.8125349748181314e-05, "loss": 0.7376, "step": 266 }, { "epoch": 0.4030188679245283, "grad_norm": 0.5564346808520174, "learning_rate": 4.809736989367655e-05, "loss": 0.7939, "step": 267 }, { "epoch": 0.40452830188679245, "grad_norm": 0.4231713875112469, "learning_rate": 4.8069390039171794e-05, "loss": 0.7596, "step": 268 }, { "epoch": 0.4060377358490566, "grad_norm": 0.5072296368778142, "learning_rate": 4.804141018466704e-05, "loss": 0.9397, "step": 269 }, { "epoch": 0.4075471698113208, "grad_norm": 1.3091372981391216, "learning_rate": 4.801343033016228e-05, "loss": 0.7321, "step": 270 }, { "epoch": 0.4090566037735849, "grad_norm": 0.7068197146851543, "learning_rate": 4.798545047565753e-05, "loss": 0.8049, "step": 271 }, { "epoch": 0.41056603773584904, "grad_norm": 0.39121782749476086, "learning_rate": 4.7957470621152775e-05, "loss": 0.707, "step": 272 }, { "epoch": 0.4120754716981132, "grad_norm": 0.6092417359403263, "learning_rate": 4.792949076664802e-05, "loss": 0.7187, "step": 273 }, { "epoch": 0.41358490566037737, "grad_norm": 0.44996301987627735, "learning_rate": 4.790151091214326e-05, "loss": 0.8322, "step": 274 }, { "epoch": 0.41509433962264153, "grad_norm": 0.6626369105932329, "learning_rate": 4.7873531057638505e-05, "loss": 0.7857, "step": 275 }, { "epoch": 0.41660377358490563, "grad_norm": 0.4967773382857501, "learning_rate": 4.784555120313375e-05, "loss": 0.746, "step": 276 }, { "epoch": 0.4181132075471698, "grad_norm": 0.5435211685588449, "learning_rate": 4.7817571348628985e-05, "loss": 0.7965, "step": 277 }, { "epoch": 0.41962264150943396, "grad_norm": 0.7022029449146439, "learning_rate": 4.778959149412423e-05, "loss": 0.8653, "step": 278 }, { "epoch": 0.4211320754716981, "grad_norm": 0.45601467205881, "learning_rate": 4.776161163961947e-05, "loss": 0.777, "step": 279 }, { "epoch": 0.4226415094339623, "grad_norm": 0.6644957716637454, "learning_rate": 4.773363178511472e-05, "loss": 0.8214, "step": 280 }, { "epoch": 0.4241509433962264, "grad_norm": 0.6246544429191916, "learning_rate": 4.7705651930609965e-05, "loss": 0.8785, "step": 281 }, { "epoch": 0.42566037735849055, "grad_norm": 0.5291406456848594, "learning_rate": 4.767767207610521e-05, "loss": 0.7691, "step": 282 }, { "epoch": 0.4271698113207547, "grad_norm": 0.7456610522310195, "learning_rate": 4.764969222160045e-05, "loss": 0.8264, "step": 283 }, { "epoch": 0.4286792452830189, "grad_norm": 0.6353984731391955, "learning_rate": 4.7621712367095695e-05, "loss": 0.7689, "step": 284 }, { "epoch": 0.43018867924528303, "grad_norm": 0.5508771074957549, "learning_rate": 4.759373251259094e-05, "loss": 0.803, "step": 285 }, { "epoch": 0.4316981132075472, "grad_norm": 0.6135542045293, "learning_rate": 4.756575265808618e-05, "loss": 0.7338, "step": 286 }, { "epoch": 0.4332075471698113, "grad_norm": 0.5030983850050345, "learning_rate": 4.7537772803581425e-05, "loss": 0.818, "step": 287 }, { "epoch": 0.43471698113207546, "grad_norm": 0.7052367727423026, "learning_rate": 4.750979294907666e-05, "loss": 0.8135, "step": 288 }, { "epoch": 0.4362264150943396, "grad_norm": 0.533554107770091, "learning_rate": 4.748181309457191e-05, "loss": 0.9047, "step": 289 }, { "epoch": 0.4377358490566038, "grad_norm": 0.881159613978913, "learning_rate": 4.7453833240067155e-05, "loss": 0.884, "step": 290 }, { "epoch": 0.43924528301886795, "grad_norm": 0.7306080850175966, "learning_rate": 4.74258533855624e-05, "loss": 0.8131, "step": 291 }, { "epoch": 0.44075471698113206, "grad_norm": 0.5044741998175036, "learning_rate": 4.739787353105764e-05, "loss": 0.7513, "step": 292 }, { "epoch": 0.4422641509433962, "grad_norm": 0.7168412587080375, "learning_rate": 4.7369893676552885e-05, "loss": 0.7788, "step": 293 }, { "epoch": 0.4437735849056604, "grad_norm": 0.4558320123761158, "learning_rate": 4.734191382204813e-05, "loss": 0.7893, "step": 294 }, { "epoch": 0.44528301886792454, "grad_norm": 0.7940272573905822, "learning_rate": 4.731393396754337e-05, "loss": 0.8386, "step": 295 }, { "epoch": 0.4467924528301887, "grad_norm": 0.5513668143022887, "learning_rate": 4.7285954113038615e-05, "loss": 0.7833, "step": 296 }, { "epoch": 0.4483018867924528, "grad_norm": 0.7079029126017938, "learning_rate": 4.725797425853386e-05, "loss": 0.7196, "step": 297 }, { "epoch": 0.44981132075471697, "grad_norm": 1.229514968171817, "learning_rate": 4.72299944040291e-05, "loss": 0.8989, "step": 298 }, { "epoch": 0.45132075471698113, "grad_norm": 0.6404714941336582, "learning_rate": 4.7202014549524345e-05, "loss": 0.8317, "step": 299 }, { "epoch": 0.4528301886792453, "grad_norm": 2.4628741313154188, "learning_rate": 4.717403469501959e-05, "loss": 0.8618, "step": 300 }, { "epoch": 0.45433962264150946, "grad_norm": 0.8912363562097001, "learning_rate": 4.714605484051483e-05, "loss": 0.8371, "step": 301 }, { "epoch": 0.45584905660377356, "grad_norm": 0.7676920170222373, "learning_rate": 4.7118074986010076e-05, "loss": 0.7677, "step": 302 }, { "epoch": 0.4573584905660377, "grad_norm": 0.9101103649802816, "learning_rate": 4.709009513150532e-05, "loss": 0.7726, "step": 303 }, { "epoch": 0.4588679245283019, "grad_norm": 0.865381015702614, "learning_rate": 4.706211527700056e-05, "loss": 0.77, "step": 304 }, { "epoch": 0.46037735849056605, "grad_norm": 0.629343619599737, "learning_rate": 4.7034135422495806e-05, "loss": 0.7224, "step": 305 }, { "epoch": 0.4618867924528302, "grad_norm": 0.9800276759621851, "learning_rate": 4.700615556799105e-05, "loss": 0.8232, "step": 306 }, { "epoch": 0.4633962264150943, "grad_norm": 0.5404048701315444, "learning_rate": 4.697817571348629e-05, "loss": 0.7897, "step": 307 }, { "epoch": 0.4649056603773585, "grad_norm": 1.0278186663808309, "learning_rate": 4.6950195858981536e-05, "loss": 0.8159, "step": 308 }, { "epoch": 0.46641509433962264, "grad_norm": 0.5823665222431843, "learning_rate": 4.692221600447678e-05, "loss": 0.7906, "step": 309 }, { "epoch": 0.4679245283018868, "grad_norm": 0.7252680198432144, "learning_rate": 4.689423614997202e-05, "loss": 0.764, "step": 310 }, { "epoch": 0.46943396226415096, "grad_norm": 0.6570038646216206, "learning_rate": 4.6866256295467266e-05, "loss": 0.6926, "step": 311 }, { "epoch": 0.47094339622641507, "grad_norm": 0.5621715975516278, "learning_rate": 4.683827644096251e-05, "loss": 0.6961, "step": 312 }, { "epoch": 0.47245283018867923, "grad_norm": 0.4779432536942869, "learning_rate": 4.681029658645775e-05, "loss": 0.7378, "step": 313 }, { "epoch": 0.4739622641509434, "grad_norm": 0.674459154650339, "learning_rate": 4.6782316731952996e-05, "loss": 0.7874, "step": 314 }, { "epoch": 0.47547169811320755, "grad_norm": 0.5004466668430579, "learning_rate": 4.675433687744824e-05, "loss": 0.7406, "step": 315 }, { "epoch": 0.4769811320754717, "grad_norm": 0.5395088155154139, "learning_rate": 4.672635702294348e-05, "loss": 0.7362, "step": 316 }, { "epoch": 0.4784905660377359, "grad_norm": 0.6418834874570026, "learning_rate": 4.6698377168438726e-05, "loss": 0.8157, "step": 317 }, { "epoch": 0.48, "grad_norm": 0.4797662965902652, "learning_rate": 4.667039731393397e-05, "loss": 0.7916, "step": 318 }, { "epoch": 0.48150943396226414, "grad_norm": 0.6366002378504793, "learning_rate": 4.664241745942921e-05, "loss": 0.8928, "step": 319 }, { "epoch": 0.4830188679245283, "grad_norm": 0.7131620418095839, "learning_rate": 4.6614437604924456e-05, "loss": 0.7556, "step": 320 }, { "epoch": 0.48452830188679247, "grad_norm": 0.5851313011583172, "learning_rate": 4.65864577504197e-05, "loss": 0.8102, "step": 321 }, { "epoch": 0.48603773584905663, "grad_norm": 0.5129745799858986, "learning_rate": 4.655847789591494e-05, "loss": 0.7388, "step": 322 }, { "epoch": 0.48754716981132074, "grad_norm": 0.48227342480016644, "learning_rate": 4.6530498041410186e-05, "loss": 0.7968, "step": 323 }, { "epoch": 0.4890566037735849, "grad_norm": 0.5765939446109118, "learning_rate": 4.650251818690543e-05, "loss": 0.8276, "step": 324 }, { "epoch": 0.49056603773584906, "grad_norm": 0.45210550750524087, "learning_rate": 4.647453833240067e-05, "loss": 0.7838, "step": 325 }, { "epoch": 0.4920754716981132, "grad_norm": 0.5235976941093462, "learning_rate": 4.6446558477895916e-05, "loss": 0.8524, "step": 326 }, { "epoch": 0.4935849056603774, "grad_norm": 0.948045987049377, "learning_rate": 4.641857862339116e-05, "loss": 0.783, "step": 327 }, { "epoch": 0.4950943396226415, "grad_norm": 0.8405622911075784, "learning_rate": 4.63905987688864e-05, "loss": 0.7448, "step": 328 }, { "epoch": 0.49660377358490565, "grad_norm": 0.5163025902946453, "learning_rate": 4.6362618914381646e-05, "loss": 0.8237, "step": 329 }, { "epoch": 0.4981132075471698, "grad_norm": 0.7136811122203169, "learning_rate": 4.633463905987689e-05, "loss": 0.8279, "step": 330 }, { "epoch": 0.499622641509434, "grad_norm": 0.5478782108688454, "learning_rate": 4.630665920537213e-05, "loss": 0.7627, "step": 331 }, { "epoch": 0.5011320754716981, "grad_norm": 0.5191383894592315, "learning_rate": 4.6278679350867376e-05, "loss": 0.7477, "step": 332 }, { "epoch": 0.5026415094339622, "grad_norm": 0.7132287751115131, "learning_rate": 4.625069949636262e-05, "loss": 0.715, "step": 333 }, { "epoch": 0.5041509433962265, "grad_norm": 0.5596228616176007, "learning_rate": 4.622271964185786e-05, "loss": 0.8015, "step": 334 }, { "epoch": 0.5056603773584906, "grad_norm": 0.44498739497428585, "learning_rate": 4.6194739787353107e-05, "loss": 0.7529, "step": 335 }, { "epoch": 0.5071698113207547, "grad_norm": 0.5546353422236862, "learning_rate": 4.616675993284835e-05, "loss": 0.7459, "step": 336 }, { "epoch": 0.5086792452830189, "grad_norm": 0.45280156933343746, "learning_rate": 4.613878007834359e-05, "loss": 0.8087, "step": 337 }, { "epoch": 0.510188679245283, "grad_norm": 0.6178648389283045, "learning_rate": 4.611080022383884e-05, "loss": 0.8791, "step": 338 }, { "epoch": 0.5116981132075472, "grad_norm": 0.4595941605540381, "learning_rate": 4.608282036933409e-05, "loss": 0.7174, "step": 339 }, { "epoch": 0.5132075471698113, "grad_norm": 0.5584877878966316, "learning_rate": 4.605484051482932e-05, "loss": 0.7559, "step": 340 }, { "epoch": 0.5147169811320754, "grad_norm": 0.5173754218021847, "learning_rate": 4.602686066032457e-05, "loss": 0.7863, "step": 341 }, { "epoch": 0.5162264150943396, "grad_norm": 0.5740142718980299, "learning_rate": 4.599888080581981e-05, "loss": 0.7521, "step": 342 }, { "epoch": 0.5177358490566037, "grad_norm": 0.48616061117595205, "learning_rate": 4.5970900951315053e-05, "loss": 0.7239, "step": 343 }, { "epoch": 0.519245283018868, "grad_norm": 0.5948531580198754, "learning_rate": 4.59429210968103e-05, "loss": 0.7209, "step": 344 }, { "epoch": 0.5207547169811321, "grad_norm": 0.7290623947785152, "learning_rate": 4.591494124230554e-05, "loss": 0.759, "step": 345 }, { "epoch": 0.5222641509433962, "grad_norm": 0.488056932359024, "learning_rate": 4.5886961387800783e-05, "loss": 0.7456, "step": 346 }, { "epoch": 0.5237735849056604, "grad_norm": 0.47054544364109835, "learning_rate": 4.5858981533296034e-05, "loss": 0.7124, "step": 347 }, { "epoch": 0.5252830188679245, "grad_norm": 0.50177713952636, "learning_rate": 4.583100167879128e-05, "loss": 0.7519, "step": 348 }, { "epoch": 0.5267924528301887, "grad_norm": 0.6140162666917494, "learning_rate": 4.580302182428652e-05, "loss": 0.7098, "step": 349 }, { "epoch": 0.5283018867924528, "grad_norm": 0.4266279539939302, "learning_rate": 4.577504196978176e-05, "loss": 0.7665, "step": 350 }, { "epoch": 0.5298113207547169, "grad_norm": 0.9041107079579056, "learning_rate": 4.5747062115277e-05, "loss": 0.7677, "step": 351 }, { "epoch": 0.5313207547169811, "grad_norm": 0.5038590825788706, "learning_rate": 4.5719082260772244e-05, "loss": 0.7628, "step": 352 }, { "epoch": 0.5328301886792453, "grad_norm": 0.44535742550996604, "learning_rate": 4.569110240626749e-05, "loss": 0.7568, "step": 353 }, { "epoch": 0.5343396226415095, "grad_norm": 0.4862471874939931, "learning_rate": 4.566312255176273e-05, "loss": 0.7696, "step": 354 }, { "epoch": 0.5358490566037736, "grad_norm": 0.4386852797735026, "learning_rate": 4.5635142697257974e-05, "loss": 0.7495, "step": 355 }, { "epoch": 0.5373584905660377, "grad_norm": 0.6316124118050598, "learning_rate": 4.560716284275322e-05, "loss": 0.8228, "step": 356 }, { "epoch": 0.5388679245283019, "grad_norm": 0.41211245426770826, "learning_rate": 4.557918298824847e-05, "loss": 0.7613, "step": 357 }, { "epoch": 0.540377358490566, "grad_norm": 0.5868461457556423, "learning_rate": 4.555120313374371e-05, "loss": 0.7506, "step": 358 }, { "epoch": 0.5418867924528302, "grad_norm": 0.4717702823928158, "learning_rate": 4.5523223279238954e-05, "loss": 0.8838, "step": 359 }, { "epoch": 0.5433962264150943, "grad_norm": 0.5003003408121309, "learning_rate": 4.54952434247342e-05, "loss": 0.8818, "step": 360 }, { "epoch": 0.5449056603773584, "grad_norm": 0.4907938538260416, "learning_rate": 4.5467263570229434e-05, "loss": 0.7319, "step": 361 }, { "epoch": 0.5464150943396227, "grad_norm": 0.4644540043260514, "learning_rate": 4.543928371572468e-05, "loss": 0.735, "step": 362 }, { "epoch": 0.5479245283018868, "grad_norm": 0.5560732867320977, "learning_rate": 4.541130386121992e-05, "loss": 0.8294, "step": 363 }, { "epoch": 0.549433962264151, "grad_norm": 0.41552200602803446, "learning_rate": 4.5383324006715164e-05, "loss": 0.8152, "step": 364 }, { "epoch": 0.5509433962264151, "grad_norm": 0.539878788115719, "learning_rate": 4.535534415221041e-05, "loss": 0.7746, "step": 365 }, { "epoch": 0.5524528301886793, "grad_norm": 0.4414859813055167, "learning_rate": 4.532736429770566e-05, "loss": 0.7332, "step": 366 }, { "epoch": 0.5539622641509434, "grad_norm": 0.4321477596135969, "learning_rate": 4.52993844432009e-05, "loss": 0.7245, "step": 367 }, { "epoch": 0.5554716981132075, "grad_norm": 1.2570879787382243, "learning_rate": 4.5271404588696144e-05, "loss": 0.7191, "step": 368 }, { "epoch": 0.5569811320754717, "grad_norm": 0.5742427583402032, "learning_rate": 4.524342473419139e-05, "loss": 0.8317, "step": 369 }, { "epoch": 0.5584905660377358, "grad_norm": 0.4741231113436004, "learning_rate": 4.521544487968663e-05, "loss": 0.6925, "step": 370 }, { "epoch": 0.56, "grad_norm": 0.48896613714320825, "learning_rate": 4.518746502518187e-05, "loss": 0.6983, "step": 371 }, { "epoch": 0.5615094339622642, "grad_norm": 0.42917322921737827, "learning_rate": 4.515948517067711e-05, "loss": 0.7469, "step": 372 }, { "epoch": 0.5630188679245283, "grad_norm": 0.41025766821703724, "learning_rate": 4.5131505316172354e-05, "loss": 0.7545, "step": 373 }, { "epoch": 0.5645283018867925, "grad_norm": 0.5635148674930043, "learning_rate": 4.51035254616676e-05, "loss": 0.8145, "step": 374 }, { "epoch": 0.5660377358490566, "grad_norm": 0.40552510627195454, "learning_rate": 4.507554560716285e-05, "loss": 0.7851, "step": 375 }, { "epoch": 0.5675471698113208, "grad_norm": 0.7307038202791475, "learning_rate": 4.504756575265809e-05, "loss": 0.8569, "step": 376 }, { "epoch": 0.5690566037735849, "grad_norm": 1.0811129556259813, "learning_rate": 4.5019585898153335e-05, "loss": 0.9044, "step": 377 }, { "epoch": 0.570566037735849, "grad_norm": 0.42008950694302577, "learning_rate": 4.499160604364858e-05, "loss": 0.8506, "step": 378 }, { "epoch": 0.5720754716981132, "grad_norm": 0.42648022512202355, "learning_rate": 4.496362618914382e-05, "loss": 0.8346, "step": 379 }, { "epoch": 0.5735849056603773, "grad_norm": 0.45926513207861036, "learning_rate": 4.4935646334639065e-05, "loss": 0.8785, "step": 380 }, { "epoch": 0.5750943396226416, "grad_norm": 0.43277141797293067, "learning_rate": 4.49076664801343e-05, "loss": 0.7923, "step": 381 }, { "epoch": 0.5766037735849057, "grad_norm": 0.4171395355778019, "learning_rate": 4.4879686625629545e-05, "loss": 0.7133, "step": 382 }, { "epoch": 0.5781132075471698, "grad_norm": 0.49673115482652136, "learning_rate": 4.485170677112479e-05, "loss": 0.7325, "step": 383 }, { "epoch": 0.579622641509434, "grad_norm": 0.7116762388295127, "learning_rate": 4.482372691662003e-05, "loss": 0.7799, "step": 384 }, { "epoch": 0.5811320754716981, "grad_norm": 0.8174353919858245, "learning_rate": 4.479574706211528e-05, "loss": 0.8845, "step": 385 }, { "epoch": 0.5826415094339623, "grad_norm": 0.8335644956676623, "learning_rate": 4.4767767207610525e-05, "loss": 0.7319, "step": 386 }, { "epoch": 0.5841509433962264, "grad_norm": 0.6876169170804783, "learning_rate": 4.473978735310577e-05, "loss": 0.8176, "step": 387 }, { "epoch": 0.5856603773584905, "grad_norm": 0.5536550742512403, "learning_rate": 4.471180749860101e-05, "loss": 0.8086, "step": 388 }, { "epoch": 0.5871698113207547, "grad_norm": 0.5323019116550912, "learning_rate": 4.4683827644096255e-05, "loss": 0.7065, "step": 389 }, { "epoch": 0.5886792452830188, "grad_norm": 0.5135931602373224, "learning_rate": 4.46558477895915e-05, "loss": 0.7253, "step": 390 }, { "epoch": 0.5901886792452831, "grad_norm": 0.4559259275389328, "learning_rate": 4.462786793508674e-05, "loss": 0.7999, "step": 391 }, { "epoch": 0.5916981132075472, "grad_norm": 0.44803810661731486, "learning_rate": 4.459988808058198e-05, "loss": 0.8239, "step": 392 }, { "epoch": 0.5932075471698113, "grad_norm": 0.5069482759856541, "learning_rate": 4.457190822607722e-05, "loss": 0.8289, "step": 393 }, { "epoch": 0.5947169811320755, "grad_norm": 0.5671863335272355, "learning_rate": 4.454392837157247e-05, "loss": 0.8187, "step": 394 }, { "epoch": 0.5962264150943396, "grad_norm": 0.4606809083404723, "learning_rate": 4.4515948517067715e-05, "loss": 0.8192, "step": 395 }, { "epoch": 0.5977358490566038, "grad_norm": 0.38018446697126707, "learning_rate": 4.448796866256296e-05, "loss": 0.7691, "step": 396 }, { "epoch": 0.5992452830188679, "grad_norm": 0.4323225254781683, "learning_rate": 4.44599888080582e-05, "loss": 0.8939, "step": 397 }, { "epoch": 0.600754716981132, "grad_norm": 0.428772129920215, "learning_rate": 4.4432008953553445e-05, "loss": 0.7648, "step": 398 }, { "epoch": 0.6022641509433962, "grad_norm": 0.4106799432786821, "learning_rate": 4.440402909904869e-05, "loss": 0.7747, "step": 399 }, { "epoch": 0.6037735849056604, "grad_norm": 0.9201724559585969, "learning_rate": 4.437604924454393e-05, "loss": 0.7752, "step": 400 }, { "epoch": 0.6052830188679246, "grad_norm": 0.63142220777098, "learning_rate": 4.4348069390039175e-05, "loss": 0.7367, "step": 401 }, { "epoch": 0.6067924528301887, "grad_norm": 0.5430276304717978, "learning_rate": 4.432008953553441e-05, "loss": 0.804, "step": 402 }, { "epoch": 0.6083018867924528, "grad_norm": 0.439182439759478, "learning_rate": 4.429210968102966e-05, "loss": 0.8251, "step": 403 }, { "epoch": 0.609811320754717, "grad_norm": 0.5486244365234464, "learning_rate": 4.4264129826524905e-05, "loss": 0.7075, "step": 404 }, { "epoch": 0.6113207547169811, "grad_norm": 0.5277673001485089, "learning_rate": 4.423614997202015e-05, "loss": 0.7607, "step": 405 }, { "epoch": 0.6128301886792453, "grad_norm": 0.5256998794976435, "learning_rate": 4.420817011751539e-05, "loss": 0.7948, "step": 406 }, { "epoch": 0.6143396226415094, "grad_norm": 0.5122728349415228, "learning_rate": 4.4180190263010635e-05, "loss": 0.8268, "step": 407 }, { "epoch": 0.6158490566037735, "grad_norm": 0.5089282410409556, "learning_rate": 4.415221040850588e-05, "loss": 0.7666, "step": 408 }, { "epoch": 0.6173584905660378, "grad_norm": 1.4029487791422663, "learning_rate": 4.412423055400112e-05, "loss": 0.7687, "step": 409 }, { "epoch": 0.6188679245283019, "grad_norm": 1.0081149280553756, "learning_rate": 4.4096250699496365e-05, "loss": 0.7524, "step": 410 }, { "epoch": 0.6203773584905661, "grad_norm": 0.9006929253576336, "learning_rate": 4.406827084499161e-05, "loss": 0.8247, "step": 411 }, { "epoch": 0.6218867924528302, "grad_norm": 0.8411745640797793, "learning_rate": 4.4040290990486845e-05, "loss": 0.8105, "step": 412 }, { "epoch": 0.6233962264150943, "grad_norm": 0.7294710313107107, "learning_rate": 4.4012311135982096e-05, "loss": 0.748, "step": 413 }, { "epoch": 0.6249056603773585, "grad_norm": 4.837670218659803, "learning_rate": 4.398433128147734e-05, "loss": 0.8315, "step": 414 }, { "epoch": 0.6264150943396226, "grad_norm": 0.7291575178965143, "learning_rate": 4.395635142697258e-05, "loss": 0.7576, "step": 415 }, { "epoch": 0.6279245283018868, "grad_norm": 0.67449080523453, "learning_rate": 4.3928371572467826e-05, "loss": 0.7537, "step": 416 }, { "epoch": 0.6294339622641509, "grad_norm": 0.5951820027076258, "learning_rate": 4.390039171796307e-05, "loss": 0.7073, "step": 417 }, { "epoch": 0.630943396226415, "grad_norm": 0.7759167017515514, "learning_rate": 4.387241186345831e-05, "loss": 0.7775, "step": 418 }, { "epoch": 0.6324528301886793, "grad_norm": 0.65776925709979, "learning_rate": 4.3844432008953556e-05, "loss": 0.7754, "step": 419 }, { "epoch": 0.6339622641509434, "grad_norm": 0.783091467326888, "learning_rate": 4.38164521544488e-05, "loss": 0.7448, "step": 420 }, { "epoch": 0.6354716981132076, "grad_norm": 0.6111140171989418, "learning_rate": 4.378847229994404e-05, "loss": 0.7472, "step": 421 }, { "epoch": 0.6369811320754717, "grad_norm": 0.6125601796793868, "learning_rate": 4.3760492445439286e-05, "loss": 0.7546, "step": 422 }, { "epoch": 0.6384905660377358, "grad_norm": 0.5915644550948223, "learning_rate": 4.373251259093453e-05, "loss": 0.8102, "step": 423 }, { "epoch": 0.64, "grad_norm": 4.073736000326432, "learning_rate": 4.370453273642977e-05, "loss": 0.7923, "step": 424 }, { "epoch": 0.6415094339622641, "grad_norm": 0.8436199611615184, "learning_rate": 4.3676552881925016e-05, "loss": 0.8023, "step": 425 }, { "epoch": 0.6430188679245283, "grad_norm": 0.5606909686356082, "learning_rate": 4.364857302742026e-05, "loss": 0.7346, "step": 426 }, { "epoch": 0.6445283018867924, "grad_norm": 0.7771075034666999, "learning_rate": 4.36205931729155e-05, "loss": 0.6807, "step": 427 }, { "epoch": 0.6460377358490565, "grad_norm": 0.5192219758704827, "learning_rate": 4.3592613318410746e-05, "loss": 0.7678, "step": 428 }, { "epoch": 0.6475471698113208, "grad_norm": 0.6083104527276569, "learning_rate": 4.356463346390599e-05, "loss": 0.7552, "step": 429 }, { "epoch": 0.6490566037735849, "grad_norm": 0.601019714720564, "learning_rate": 4.353665360940123e-05, "loss": 0.8074, "step": 430 }, { "epoch": 0.6505660377358491, "grad_norm": 0.6089857562150989, "learning_rate": 4.3508673754896476e-05, "loss": 0.7508, "step": 431 }, { "epoch": 0.6520754716981132, "grad_norm": 0.6249210482138133, "learning_rate": 4.348069390039172e-05, "loss": 0.7558, "step": 432 }, { "epoch": 0.6535849056603774, "grad_norm": 2.537695860786402, "learning_rate": 4.345271404588696e-05, "loss": 0.817, "step": 433 }, { "epoch": 0.6550943396226415, "grad_norm": 0.9217891149931049, "learning_rate": 4.3424734191382206e-05, "loss": 0.7971, "step": 434 }, { "epoch": 0.6566037735849056, "grad_norm": 0.5519943971261175, "learning_rate": 4.339675433687745e-05, "loss": 0.8088, "step": 435 }, { "epoch": 0.6581132075471698, "grad_norm": 0.7425761436709358, "learning_rate": 4.336877448237269e-05, "loss": 0.756, "step": 436 }, { "epoch": 0.659622641509434, "grad_norm": 0.6942699916554731, "learning_rate": 4.3340794627867936e-05, "loss": 0.8032, "step": 437 }, { "epoch": 0.6611320754716982, "grad_norm": 0.5334615246424551, "learning_rate": 4.331281477336318e-05, "loss": 0.7998, "step": 438 }, { "epoch": 0.6626415094339623, "grad_norm": 0.8292482053092843, "learning_rate": 4.328483491885842e-05, "loss": 0.7431, "step": 439 }, { "epoch": 0.6641509433962264, "grad_norm": 0.4530926505666787, "learning_rate": 4.3256855064353666e-05, "loss": 0.7596, "step": 440 }, { "epoch": 0.6656603773584906, "grad_norm": 0.6624652358047928, "learning_rate": 4.322887520984891e-05, "loss": 0.7712, "step": 441 }, { "epoch": 0.6671698113207547, "grad_norm": 0.5064411908735837, "learning_rate": 4.320089535534415e-05, "loss": 0.695, "step": 442 }, { "epoch": 0.6686792452830189, "grad_norm": 0.5377369847550588, "learning_rate": 4.31729155008394e-05, "loss": 0.756, "step": 443 }, { "epoch": 0.670188679245283, "grad_norm": 0.4983199069825207, "learning_rate": 4.314493564633464e-05, "loss": 0.7807, "step": 444 }, { "epoch": 0.6716981132075471, "grad_norm": 0.44158119758359315, "learning_rate": 4.311695579182988e-05, "loss": 0.7754, "step": 445 }, { "epoch": 0.6732075471698113, "grad_norm": 0.5596653554139499, "learning_rate": 4.3088975937325127e-05, "loss": 0.8195, "step": 446 }, { "epoch": 0.6747169811320755, "grad_norm": 0.43436077791659994, "learning_rate": 4.306099608282037e-05, "loss": 0.7604, "step": 447 }, { "epoch": 0.6762264150943397, "grad_norm": 0.5030810215462936, "learning_rate": 4.303301622831561e-05, "loss": 0.6785, "step": 448 }, { "epoch": 0.6777358490566038, "grad_norm": 0.5482694189131312, "learning_rate": 4.300503637381086e-05, "loss": 0.7199, "step": 449 }, { "epoch": 0.6792452830188679, "grad_norm": 0.5967150137777674, "learning_rate": 4.29770565193061e-05, "loss": 0.7983, "step": 450 }, { "epoch": 0.6807547169811321, "grad_norm": 0.5940918534861581, "learning_rate": 4.294907666480134e-05, "loss": 0.7591, "step": 451 }, { "epoch": 0.6822641509433962, "grad_norm": 0.5427565481245458, "learning_rate": 4.2921096810296593e-05, "loss": 0.8326, "step": 452 }, { "epoch": 0.6837735849056604, "grad_norm": 2.638916003074341, "learning_rate": 4.289311695579184e-05, "loss": 0.7776, "step": 453 }, { "epoch": 0.6852830188679245, "grad_norm": 0.8031629641842678, "learning_rate": 4.2865137101287073e-05, "loss": 0.7061, "step": 454 }, { "epoch": 0.6867924528301886, "grad_norm": 0.4705714421622477, "learning_rate": 4.283715724678232e-05, "loss": 0.8115, "step": 455 }, { "epoch": 0.6883018867924529, "grad_norm": 0.694102381672302, "learning_rate": 4.280917739227756e-05, "loss": 0.7455, "step": 456 }, { "epoch": 0.689811320754717, "grad_norm": 0.7026076420751877, "learning_rate": 4.2781197537772804e-05, "loss": 0.7419, "step": 457 }, { "epoch": 0.6913207547169812, "grad_norm": 0.5435116018268908, "learning_rate": 4.275321768326805e-05, "loss": 0.7141, "step": 458 }, { "epoch": 0.6928301886792453, "grad_norm": 0.7695258633555748, "learning_rate": 4.272523782876329e-05, "loss": 0.7067, "step": 459 }, { "epoch": 0.6943396226415094, "grad_norm": 1.7975424022373265, "learning_rate": 4.2697257974258534e-05, "loss": 0.8697, "step": 460 }, { "epoch": 0.6958490566037736, "grad_norm": 0.6317051374255238, "learning_rate": 4.2669278119753784e-05, "loss": 0.7739, "step": 461 }, { "epoch": 0.6973584905660377, "grad_norm": 0.6351687672902928, "learning_rate": 4.264129826524903e-05, "loss": 0.6983, "step": 462 }, { "epoch": 0.6988679245283019, "grad_norm": 0.6244759183757971, "learning_rate": 4.261331841074427e-05, "loss": 0.7031, "step": 463 }, { "epoch": 0.700377358490566, "grad_norm": 0.5514796773294356, "learning_rate": 4.258533855623951e-05, "loss": 0.6884, "step": 464 }, { "epoch": 0.7018867924528301, "grad_norm": 0.4726505169399417, "learning_rate": 4.255735870173475e-05, "loss": 0.7382, "step": 465 }, { "epoch": 0.7033962264150944, "grad_norm": 0.6739904189805587, "learning_rate": 4.2529378847229994e-05, "loss": 0.8122, "step": 466 }, { "epoch": 0.7049056603773585, "grad_norm": 0.4821838176068059, "learning_rate": 4.250139899272524e-05, "loss": 0.8558, "step": 467 }, { "epoch": 0.7064150943396227, "grad_norm": 0.44367485643876686, "learning_rate": 4.247341913822048e-05, "loss": 0.7216, "step": 468 }, { "epoch": 0.7079245283018868, "grad_norm": 0.4529362658881182, "learning_rate": 4.2445439283715724e-05, "loss": 0.6532, "step": 469 }, { "epoch": 0.7094339622641509, "grad_norm": 0.3961737139001265, "learning_rate": 4.241745942921097e-05, "loss": 0.7866, "step": 470 }, { "epoch": 0.7109433962264151, "grad_norm": 0.5258997289167945, "learning_rate": 4.238947957470622e-05, "loss": 0.8513, "step": 471 }, { "epoch": 0.7124528301886792, "grad_norm": 0.40132597205607645, "learning_rate": 4.236149972020146e-05, "loss": 0.8006, "step": 472 }, { "epoch": 0.7139622641509434, "grad_norm": 0.4178549903280029, "learning_rate": 4.2333519865696704e-05, "loss": 0.7349, "step": 473 }, { "epoch": 0.7154716981132075, "grad_norm": 0.6137285870140219, "learning_rate": 4.230554001119195e-05, "loss": 0.8465, "step": 474 }, { "epoch": 0.7169811320754716, "grad_norm": 0.549511124878987, "learning_rate": 4.2277560156687184e-05, "loss": 0.6763, "step": 475 }, { "epoch": 0.7184905660377359, "grad_norm": 0.46406879130022355, "learning_rate": 4.224958030218243e-05, "loss": 0.7497, "step": 476 }, { "epoch": 0.72, "grad_norm": 0.4888047862828803, "learning_rate": 4.222160044767767e-05, "loss": 0.7669, "step": 477 }, { "epoch": 0.7215094339622642, "grad_norm": 0.49711406759007537, "learning_rate": 4.2193620593172914e-05, "loss": 0.8006, "step": 478 }, { "epoch": 0.7230188679245283, "grad_norm": 0.4176159200268463, "learning_rate": 4.216564073866816e-05, "loss": 0.7379, "step": 479 }, { "epoch": 0.7245283018867924, "grad_norm": 0.5327893586007508, "learning_rate": 4.213766088416341e-05, "loss": 0.8138, "step": 480 }, { "epoch": 0.7260377358490566, "grad_norm": 0.9142602019147515, "learning_rate": 4.210968102965865e-05, "loss": 0.7227, "step": 481 }, { "epoch": 0.7275471698113207, "grad_norm": 0.8778784458357308, "learning_rate": 4.2081701175153894e-05, "loss": 0.7579, "step": 482 }, { "epoch": 0.7290566037735849, "grad_norm": 0.6964388295748577, "learning_rate": 4.205372132064914e-05, "loss": 0.712, "step": 483 }, { "epoch": 0.730566037735849, "grad_norm": 0.43020746781080765, "learning_rate": 4.202574146614438e-05, "loss": 0.7171, "step": 484 }, { "epoch": 0.7320754716981132, "grad_norm": 0.5749225465192066, "learning_rate": 4.199776161163962e-05, "loss": 0.7308, "step": 485 }, { "epoch": 0.7335849056603774, "grad_norm": 0.6045836791728121, "learning_rate": 4.196978175713486e-05, "loss": 0.7837, "step": 486 }, { "epoch": 0.7350943396226415, "grad_norm": 0.853172024259124, "learning_rate": 4.1941801902630104e-05, "loss": 0.7823, "step": 487 }, { "epoch": 0.7366037735849057, "grad_norm": 0.8328941937503014, "learning_rate": 4.191382204812535e-05, "loss": 0.7805, "step": 488 }, { "epoch": 0.7381132075471698, "grad_norm": 0.5856813273142618, "learning_rate": 4.18858421936206e-05, "loss": 0.7613, "step": 489 }, { "epoch": 0.7396226415094339, "grad_norm": 0.7017696574876483, "learning_rate": 4.185786233911584e-05, "loss": 0.7977, "step": 490 }, { "epoch": 0.7411320754716981, "grad_norm": 0.5883283825983562, "learning_rate": 4.1829882484611085e-05, "loss": 0.6595, "step": 491 }, { "epoch": 0.7426415094339622, "grad_norm": 0.9189100296218359, "learning_rate": 4.180190263010633e-05, "loss": 0.8225, "step": 492 }, { "epoch": 0.7441509433962264, "grad_norm": 0.4621273821982856, "learning_rate": 4.177392277560157e-05, "loss": 0.7267, "step": 493 }, { "epoch": 0.7456603773584906, "grad_norm": 0.7263731461661145, "learning_rate": 4.1745942921096815e-05, "loss": 0.8351, "step": 494 }, { "epoch": 0.7471698113207547, "grad_norm": 0.4239681400253427, "learning_rate": 4.171796306659206e-05, "loss": 0.8602, "step": 495 }, { "epoch": 0.7486792452830189, "grad_norm": 0.5587028545439432, "learning_rate": 4.1689983212087295e-05, "loss": 0.7781, "step": 496 }, { "epoch": 0.750188679245283, "grad_norm": 0.43677162709699907, "learning_rate": 4.166200335758254e-05, "loss": 0.7885, "step": 497 }, { "epoch": 0.7516981132075472, "grad_norm": 0.5956328426453763, "learning_rate": 4.163402350307778e-05, "loss": 0.7078, "step": 498 }, { "epoch": 0.7532075471698113, "grad_norm": 0.44802871978378656, "learning_rate": 4.160604364857303e-05, "loss": 0.7486, "step": 499 }, { "epoch": 0.7547169811320755, "grad_norm": 0.40451743130792983, "learning_rate": 4.1578063794068275e-05, "loss": 0.7091, "step": 500 }, { "epoch": 0.7562264150943396, "grad_norm": 0.4482261465687673, "learning_rate": 4.155008393956352e-05, "loss": 0.708, "step": 501 }, { "epoch": 0.7577358490566037, "grad_norm": 4.209187428133594, "learning_rate": 4.152210408505876e-05, "loss": 0.7711, "step": 502 }, { "epoch": 0.759245283018868, "grad_norm": 0.6084246843903299, "learning_rate": 4.1494124230554005e-05, "loss": 0.7152, "step": 503 }, { "epoch": 0.7607547169811321, "grad_norm": 0.5301187311477114, "learning_rate": 4.146614437604925e-05, "loss": 0.822, "step": 504 }, { "epoch": 0.7622641509433963, "grad_norm": 0.5775612395962103, "learning_rate": 4.143816452154449e-05, "loss": 0.773, "step": 505 }, { "epoch": 0.7637735849056604, "grad_norm": 0.5169776427958961, "learning_rate": 4.141018466703973e-05, "loss": 0.7756, "step": 506 }, { "epoch": 0.7652830188679245, "grad_norm": 0.6175580792907378, "learning_rate": 4.138220481253497e-05, "loss": 0.7714, "step": 507 }, { "epoch": 0.7667924528301887, "grad_norm": 0.5202456866228283, "learning_rate": 4.135422495803022e-05, "loss": 0.7569, "step": 508 }, { "epoch": 0.7683018867924528, "grad_norm": 0.43230254405968893, "learning_rate": 4.1326245103525465e-05, "loss": 0.7701, "step": 509 }, { "epoch": 0.769811320754717, "grad_norm": 0.6199856475998696, "learning_rate": 4.129826524902071e-05, "loss": 0.7365, "step": 510 }, { "epoch": 0.7713207547169811, "grad_norm": 0.4590557643196903, "learning_rate": 4.127028539451595e-05, "loss": 0.7349, "step": 511 }, { "epoch": 0.7728301886792452, "grad_norm": 0.5947228546936734, "learning_rate": 4.1242305540011195e-05, "loss": 0.7292, "step": 512 }, { "epoch": 0.7743396226415095, "grad_norm": 0.503790040694598, "learning_rate": 4.121432568550644e-05, "loss": 0.7439, "step": 513 }, { "epoch": 0.7758490566037736, "grad_norm": 0.6227639804784848, "learning_rate": 4.118634583100168e-05, "loss": 0.7778, "step": 514 }, { "epoch": 0.7773584905660378, "grad_norm": 0.7695881075574827, "learning_rate": 4.1158365976496925e-05, "loss": 0.7474, "step": 515 }, { "epoch": 0.7788679245283019, "grad_norm": 0.4832882825699558, "learning_rate": 4.113038612199216e-05, "loss": 0.7468, "step": 516 }, { "epoch": 0.780377358490566, "grad_norm": 0.5553008775060718, "learning_rate": 4.110240626748741e-05, "loss": 0.6952, "step": 517 }, { "epoch": 0.7818867924528302, "grad_norm": 0.5887331512376914, "learning_rate": 4.1074426412982655e-05, "loss": 0.753, "step": 518 }, { "epoch": 0.7833962264150943, "grad_norm": 0.44943535047602, "learning_rate": 4.10464465584779e-05, "loss": 0.7084, "step": 519 }, { "epoch": 0.7849056603773585, "grad_norm": 0.6695367567635757, "learning_rate": 4.101846670397314e-05, "loss": 0.8359, "step": 520 }, { "epoch": 0.7864150943396226, "grad_norm": 0.4641289136731032, "learning_rate": 4.0990486849468386e-05, "loss": 0.727, "step": 521 }, { "epoch": 0.7879245283018868, "grad_norm": 0.7406198663798486, "learning_rate": 4.096250699496363e-05, "loss": 0.7915, "step": 522 }, { "epoch": 0.789433962264151, "grad_norm": 0.44176398853870474, "learning_rate": 4.093452714045887e-05, "loss": 0.6947, "step": 523 }, { "epoch": 0.7909433962264151, "grad_norm": 0.48042954012719924, "learning_rate": 4.0906547285954116e-05, "loss": 0.7552, "step": 524 }, { "epoch": 0.7924528301886793, "grad_norm": 0.5842724845306636, "learning_rate": 4.087856743144936e-05, "loss": 0.8175, "step": 525 }, { "epoch": 0.7939622641509434, "grad_norm": 0.4284726371980202, "learning_rate": 4.08505875769446e-05, "loss": 0.7494, "step": 526 }, { "epoch": 0.7954716981132075, "grad_norm": 0.5254149807506523, "learning_rate": 4.0822607722439846e-05, "loss": 0.7549, "step": 527 }, { "epoch": 0.7969811320754717, "grad_norm": 0.48365346339231174, "learning_rate": 4.079462786793509e-05, "loss": 0.6465, "step": 528 }, { "epoch": 0.7984905660377358, "grad_norm": 0.5361901921704368, "learning_rate": 4.076664801343033e-05, "loss": 0.7893, "step": 529 }, { "epoch": 0.8, "grad_norm": 0.4450391820564859, "learning_rate": 4.0738668158925576e-05, "loss": 0.7596, "step": 530 }, { "epoch": 0.8015094339622642, "grad_norm": 0.4238114156508658, "learning_rate": 4.071068830442082e-05, "loss": 0.7685, "step": 531 }, { "epoch": 0.8030188679245283, "grad_norm": 0.48625846049027577, "learning_rate": 4.068270844991606e-05, "loss": 0.6887, "step": 532 }, { "epoch": 0.8045283018867925, "grad_norm": 0.4417344646967358, "learning_rate": 4.0654728595411306e-05, "loss": 0.8144, "step": 533 }, { "epoch": 0.8060377358490566, "grad_norm": 0.38616318769298813, "learning_rate": 4.062674874090655e-05, "loss": 0.7711, "step": 534 }, { "epoch": 0.8075471698113208, "grad_norm": 0.4732687688025921, "learning_rate": 4.059876888640179e-05, "loss": 0.7262, "step": 535 }, { "epoch": 0.8090566037735849, "grad_norm": 0.3662632839035369, "learning_rate": 4.0570789031897036e-05, "loss": 0.7762, "step": 536 }, { "epoch": 0.810566037735849, "grad_norm": 0.41660011598030444, "learning_rate": 4.054280917739228e-05, "loss": 0.8063, "step": 537 }, { "epoch": 0.8120754716981132, "grad_norm": 0.5061770396979196, "learning_rate": 4.051482932288752e-05, "loss": 0.8055, "step": 538 }, { "epoch": 0.8135849056603773, "grad_norm": 0.4321594676145302, "learning_rate": 4.0486849468382766e-05, "loss": 0.6895, "step": 539 }, { "epoch": 0.8150943396226416, "grad_norm": 0.4154273903737854, "learning_rate": 4.045886961387801e-05, "loss": 0.8246, "step": 540 }, { "epoch": 0.8166037735849057, "grad_norm": 0.44089593064248783, "learning_rate": 4.043088975937325e-05, "loss": 0.6919, "step": 541 }, { "epoch": 0.8181132075471698, "grad_norm": 0.3931592707406957, "learning_rate": 4.0402909904868496e-05, "loss": 0.7232, "step": 542 }, { "epoch": 0.819622641509434, "grad_norm": 0.3824488662725976, "learning_rate": 4.037493005036374e-05, "loss": 0.678, "step": 543 }, { "epoch": 0.8211320754716981, "grad_norm": 0.47459855214234453, "learning_rate": 4.034695019585898e-05, "loss": 0.6916, "step": 544 }, { "epoch": 0.8226415094339623, "grad_norm": 0.4092162364813964, "learning_rate": 4.0318970341354226e-05, "loss": 0.7284, "step": 545 }, { "epoch": 0.8241509433962264, "grad_norm": 0.40158642010989914, "learning_rate": 4.029099048684947e-05, "loss": 0.7858, "step": 546 }, { "epoch": 0.8256603773584905, "grad_norm": 0.47547007845195716, "learning_rate": 4.026301063234472e-05, "loss": 0.7484, "step": 547 }, { "epoch": 0.8271698113207547, "grad_norm": 0.41353726994458534, "learning_rate": 4.0235030777839956e-05, "loss": 0.6895, "step": 548 }, { "epoch": 0.8286792452830188, "grad_norm": 0.4261847130879816, "learning_rate": 4.02070509233352e-05, "loss": 0.7615, "step": 549 }, { "epoch": 0.8301886792452831, "grad_norm": 0.3713485609487883, "learning_rate": 4.017907106883044e-05, "loss": 0.702, "step": 550 }, { "epoch": 0.8316981132075472, "grad_norm": 0.4940211687654344, "learning_rate": 4.0151091214325686e-05, "loss": 0.7926, "step": 551 }, { "epoch": 0.8332075471698113, "grad_norm": 0.4418268573162059, "learning_rate": 4.012311135982093e-05, "loss": 0.7477, "step": 552 }, { "epoch": 0.8347169811320755, "grad_norm": 0.4296889446715816, "learning_rate": 4.009513150531617e-05, "loss": 0.7066, "step": 553 }, { "epoch": 0.8362264150943396, "grad_norm": 0.43481881270259315, "learning_rate": 4.0067151650811416e-05, "loss": 0.7104, "step": 554 }, { "epoch": 0.8377358490566038, "grad_norm": 0.444949300217321, "learning_rate": 4.003917179630666e-05, "loss": 0.8165, "step": 555 }, { "epoch": 0.8392452830188679, "grad_norm": 0.4167038227484225, "learning_rate": 4.00111919418019e-05, "loss": 0.7655, "step": 556 }, { "epoch": 0.840754716981132, "grad_norm": 0.46442021729984984, "learning_rate": 3.998321208729715e-05, "loss": 0.8047, "step": 557 }, { "epoch": 0.8422641509433962, "grad_norm": 0.452338392615249, "learning_rate": 3.995523223279239e-05, "loss": 0.6587, "step": 558 }, { "epoch": 0.8437735849056603, "grad_norm": 0.41153779842520494, "learning_rate": 3.992725237828763e-05, "loss": 0.7026, "step": 559 }, { "epoch": 0.8452830188679246, "grad_norm": 2.6556838900087136, "learning_rate": 3.989927252378288e-05, "loss": 0.7276, "step": 560 }, { "epoch": 0.8467924528301887, "grad_norm": 0.4051057742589149, "learning_rate": 3.987129266927812e-05, "loss": 0.7598, "step": 561 }, { "epoch": 0.8483018867924528, "grad_norm": 0.4783860215225303, "learning_rate": 3.9843312814773363e-05, "loss": 0.7517, "step": 562 }, { "epoch": 0.849811320754717, "grad_norm": 0.379963266361194, "learning_rate": 3.981533296026861e-05, "loss": 0.7294, "step": 563 }, { "epoch": 0.8513207547169811, "grad_norm": 0.3995738900162358, "learning_rate": 3.978735310576385e-05, "loss": 0.7339, "step": 564 }, { "epoch": 0.8528301886792453, "grad_norm": 0.4429737030646909, "learning_rate": 3.9759373251259093e-05, "loss": 0.7687, "step": 565 }, { "epoch": 0.8543396226415094, "grad_norm": 0.35025220758613634, "learning_rate": 3.9731393396754344e-05, "loss": 0.7217, "step": 566 }, { "epoch": 0.8558490566037736, "grad_norm": 0.5313539267701104, "learning_rate": 3.970341354224959e-05, "loss": 0.6841, "step": 567 }, { "epoch": 0.8573584905660377, "grad_norm": 0.44159011121842684, "learning_rate": 3.9675433687744824e-05, "loss": 0.7777, "step": 568 }, { "epoch": 0.8588679245283019, "grad_norm": 0.5032221283954244, "learning_rate": 3.964745383324007e-05, "loss": 0.7903, "step": 569 }, { "epoch": 0.8603773584905661, "grad_norm": 0.5209160538052351, "learning_rate": 3.961947397873531e-05, "loss": 0.7832, "step": 570 }, { "epoch": 0.8618867924528302, "grad_norm": 0.4117034313241634, "learning_rate": 3.9591494124230554e-05, "loss": 0.7483, "step": 571 }, { "epoch": 0.8633962264150944, "grad_norm": 0.6056396156171621, "learning_rate": 3.95635142697258e-05, "loss": 0.7845, "step": 572 }, { "epoch": 0.8649056603773585, "grad_norm": 0.4032653916231986, "learning_rate": 3.953553441522104e-05, "loss": 0.7263, "step": 573 }, { "epoch": 0.8664150943396226, "grad_norm": 0.45120438719371664, "learning_rate": 3.9507554560716284e-05, "loss": 0.6961, "step": 574 }, { "epoch": 0.8679245283018868, "grad_norm": 0.4944595803156947, "learning_rate": 3.9479574706211534e-05, "loss": 0.7188, "step": 575 }, { "epoch": 0.8694339622641509, "grad_norm": 0.46427671257440184, "learning_rate": 3.945159485170678e-05, "loss": 0.798, "step": 576 }, { "epoch": 0.8709433962264151, "grad_norm": 0.6434219625626204, "learning_rate": 3.942361499720202e-05, "loss": 0.7815, "step": 577 }, { "epoch": 0.8724528301886793, "grad_norm": 0.4620657198048205, "learning_rate": 3.9395635142697264e-05, "loss": 0.7327, "step": 578 }, { "epoch": 0.8739622641509434, "grad_norm": 0.40262252638288304, "learning_rate": 3.93676552881925e-05, "loss": 0.7607, "step": 579 }, { "epoch": 0.8754716981132076, "grad_norm": 0.45983570698328446, "learning_rate": 3.9339675433687744e-05, "loss": 0.7012, "step": 580 }, { "epoch": 0.8769811320754717, "grad_norm": 0.40363702713690075, "learning_rate": 3.931169557918299e-05, "loss": 0.7372, "step": 581 }, { "epoch": 0.8784905660377359, "grad_norm": 0.3907346317291026, "learning_rate": 3.928371572467823e-05, "loss": 0.7761, "step": 582 }, { "epoch": 0.88, "grad_norm": 0.3990895685405329, "learning_rate": 3.9255735870173474e-05, "loss": 0.726, "step": 583 }, { "epoch": 0.8815094339622641, "grad_norm": 0.3729973185332231, "learning_rate": 3.922775601566872e-05, "loss": 0.7677, "step": 584 }, { "epoch": 0.8830188679245283, "grad_norm": 0.4089477576652253, "learning_rate": 3.919977616116397e-05, "loss": 0.7048, "step": 585 }, { "epoch": 0.8845283018867924, "grad_norm": 0.3945532227748924, "learning_rate": 3.917179630665921e-05, "loss": 0.7714, "step": 586 }, { "epoch": 0.8860377358490567, "grad_norm": 0.5140900259958995, "learning_rate": 3.9143816452154454e-05, "loss": 0.7382, "step": 587 }, { "epoch": 0.8875471698113208, "grad_norm": 0.38891901989138217, "learning_rate": 3.91158365976497e-05, "loss": 0.8208, "step": 588 }, { "epoch": 0.8890566037735849, "grad_norm": 0.49569695718152956, "learning_rate": 3.9087856743144934e-05, "loss": 0.7436, "step": 589 }, { "epoch": 0.8905660377358491, "grad_norm": 0.43285829098870593, "learning_rate": 3.905987688864018e-05, "loss": 0.7365, "step": 590 }, { "epoch": 0.8920754716981132, "grad_norm": 0.36885687866534356, "learning_rate": 3.903189703413542e-05, "loss": 0.7022, "step": 591 }, { "epoch": 0.8935849056603774, "grad_norm": 0.43241043983514127, "learning_rate": 3.9003917179630664e-05, "loss": 0.8662, "step": 592 }, { "epoch": 0.8950943396226415, "grad_norm": 0.3649761816139934, "learning_rate": 3.897593732512591e-05, "loss": 0.7368, "step": 593 }, { "epoch": 0.8966037735849056, "grad_norm": 0.6268302532119931, "learning_rate": 3.894795747062116e-05, "loss": 0.7893, "step": 594 }, { "epoch": 0.8981132075471698, "grad_norm": 0.3667724539461183, "learning_rate": 3.89199776161164e-05, "loss": 0.7488, "step": 595 }, { "epoch": 0.8996226415094339, "grad_norm": 0.48371976093317937, "learning_rate": 3.8891997761611645e-05, "loss": 0.7535, "step": 596 }, { "epoch": 0.9011320754716982, "grad_norm": 0.4239299088649613, "learning_rate": 3.886401790710689e-05, "loss": 0.7663, "step": 597 }, { "epoch": 0.9026415094339623, "grad_norm": 0.3779199263406811, "learning_rate": 3.883603805260213e-05, "loss": 0.7457, "step": 598 }, { "epoch": 0.9041509433962264, "grad_norm": 0.9168684585034359, "learning_rate": 3.880805819809737e-05, "loss": 0.7519, "step": 599 }, { "epoch": 0.9056603773584906, "grad_norm": 0.3661469557119822, "learning_rate": 3.878007834359261e-05, "loss": 0.7733, "step": 600 }, { "epoch": 0.9071698113207547, "grad_norm": 3.9823358965275806, "learning_rate": 3.8752098489087855e-05, "loss": 0.9001, "step": 601 }, { "epoch": 0.9086792452830189, "grad_norm": 0.7852186633819904, "learning_rate": 3.87241186345831e-05, "loss": 0.7157, "step": 602 }, { "epoch": 0.910188679245283, "grad_norm": 0.5408079896292302, "learning_rate": 3.869613878007835e-05, "loss": 0.7022, "step": 603 }, { "epoch": 0.9116981132075471, "grad_norm": 0.7834797565859986, "learning_rate": 3.866815892557359e-05, "loss": 0.6932, "step": 604 }, { "epoch": 0.9132075471698113, "grad_norm": 0.5709707609818889, "learning_rate": 3.8640179071068835e-05, "loss": 0.7641, "step": 605 }, { "epoch": 0.9147169811320754, "grad_norm": 0.5337869168215333, "learning_rate": 3.861219921656408e-05, "loss": 0.7806, "step": 606 }, { "epoch": 0.9162264150943397, "grad_norm": 0.45607562967499066, "learning_rate": 3.858421936205932e-05, "loss": 0.6848, "step": 607 }, { "epoch": 0.9177358490566038, "grad_norm": 0.5120296828216754, "learning_rate": 3.8556239507554565e-05, "loss": 0.7603, "step": 608 }, { "epoch": 0.9192452830188679, "grad_norm": 0.4344514448742209, "learning_rate": 3.852825965304981e-05, "loss": 0.6694, "step": 609 }, { "epoch": 0.9207547169811321, "grad_norm": 0.5404883518478442, "learning_rate": 3.8500279798545045e-05, "loss": 0.7228, "step": 610 }, { "epoch": 0.9222641509433962, "grad_norm": 0.4970271679168196, "learning_rate": 3.847229994404029e-05, "loss": 0.7797, "step": 611 }, { "epoch": 0.9237735849056604, "grad_norm": 0.49505737921449006, "learning_rate": 3.844432008953553e-05, "loss": 0.7037, "step": 612 }, { "epoch": 0.9252830188679245, "grad_norm": 0.7739566407768966, "learning_rate": 3.841634023503078e-05, "loss": 0.7542, "step": 613 }, { "epoch": 0.9267924528301886, "grad_norm": 0.4238126871917205, "learning_rate": 3.8388360380526025e-05, "loss": 0.7202, "step": 614 }, { "epoch": 0.9283018867924528, "grad_norm": 0.6151008080921576, "learning_rate": 3.836038052602127e-05, "loss": 0.7636, "step": 615 }, { "epoch": 0.929811320754717, "grad_norm": 0.42881880105092096, "learning_rate": 3.833240067151651e-05, "loss": 0.7613, "step": 616 }, { "epoch": 0.9313207547169812, "grad_norm": 0.5146237853082963, "learning_rate": 3.8304420817011755e-05, "loss": 0.715, "step": 617 }, { "epoch": 0.9328301886792453, "grad_norm": 0.3714673908627866, "learning_rate": 3.8276440962507e-05, "loss": 0.745, "step": 618 }, { "epoch": 0.9343396226415094, "grad_norm": 0.4650679347629378, "learning_rate": 3.824846110800224e-05, "loss": 0.7422, "step": 619 }, { "epoch": 0.9358490566037736, "grad_norm": 1.0749664641896852, "learning_rate": 3.822048125349748e-05, "loss": 0.7985, "step": 620 }, { "epoch": 0.9373584905660377, "grad_norm": 0.40863953645268636, "learning_rate": 3.819250139899272e-05, "loss": 0.7385, "step": 621 }, { "epoch": 0.9388679245283019, "grad_norm": 0.40252351812274834, "learning_rate": 3.816452154448797e-05, "loss": 0.7643, "step": 622 }, { "epoch": 0.940377358490566, "grad_norm": 0.43559572158086535, "learning_rate": 3.8136541689983215e-05, "loss": 0.7439, "step": 623 }, { "epoch": 0.9418867924528301, "grad_norm": 0.9641369155300525, "learning_rate": 3.810856183547846e-05, "loss": 0.7718, "step": 624 }, { "epoch": 0.9433962264150944, "grad_norm": 0.4310013200104187, "learning_rate": 3.80805819809737e-05, "loss": 0.7925, "step": 625 }, { "epoch": 0.9449056603773585, "grad_norm": 0.4194039790316717, "learning_rate": 3.8052602126468945e-05, "loss": 0.7287, "step": 626 }, { "epoch": 0.9464150943396227, "grad_norm": 0.4665354497735913, "learning_rate": 3.802462227196419e-05, "loss": 0.6991, "step": 627 }, { "epoch": 0.9479245283018868, "grad_norm": 0.45730976704052134, "learning_rate": 3.799664241745943e-05, "loss": 0.7642, "step": 628 }, { "epoch": 0.9494339622641509, "grad_norm": 0.3856324295687261, "learning_rate": 3.7968662562954675e-05, "loss": 0.6671, "step": 629 }, { "epoch": 0.9509433962264151, "grad_norm": 0.4107928434108811, "learning_rate": 3.794068270844992e-05, "loss": 0.7561, "step": 630 }, { "epoch": 0.9524528301886792, "grad_norm": 0.44719174208858464, "learning_rate": 3.791270285394516e-05, "loss": 0.8235, "step": 631 }, { "epoch": 0.9539622641509434, "grad_norm": 0.4008367508350329, "learning_rate": 3.7884722999440406e-05, "loss": 0.7292, "step": 632 }, { "epoch": 0.9554716981132075, "grad_norm": 0.4391202885533926, "learning_rate": 3.785674314493565e-05, "loss": 0.8191, "step": 633 }, { "epoch": 0.9569811320754718, "grad_norm": 0.42891554570166274, "learning_rate": 3.782876329043089e-05, "loss": 0.8442, "step": 634 }, { "epoch": 0.9584905660377359, "grad_norm": 0.43000876777383873, "learning_rate": 3.7800783435926136e-05, "loss": 0.7543, "step": 635 }, { "epoch": 0.96, "grad_norm": 0.4155832959292569, "learning_rate": 3.777280358142138e-05, "loss": 0.729, "step": 636 }, { "epoch": 0.9615094339622642, "grad_norm": 1.3022894310168054, "learning_rate": 3.774482372691662e-05, "loss": 0.7624, "step": 637 }, { "epoch": 0.9630188679245283, "grad_norm": 0.39110065047725334, "learning_rate": 3.7716843872411866e-05, "loss": 0.7788, "step": 638 }, { "epoch": 0.9645283018867925, "grad_norm": 0.4420204425883805, "learning_rate": 3.768886401790711e-05, "loss": 0.8003, "step": 639 }, { "epoch": 0.9660377358490566, "grad_norm": 0.47846340432718537, "learning_rate": 3.766088416340235e-05, "loss": 0.661, "step": 640 }, { "epoch": 0.9675471698113207, "grad_norm": 0.45386680890224157, "learning_rate": 3.7632904308897596e-05, "loss": 0.7336, "step": 641 }, { "epoch": 0.9690566037735849, "grad_norm": 0.4666290433954947, "learning_rate": 3.760492445439284e-05, "loss": 0.8496, "step": 642 }, { "epoch": 0.970566037735849, "grad_norm": 0.5022567954654142, "learning_rate": 3.757694459988808e-05, "loss": 0.7529, "step": 643 }, { "epoch": 0.9720754716981133, "grad_norm": 0.49014277465359335, "learning_rate": 3.7548964745383326e-05, "loss": 0.7722, "step": 644 }, { "epoch": 0.9735849056603774, "grad_norm": 0.5190484716624026, "learning_rate": 3.752098489087857e-05, "loss": 0.723, "step": 645 }, { "epoch": 0.9750943396226415, "grad_norm": 0.41311399967316736, "learning_rate": 3.749300503637381e-05, "loss": 0.7089, "step": 646 }, { "epoch": 0.9766037735849057, "grad_norm": 0.4396273185284637, "learning_rate": 3.7465025181869056e-05, "loss": 0.7921, "step": 647 }, { "epoch": 0.9781132075471698, "grad_norm": 0.4228832100437926, "learning_rate": 3.74370453273643e-05, "loss": 0.7376, "step": 648 }, { "epoch": 0.979622641509434, "grad_norm": 0.42404331809881607, "learning_rate": 3.740906547285954e-05, "loss": 0.8042, "step": 649 }, { "epoch": 0.9811320754716981, "grad_norm": 0.43384836343190214, "learning_rate": 3.7381085618354786e-05, "loss": 0.7457, "step": 650 }, { "epoch": 0.9826415094339622, "grad_norm": 0.40991552369681833, "learning_rate": 3.735310576385003e-05, "loss": 0.7271, "step": 651 }, { "epoch": 0.9841509433962264, "grad_norm": 0.42720312031263036, "learning_rate": 3.732512590934527e-05, "loss": 0.6783, "step": 652 }, { "epoch": 0.9856603773584905, "grad_norm": 0.3820997114893428, "learning_rate": 3.7297146054840516e-05, "loss": 0.741, "step": 653 }, { "epoch": 0.9871698113207548, "grad_norm": 0.3859225460582329, "learning_rate": 3.726916620033576e-05, "loss": 0.7616, "step": 654 }, { "epoch": 0.9886792452830189, "grad_norm": 0.3184257704684841, "learning_rate": 3.7241186345831e-05, "loss": 0.7018, "step": 655 }, { "epoch": 0.990188679245283, "grad_norm": 5.0525018446915455, "learning_rate": 3.7213206491326246e-05, "loss": 0.6822, "step": 656 }, { "epoch": 0.9916981132075472, "grad_norm": 0.4774160070976838, "learning_rate": 3.718522663682149e-05, "loss": 0.759, "step": 657 }, { "epoch": 0.9932075471698113, "grad_norm": 0.38749071520309747, "learning_rate": 3.715724678231673e-05, "loss": 0.681, "step": 658 }, { "epoch": 0.9947169811320755, "grad_norm": 0.3565441160393527, "learning_rate": 3.7129266927811976e-05, "loss": 0.7235, "step": 659 }, { "epoch": 0.9962264150943396, "grad_norm": 0.3620064697768303, "learning_rate": 3.710128707330722e-05, "loss": 0.702, "step": 660 }, { "epoch": 0.9977358490566037, "grad_norm": 0.3997483511508158, "learning_rate": 3.707330721880247e-05, "loss": 0.7241, "step": 661 }, { "epoch": 0.999245283018868, "grad_norm": 0.9076134480414219, "learning_rate": 3.7045327364297706e-05, "loss": 0.7179, "step": 662 }, { "epoch": 1.0, "grad_norm": 0.9076134480414219, "learning_rate": 3.701734750979295e-05, "loss": 0.7348, "step": 663 }, { "epoch": 1.001509433962264, "grad_norm": 0.6627110764246336, "learning_rate": 3.698936765528819e-05, "loss": 0.6885, "step": 664 }, { "epoch": 1.0030188679245282, "grad_norm": 10.266344586969133, "learning_rate": 3.6961387800783437e-05, "loss": 1.0522, "step": 665 }, { "epoch": 1.0045283018867925, "grad_norm": 0.810534888309623, "learning_rate": 3.693340794627868e-05, "loss": 0.6195, "step": 666 }, { "epoch": 1.0060377358490566, "grad_norm": 1.7755771547569201, "learning_rate": 3.690542809177392e-05, "loss": 0.6254, "step": 667 }, { "epoch": 1.0075471698113208, "grad_norm": 0.6892452310322204, "learning_rate": 3.687744823726917e-05, "loss": 0.6086, "step": 668 }, { "epoch": 1.0090566037735849, "grad_norm": 0.5926694715314103, "learning_rate": 3.684946838276441e-05, "loss": 0.5926, "step": 669 }, { "epoch": 1.010566037735849, "grad_norm": 0.41200219649494063, "learning_rate": 3.682148852825965e-05, "loss": 0.6277, "step": 670 }, { "epoch": 1.0120754716981133, "grad_norm": 0.8158461895891851, "learning_rate": 3.6793508673754903e-05, "loss": 0.6335, "step": 671 }, { "epoch": 1.0135849056603774, "grad_norm": 0.601172077942669, "learning_rate": 3.676552881925014e-05, "loss": 0.6547, "step": 672 }, { "epoch": 1.0150943396226415, "grad_norm": 0.4134386101441369, "learning_rate": 3.6737548964745383e-05, "loss": 0.6927, "step": 673 }, { "epoch": 1.0166037735849056, "grad_norm": 0.5120306933334605, "learning_rate": 3.670956911024063e-05, "loss": 0.6646, "step": 674 }, { "epoch": 1.0181132075471697, "grad_norm": 0.5525313287248008, "learning_rate": 3.668158925573587e-05, "loss": 0.6283, "step": 675 }, { "epoch": 1.019622641509434, "grad_norm": 0.42590696095726077, "learning_rate": 3.6653609401231114e-05, "loss": 0.7062, "step": 676 }, { "epoch": 1.0211320754716982, "grad_norm": 0.724226525712548, "learning_rate": 3.662562954672636e-05, "loss": 0.6758, "step": 677 }, { "epoch": 1.0226415094339623, "grad_norm": 0.5674130150381019, "learning_rate": 3.65976496922216e-05, "loss": 0.6043, "step": 678 }, { "epoch": 1.0241509433962264, "grad_norm": 0.4079281128046953, "learning_rate": 3.6569669837716844e-05, "loss": 0.6014, "step": 679 }, { "epoch": 1.0256603773584905, "grad_norm": 0.46953990551509994, "learning_rate": 3.6541689983212094e-05, "loss": 0.7177, "step": 680 }, { "epoch": 1.0271698113207548, "grad_norm": 0.4545817525144365, "learning_rate": 3.651371012870734e-05, "loss": 0.6133, "step": 681 }, { "epoch": 1.028679245283019, "grad_norm": 0.38317634414943963, "learning_rate": 3.648573027420258e-05, "loss": 0.5863, "step": 682 }, { "epoch": 1.030188679245283, "grad_norm": 0.4097662107255479, "learning_rate": 3.645775041969782e-05, "loss": 0.6544, "step": 683 }, { "epoch": 1.0316981132075471, "grad_norm": 0.5667177083373588, "learning_rate": 3.642977056519306e-05, "loss": 0.6172, "step": 684 }, { "epoch": 1.0332075471698112, "grad_norm": 0.3606672570163167, "learning_rate": 3.6401790710688304e-05, "loss": 0.6386, "step": 685 }, { "epoch": 1.0347169811320756, "grad_norm": 0.4911306296070032, "learning_rate": 3.637381085618355e-05, "loss": 0.6652, "step": 686 }, { "epoch": 1.0362264150943397, "grad_norm": 2.4217324647576763, "learning_rate": 3.634583100167879e-05, "loss": 0.7402, "step": 687 }, { "epoch": 1.0377358490566038, "grad_norm": 0.4824187002755876, "learning_rate": 3.6317851147174034e-05, "loss": 0.711, "step": 688 }, { "epoch": 1.0392452830188679, "grad_norm": 0.49721674246607617, "learning_rate": 3.6289871292669284e-05, "loss": 0.5908, "step": 689 }, { "epoch": 1.040754716981132, "grad_norm": 0.3743482427307361, "learning_rate": 3.626189143816453e-05, "loss": 0.6319, "step": 690 }, { "epoch": 1.0422641509433963, "grad_norm": 0.43291507293459125, "learning_rate": 3.623391158365977e-05, "loss": 0.6705, "step": 691 }, { "epoch": 1.0437735849056604, "grad_norm": 0.5208705268592786, "learning_rate": 3.6205931729155014e-05, "loss": 0.6039, "step": 692 }, { "epoch": 1.0452830188679245, "grad_norm": 0.410450044767452, "learning_rate": 3.617795187465025e-05, "loss": 0.6525, "step": 693 }, { "epoch": 1.0467924528301886, "grad_norm": 0.4022445567989694, "learning_rate": 3.6149972020145494e-05, "loss": 0.6954, "step": 694 }, { "epoch": 1.0483018867924527, "grad_norm": 0.47132444327306144, "learning_rate": 3.612199216564074e-05, "loss": 0.6521, "step": 695 }, { "epoch": 1.049811320754717, "grad_norm": 0.3875966766440471, "learning_rate": 3.609401231113598e-05, "loss": 0.614, "step": 696 }, { "epoch": 1.0513207547169812, "grad_norm": 0.451663223959601, "learning_rate": 3.6066032456631224e-05, "loss": 0.6445, "step": 697 }, { "epoch": 1.0528301886792453, "grad_norm": 0.41172392506051253, "learning_rate": 3.603805260212647e-05, "loss": 0.6422, "step": 698 }, { "epoch": 1.0543396226415094, "grad_norm": 0.375540946924188, "learning_rate": 3.601007274762172e-05, "loss": 0.652, "step": 699 }, { "epoch": 1.0558490566037735, "grad_norm": 0.3717387172700223, "learning_rate": 3.598209289311696e-05, "loss": 0.6777, "step": 700 }, { "epoch": 1.0573584905660378, "grad_norm": 0.424782758304706, "learning_rate": 3.5954113038612204e-05, "loss": 0.6228, "step": 701 }, { "epoch": 1.058867924528302, "grad_norm": 0.3391925209354729, "learning_rate": 3.592613318410745e-05, "loss": 0.6024, "step": 702 }, { "epoch": 1.060377358490566, "grad_norm": 0.32914720950015897, "learning_rate": 3.5898153329602684e-05, "loss": 0.6457, "step": 703 }, { "epoch": 1.0618867924528301, "grad_norm": 1.0554743822729729, "learning_rate": 3.587017347509793e-05, "loss": 0.5799, "step": 704 }, { "epoch": 1.0633962264150942, "grad_norm": 0.39339053656170486, "learning_rate": 3.584219362059317e-05, "loss": 0.5598, "step": 705 }, { "epoch": 1.0649056603773586, "grad_norm": 0.40135937465077187, "learning_rate": 3.5814213766088414e-05, "loss": 0.6226, "step": 706 }, { "epoch": 1.0664150943396227, "grad_norm": 0.43288336102917757, "learning_rate": 3.578623391158366e-05, "loss": 0.6274, "step": 707 }, { "epoch": 1.0679245283018868, "grad_norm": 0.4521991854235038, "learning_rate": 3.575825405707891e-05, "loss": 0.6483, "step": 708 }, { "epoch": 1.0694339622641509, "grad_norm": 0.402604567263997, "learning_rate": 3.573027420257415e-05, "loss": 0.6309, "step": 709 }, { "epoch": 1.070943396226415, "grad_norm": 0.46402872137647033, "learning_rate": 3.5702294348069395e-05, "loss": 0.639, "step": 710 }, { "epoch": 1.0724528301886793, "grad_norm": 0.4017030194373752, "learning_rate": 3.567431449356464e-05, "loss": 0.6329, "step": 711 }, { "epoch": 1.0739622641509434, "grad_norm": 0.47618502583548106, "learning_rate": 3.564633463905988e-05, "loss": 0.7084, "step": 712 }, { "epoch": 1.0754716981132075, "grad_norm": 0.41264204371650326, "learning_rate": 3.5618354784555125e-05, "loss": 0.6365, "step": 713 }, { "epoch": 1.0769811320754716, "grad_norm": 0.38772697844532533, "learning_rate": 3.559037493005036e-05, "loss": 0.6308, "step": 714 }, { "epoch": 1.0784905660377357, "grad_norm": 0.46449961773150183, "learning_rate": 3.5562395075545605e-05, "loss": 0.6279, "step": 715 }, { "epoch": 1.08, "grad_norm": 0.45078141741988764, "learning_rate": 3.553441522104085e-05, "loss": 0.6256, "step": 716 }, { "epoch": 1.0815094339622642, "grad_norm": 0.5161091002718486, "learning_rate": 3.55064353665361e-05, "loss": 0.6283, "step": 717 }, { "epoch": 1.0830188679245283, "grad_norm": 0.3938545925062218, "learning_rate": 3.547845551203134e-05, "loss": 0.6029, "step": 718 }, { "epoch": 1.0845283018867924, "grad_norm": 0.4535260490235984, "learning_rate": 3.5450475657526585e-05, "loss": 0.6482, "step": 719 }, { "epoch": 1.0860377358490565, "grad_norm": 0.3970289231442372, "learning_rate": 3.542249580302183e-05, "loss": 0.5763, "step": 720 }, { "epoch": 1.0875471698113208, "grad_norm": 0.4039865278069702, "learning_rate": 3.539451594851707e-05, "loss": 0.6558, "step": 721 }, { "epoch": 1.089056603773585, "grad_norm": 0.4202122162673237, "learning_rate": 3.5366536094012315e-05, "loss": 0.6462, "step": 722 }, { "epoch": 1.090566037735849, "grad_norm": 0.38039927746822294, "learning_rate": 3.533855623950756e-05, "loss": 0.6544, "step": 723 }, { "epoch": 1.0920754716981131, "grad_norm": 0.40116562127860167, "learning_rate": 3.5310576385002795e-05, "loss": 0.6408, "step": 724 }, { "epoch": 1.0935849056603772, "grad_norm": 0.48128273610391287, "learning_rate": 3.528259653049804e-05, "loss": 0.5859, "step": 725 }, { "epoch": 1.0950943396226416, "grad_norm": 0.42443398500645513, "learning_rate": 3.525461667599328e-05, "loss": 0.6272, "step": 726 }, { "epoch": 1.0966037735849057, "grad_norm": 0.8589027269475118, "learning_rate": 3.522663682148853e-05, "loss": 0.6466, "step": 727 }, { "epoch": 1.0981132075471698, "grad_norm": 1.3107651423337432, "learning_rate": 3.5198656966983775e-05, "loss": 0.6948, "step": 728 }, { "epoch": 1.099622641509434, "grad_norm": 0.46154805702038726, "learning_rate": 3.517067711247902e-05, "loss": 0.647, "step": 729 }, { "epoch": 1.101132075471698, "grad_norm": 0.4215490457807108, "learning_rate": 3.514269725797426e-05, "loss": 0.639, "step": 730 }, { "epoch": 1.1026415094339623, "grad_norm": 0.44910831262776585, "learning_rate": 3.5114717403469505e-05, "loss": 0.6416, "step": 731 }, { "epoch": 1.1041509433962264, "grad_norm": 0.4180279321095757, "learning_rate": 3.508673754896475e-05, "loss": 0.687, "step": 732 }, { "epoch": 1.1056603773584905, "grad_norm": 0.3787657588954383, "learning_rate": 3.505875769445999e-05, "loss": 0.6842, "step": 733 }, { "epoch": 1.1071698113207546, "grad_norm": 0.4577454096053106, "learning_rate": 3.5030777839955235e-05, "loss": 0.6741, "step": 734 }, { "epoch": 1.1086792452830188, "grad_norm": 1.4461079332215183, "learning_rate": 3.500279798545047e-05, "loss": 0.6817, "step": 735 }, { "epoch": 1.110188679245283, "grad_norm": 0.4104193651538478, "learning_rate": 3.497481813094572e-05, "loss": 0.6317, "step": 736 }, { "epoch": 1.1116981132075472, "grad_norm": 0.655866457602048, "learning_rate": 3.4946838276440965e-05, "loss": 0.6313, "step": 737 }, { "epoch": 1.1132075471698113, "grad_norm": 0.4531623276746969, "learning_rate": 3.491885842193621e-05, "loss": 0.5892, "step": 738 }, { "epoch": 1.1147169811320754, "grad_norm": 0.5709351123804889, "learning_rate": 3.489087856743145e-05, "loss": 0.5634, "step": 739 }, { "epoch": 1.1162264150943395, "grad_norm": 0.48674031702120063, "learning_rate": 3.4862898712926696e-05, "loss": 0.6801, "step": 740 }, { "epoch": 1.1177358490566038, "grad_norm": 0.4048608285674579, "learning_rate": 3.483491885842194e-05, "loss": 0.5654, "step": 741 }, { "epoch": 1.119245283018868, "grad_norm": 0.4626449984604195, "learning_rate": 3.480693900391718e-05, "loss": 0.6544, "step": 742 }, { "epoch": 1.120754716981132, "grad_norm": 0.5183643140468202, "learning_rate": 3.4778959149412426e-05, "loss": 0.6651, "step": 743 }, { "epoch": 1.1222641509433962, "grad_norm": 0.49226691831400854, "learning_rate": 3.475097929490767e-05, "loss": 0.6862, "step": 744 }, { "epoch": 1.1237735849056603, "grad_norm": 4.415706986920795, "learning_rate": 3.472299944040291e-05, "loss": 0.6689, "step": 745 }, { "epoch": 1.1252830188679246, "grad_norm": 0.7573980083060947, "learning_rate": 3.4695019585898156e-05, "loss": 0.6373, "step": 746 }, { "epoch": 1.1267924528301887, "grad_norm": 0.5906345275939948, "learning_rate": 3.46670397313934e-05, "loss": 0.6309, "step": 747 }, { "epoch": 1.1283018867924528, "grad_norm": 0.3998318687851271, "learning_rate": 3.463905987688864e-05, "loss": 0.6802, "step": 748 }, { "epoch": 1.129811320754717, "grad_norm": 0.5467978795675057, "learning_rate": 3.4611080022383886e-05, "loss": 0.562, "step": 749 }, { "epoch": 1.131320754716981, "grad_norm": 0.5191137071980696, "learning_rate": 3.458310016787913e-05, "loss": 0.6156, "step": 750 }, { "epoch": 1.1328301886792453, "grad_norm": 0.44633904601748514, "learning_rate": 3.455512031337437e-05, "loss": 0.5838, "step": 751 }, { "epoch": 1.1343396226415094, "grad_norm": 0.5065992896082492, "learning_rate": 3.4527140458869616e-05, "loss": 0.6843, "step": 752 }, { "epoch": 1.1358490566037736, "grad_norm": 0.4264224918247215, "learning_rate": 3.449916060436486e-05, "loss": 0.6091, "step": 753 }, { "epoch": 1.1373584905660377, "grad_norm": 0.3992177248827404, "learning_rate": 3.44711807498601e-05, "loss": 0.6, "step": 754 }, { "epoch": 1.1388679245283018, "grad_norm": 0.48903828473507216, "learning_rate": 3.4443200895355346e-05, "loss": 0.6459, "step": 755 }, { "epoch": 1.140377358490566, "grad_norm": 0.4105745437343364, "learning_rate": 3.441522104085059e-05, "loss": 0.6526, "step": 756 }, { "epoch": 1.1418867924528302, "grad_norm": 0.49384071168597404, "learning_rate": 3.438724118634583e-05, "loss": 0.685, "step": 757 }, { "epoch": 1.1433962264150943, "grad_norm": 0.5413740841399795, "learning_rate": 3.4359261331841076e-05, "loss": 0.6377, "step": 758 }, { "epoch": 1.1449056603773584, "grad_norm": 0.480269969475746, "learning_rate": 3.433128147733632e-05, "loss": 0.6463, "step": 759 }, { "epoch": 1.1464150943396225, "grad_norm": 0.5646301785912243, "learning_rate": 3.430330162283156e-05, "loss": 0.6265, "step": 760 }, { "epoch": 1.1479245283018868, "grad_norm": 0.8673433809554542, "learning_rate": 3.4275321768326806e-05, "loss": 0.642, "step": 761 }, { "epoch": 1.149433962264151, "grad_norm": 0.4112877387329245, "learning_rate": 3.424734191382205e-05, "loss": 0.6082, "step": 762 }, { "epoch": 1.150943396226415, "grad_norm": 0.6320032562966805, "learning_rate": 3.421936205931729e-05, "loss": 0.6219, "step": 763 }, { "epoch": 1.1524528301886792, "grad_norm": 0.46715710458859117, "learning_rate": 3.4191382204812536e-05, "loss": 0.6477, "step": 764 }, { "epoch": 1.1539622641509433, "grad_norm": 0.5192541163104857, "learning_rate": 3.416340235030778e-05, "loss": 0.6686, "step": 765 }, { "epoch": 1.1554716981132076, "grad_norm": 0.5115037747798377, "learning_rate": 3.413542249580302e-05, "loss": 0.5957, "step": 766 }, { "epoch": 1.1569811320754717, "grad_norm": 0.4631548895084437, "learning_rate": 3.4107442641298266e-05, "loss": 0.6691, "step": 767 }, { "epoch": 1.1584905660377358, "grad_norm": 0.4315295867507691, "learning_rate": 3.407946278679351e-05, "loss": 0.5968, "step": 768 }, { "epoch": 1.16, "grad_norm": 0.5489635360542999, "learning_rate": 3.405148293228875e-05, "loss": 0.6286, "step": 769 }, { "epoch": 1.161509433962264, "grad_norm": 0.5260776713612061, "learning_rate": 3.4023503077783996e-05, "loss": 0.6876, "step": 770 }, { "epoch": 1.1630188679245284, "grad_norm": 0.41102121571815237, "learning_rate": 3.399552322327924e-05, "loss": 0.6875, "step": 771 }, { "epoch": 1.1645283018867925, "grad_norm": 0.5051554044354539, "learning_rate": 3.396754336877448e-05, "loss": 0.6126, "step": 772 }, { "epoch": 1.1660377358490566, "grad_norm": 0.4562465913464345, "learning_rate": 3.3939563514269726e-05, "loss": 0.6137, "step": 773 }, { "epoch": 1.1675471698113207, "grad_norm": 0.4743549865343238, "learning_rate": 3.391158365976497e-05, "loss": 0.6315, "step": 774 }, { "epoch": 1.169056603773585, "grad_norm": 0.5247109156016383, "learning_rate": 3.388360380526022e-05, "loss": 0.6428, "step": 775 }, { "epoch": 1.170566037735849, "grad_norm": 3.559423434494465, "learning_rate": 3.3855623950755457e-05, "loss": 0.6067, "step": 776 }, { "epoch": 1.1720754716981132, "grad_norm": 0.5989095184362712, "learning_rate": 3.38276440962507e-05, "loss": 0.6457, "step": 777 }, { "epoch": 1.1735849056603773, "grad_norm": 0.5188354410044358, "learning_rate": 3.379966424174594e-05, "loss": 0.5998, "step": 778 }, { "epoch": 1.1750943396226414, "grad_norm": 0.4214267781907578, "learning_rate": 3.377168438724119e-05, "loss": 0.6579, "step": 779 }, { "epoch": 1.1766037735849058, "grad_norm": 0.45186691120990086, "learning_rate": 3.374370453273643e-05, "loss": 0.6321, "step": 780 }, { "epoch": 1.1781132075471699, "grad_norm": 0.5363268074933428, "learning_rate": 3.371572467823167e-05, "loss": 0.6145, "step": 781 }, { "epoch": 1.179622641509434, "grad_norm": 0.49420994902734927, "learning_rate": 3.368774482372692e-05, "loss": 0.6015, "step": 782 }, { "epoch": 1.181132075471698, "grad_norm": 0.3799463287008628, "learning_rate": 3.365976496922216e-05, "loss": 0.7114, "step": 783 }, { "epoch": 1.1826415094339622, "grad_norm": 0.4873082496659108, "learning_rate": 3.3631785114717403e-05, "loss": 0.6194, "step": 784 }, { "epoch": 1.1841509433962265, "grad_norm": 0.4207411818460765, "learning_rate": 3.3603805260212654e-05, "loss": 0.6103, "step": 785 }, { "epoch": 1.1856603773584906, "grad_norm": 0.5223740890535683, "learning_rate": 3.35758254057079e-05, "loss": 0.6299, "step": 786 }, { "epoch": 1.1871698113207547, "grad_norm": 0.3569381738413025, "learning_rate": 3.3547845551203134e-05, "loss": 0.6105, "step": 787 }, { "epoch": 1.1886792452830188, "grad_norm": 4.0178075776210775, "learning_rate": 3.351986569669838e-05, "loss": 0.6551, "step": 788 }, { "epoch": 1.190188679245283, "grad_norm": 0.9315621690551965, "learning_rate": 3.349188584219362e-05, "loss": 0.6726, "step": 789 }, { "epoch": 1.1916981132075473, "grad_norm": 0.4663325979493853, "learning_rate": 3.3463905987688864e-05, "loss": 0.6479, "step": 790 }, { "epoch": 1.1932075471698114, "grad_norm": 0.8155128870027623, "learning_rate": 3.343592613318411e-05, "loss": 0.6321, "step": 791 }, { "epoch": 1.1947169811320755, "grad_norm": 0.5354713553205344, "learning_rate": 3.340794627867935e-05, "loss": 0.6267, "step": 792 }, { "epoch": 1.1962264150943396, "grad_norm": 0.8995514690519233, "learning_rate": 3.3379966424174594e-05, "loss": 0.6668, "step": 793 }, { "epoch": 1.1977358490566037, "grad_norm": 0.42325453026549564, "learning_rate": 3.3351986569669844e-05, "loss": 0.6892, "step": 794 }, { "epoch": 1.199245283018868, "grad_norm": 0.9019398748449015, "learning_rate": 3.332400671516509e-05, "loss": 0.6142, "step": 795 }, { "epoch": 1.2007547169811321, "grad_norm": 0.42650778637925774, "learning_rate": 3.329602686066033e-05, "loss": 0.9311, "step": 796 }, { "epoch": 1.2022641509433962, "grad_norm": 5.832651725718385, "learning_rate": 3.326804700615557e-05, "loss": 0.5994, "step": 797 }, { "epoch": 1.2037735849056603, "grad_norm": 1.4055840202932217, "learning_rate": 3.324006715165081e-05, "loss": 0.6326, "step": 798 }, { "epoch": 1.2052830188679244, "grad_norm": 0.5916688857583293, "learning_rate": 3.3212087297146054e-05, "loss": 0.6772, "step": 799 }, { "epoch": 1.2067924528301888, "grad_norm": 0.9928690203020595, "learning_rate": 3.31841074426413e-05, "loss": 0.595, "step": 800 }, { "epoch": 1.2083018867924529, "grad_norm": 0.9349255830289889, "learning_rate": 3.315612758813654e-05, "loss": 0.7102, "step": 801 }, { "epoch": 1.209811320754717, "grad_norm": 0.7686689517909183, "learning_rate": 3.3128147733631784e-05, "loss": 0.6264, "step": 802 }, { "epoch": 1.211320754716981, "grad_norm": 0.9656464748523269, "learning_rate": 3.3100167879127034e-05, "loss": 0.6611, "step": 803 }, { "epoch": 1.2128301886792452, "grad_norm": 0.5809016244321841, "learning_rate": 3.307218802462228e-05, "loss": 0.6674, "step": 804 }, { "epoch": 1.2143396226415095, "grad_norm": 0.8488641757151281, "learning_rate": 3.304420817011752e-05, "loss": 0.6724, "step": 805 }, { "epoch": 1.2158490566037736, "grad_norm": 0.5115352162894421, "learning_rate": 3.3016228315612764e-05, "loss": 0.6244, "step": 806 }, { "epoch": 1.2173584905660377, "grad_norm": 0.6746402205487975, "learning_rate": 3.2988248461108e-05, "loss": 0.6216, "step": 807 }, { "epoch": 1.2188679245283018, "grad_norm": 0.5947361094685111, "learning_rate": 3.2960268606603244e-05, "loss": 0.6407, "step": 808 }, { "epoch": 1.220377358490566, "grad_norm": 0.6118537661273408, "learning_rate": 3.293228875209849e-05, "loss": 0.6642, "step": 809 }, { "epoch": 1.2218867924528303, "grad_norm": 0.5853643244928053, "learning_rate": 3.290430889759373e-05, "loss": 0.6274, "step": 810 }, { "epoch": 1.2233962264150944, "grad_norm": 0.4408870524517529, "learning_rate": 3.2876329043088974e-05, "loss": 0.6762, "step": 811 }, { "epoch": 1.2249056603773585, "grad_norm": 0.5786375937277597, "learning_rate": 3.284834918858422e-05, "loss": 0.618, "step": 812 }, { "epoch": 1.2264150943396226, "grad_norm": 0.37448594144435, "learning_rate": 3.282036933407947e-05, "loss": 0.6316, "step": 813 }, { "epoch": 1.2279245283018867, "grad_norm": 0.4860905984518343, "learning_rate": 3.279238947957471e-05, "loss": 0.6299, "step": 814 }, { "epoch": 1.229433962264151, "grad_norm": 0.5849544770920747, "learning_rate": 3.2764409625069954e-05, "loss": 0.5729, "step": 815 }, { "epoch": 1.2309433962264151, "grad_norm": 0.4008195084296223, "learning_rate": 3.27364297705652e-05, "loss": 0.6682, "step": 816 }, { "epoch": 1.2324528301886792, "grad_norm": 0.5272118883156421, "learning_rate": 3.270844991606044e-05, "loss": 0.6049, "step": 817 }, { "epoch": 1.2339622641509433, "grad_norm": 0.550682484167377, "learning_rate": 3.268047006155568e-05, "loss": 0.5918, "step": 818 }, { "epoch": 1.2354716981132075, "grad_norm": 0.45899566874631015, "learning_rate": 3.265249020705092e-05, "loss": 0.6293, "step": 819 }, { "epoch": 1.2369811320754718, "grad_norm": 0.41053358276258073, "learning_rate": 3.2624510352546165e-05, "loss": 0.5974, "step": 820 }, { "epoch": 1.2384905660377359, "grad_norm": 0.5673047046076692, "learning_rate": 3.259653049804141e-05, "loss": 0.6689, "step": 821 }, { "epoch": 1.24, "grad_norm": 0.42001733257774354, "learning_rate": 3.256855064353666e-05, "loss": 0.5781, "step": 822 }, { "epoch": 1.241509433962264, "grad_norm": 0.5274791837888488, "learning_rate": 3.25405707890319e-05, "loss": 0.6556, "step": 823 }, { "epoch": 1.2430188679245284, "grad_norm": 0.40140251232729945, "learning_rate": 3.2512590934527145e-05, "loss": 0.5618, "step": 824 }, { "epoch": 1.2445283018867925, "grad_norm": 0.38988606187868724, "learning_rate": 3.248461108002239e-05, "loss": 0.6696, "step": 825 }, { "epoch": 1.2460377358490566, "grad_norm": 0.6304384074605135, "learning_rate": 3.245663122551763e-05, "loss": 0.6769, "step": 826 }, { "epoch": 1.2475471698113207, "grad_norm": 0.3979395391659101, "learning_rate": 3.2428651371012875e-05, "loss": 0.6174, "step": 827 }, { "epoch": 1.2490566037735849, "grad_norm": 0.5329831109280307, "learning_rate": 3.240067151650811e-05, "loss": 0.6181, "step": 828 }, { "epoch": 1.2505660377358492, "grad_norm": 0.4319961747815377, "learning_rate": 3.2372691662003355e-05, "loss": 0.6993, "step": 829 }, { "epoch": 1.2520754716981133, "grad_norm": 0.4920198115969687, "learning_rate": 3.23447118074986e-05, "loss": 0.6775, "step": 830 }, { "epoch": 1.2535849056603774, "grad_norm": 0.5327815925026391, "learning_rate": 3.231673195299385e-05, "loss": 0.6518, "step": 831 }, { "epoch": 1.2550943396226415, "grad_norm": 0.4389886233997492, "learning_rate": 3.228875209848909e-05, "loss": 0.6365, "step": 832 }, { "epoch": 1.2566037735849056, "grad_norm": 0.5047415384999543, "learning_rate": 3.2260772243984335e-05, "loss": 0.7062, "step": 833 }, { "epoch": 1.25811320754717, "grad_norm": 0.4396225841431117, "learning_rate": 3.223279238947958e-05, "loss": 0.61, "step": 834 }, { "epoch": 1.259622641509434, "grad_norm": 0.4564667026923815, "learning_rate": 3.220481253497482e-05, "loss": 0.6335, "step": 835 }, { "epoch": 1.2611320754716981, "grad_norm": 0.39053767513919807, "learning_rate": 3.2176832680470065e-05, "loss": 0.6055, "step": 836 }, { "epoch": 1.2626415094339623, "grad_norm": 0.3783353602690344, "learning_rate": 3.214885282596531e-05, "loss": 0.7125, "step": 837 }, { "epoch": 1.2641509433962264, "grad_norm": 0.3553163046641145, "learning_rate": 3.2120872971460545e-05, "loss": 0.6143, "step": 838 }, { "epoch": 1.2656603773584907, "grad_norm": 0.35917670960749537, "learning_rate": 3.209289311695579e-05, "loss": 0.5769, "step": 839 }, { "epoch": 1.2671698113207548, "grad_norm": 0.36467830677027235, "learning_rate": 3.206491326245103e-05, "loss": 0.5699, "step": 840 }, { "epoch": 1.268679245283019, "grad_norm": 0.3773609262039195, "learning_rate": 3.203693340794628e-05, "loss": 0.6271, "step": 841 }, { "epoch": 1.270188679245283, "grad_norm": 0.3795609120240298, "learning_rate": 3.2008953553441525e-05, "loss": 0.6725, "step": 842 }, { "epoch": 1.271698113207547, "grad_norm": 0.37050676683076367, "learning_rate": 3.198097369893677e-05, "loss": 0.5933, "step": 843 }, { "epoch": 1.2732075471698114, "grad_norm": 0.45887192265355964, "learning_rate": 3.195299384443201e-05, "loss": 0.6373, "step": 844 }, { "epoch": 1.2747169811320755, "grad_norm": 1.4950560043816827, "learning_rate": 3.1925013989927255e-05, "loss": 0.9561, "step": 845 }, { "epoch": 1.2762264150943397, "grad_norm": 0.39850378462049746, "learning_rate": 3.18970341354225e-05, "loss": 0.6525, "step": 846 }, { "epoch": 1.2777358490566038, "grad_norm": 0.4135301305547025, "learning_rate": 3.186905428091774e-05, "loss": 0.6623, "step": 847 }, { "epoch": 1.2792452830188679, "grad_norm": 0.42557572265505506, "learning_rate": 3.1841074426412985e-05, "loss": 0.6964, "step": 848 }, { "epoch": 1.2807547169811322, "grad_norm": 0.39760056122405496, "learning_rate": 3.181309457190822e-05, "loss": 0.6106, "step": 849 }, { "epoch": 1.2822641509433963, "grad_norm": 0.40798606836706397, "learning_rate": 3.178511471740347e-05, "loss": 0.5632, "step": 850 }, { "epoch": 1.2837735849056604, "grad_norm": 0.3462421636576025, "learning_rate": 3.1757134862898716e-05, "loss": 0.5894, "step": 851 }, { "epoch": 1.2852830188679245, "grad_norm": 0.3722881735233624, "learning_rate": 3.172915500839396e-05, "loss": 0.6412, "step": 852 }, { "epoch": 1.2867924528301886, "grad_norm": 0.37960556416454605, "learning_rate": 3.17011751538892e-05, "loss": 0.6001, "step": 853 }, { "epoch": 1.288301886792453, "grad_norm": 0.4276713553028228, "learning_rate": 3.1673195299384446e-05, "loss": 0.6603, "step": 854 }, { "epoch": 1.289811320754717, "grad_norm": 0.7095036362422655, "learning_rate": 3.164521544487969e-05, "loss": 0.6137, "step": 855 }, { "epoch": 1.2913207547169812, "grad_norm": 0.4807408403609567, "learning_rate": 3.161723559037493e-05, "loss": 0.6588, "step": 856 }, { "epoch": 1.2928301886792453, "grad_norm": 0.4358649697165436, "learning_rate": 3.1589255735870176e-05, "loss": 0.5926, "step": 857 }, { "epoch": 1.2943396226415094, "grad_norm": 0.41013182956481836, "learning_rate": 3.156127588136542e-05, "loss": 0.6665, "step": 858 }, { "epoch": 1.2958490566037737, "grad_norm": 0.41515671451727343, "learning_rate": 3.153329602686066e-05, "loss": 0.7209, "step": 859 }, { "epoch": 1.2973584905660378, "grad_norm": 0.3942619832225865, "learning_rate": 3.1505316172355906e-05, "loss": 0.6667, "step": 860 }, { "epoch": 1.298867924528302, "grad_norm": 0.4021542796788365, "learning_rate": 3.147733631785115e-05, "loss": 0.6465, "step": 861 }, { "epoch": 1.300377358490566, "grad_norm": 0.37633875180949317, "learning_rate": 3.144935646334639e-05, "loss": 0.6343, "step": 862 }, { "epoch": 1.3018867924528301, "grad_norm": 0.7184734573887464, "learning_rate": 3.1421376608841636e-05, "loss": 0.6515, "step": 863 }, { "epoch": 1.3033962264150944, "grad_norm": 0.36958219984109786, "learning_rate": 3.139339675433688e-05, "loss": 0.6474, "step": 864 }, { "epoch": 1.3049056603773586, "grad_norm": 0.41188446823897223, "learning_rate": 3.136541689983212e-05, "loss": 0.6052, "step": 865 }, { "epoch": 1.3064150943396227, "grad_norm": 0.42281806811295863, "learning_rate": 3.1337437045327366e-05, "loss": 0.6652, "step": 866 }, { "epoch": 1.3079245283018868, "grad_norm": 0.33175743854835926, "learning_rate": 3.130945719082261e-05, "loss": 0.5977, "step": 867 }, { "epoch": 1.3094339622641509, "grad_norm": 0.37734563826001954, "learning_rate": 3.128147733631785e-05, "loss": 0.5966, "step": 868 }, { "epoch": 1.3109433962264152, "grad_norm": 0.4182778269406502, "learning_rate": 3.1253497481813096e-05, "loss": 0.6583, "step": 869 }, { "epoch": 1.3124528301886793, "grad_norm": 0.36140009515229593, "learning_rate": 3.122551762730834e-05, "loss": 0.6068, "step": 870 }, { "epoch": 1.3139622641509434, "grad_norm": 0.37977481362373233, "learning_rate": 3.119753777280358e-05, "loss": 0.654, "step": 871 }, { "epoch": 1.3154716981132075, "grad_norm": 0.3416259213239559, "learning_rate": 3.1169557918298826e-05, "loss": 0.6387, "step": 872 }, { "epoch": 1.3169811320754716, "grad_norm": 0.3564805566672587, "learning_rate": 3.114157806379407e-05, "loss": 0.6369, "step": 873 }, { "epoch": 1.318490566037736, "grad_norm": 0.3803523160244477, "learning_rate": 3.111359820928931e-05, "loss": 0.7093, "step": 874 }, { "epoch": 1.32, "grad_norm": 0.39166045934086346, "learning_rate": 3.1085618354784556e-05, "loss": 0.6496, "step": 875 }, { "epoch": 1.3215094339622642, "grad_norm": 0.4353767626286411, "learning_rate": 3.10576385002798e-05, "loss": 0.6266, "step": 876 }, { "epoch": 1.3230188679245283, "grad_norm": 0.3593032604836283, "learning_rate": 3.102965864577504e-05, "loss": 0.6782, "step": 877 }, { "epoch": 1.3245283018867924, "grad_norm": 0.4168829319681385, "learning_rate": 3.1001678791270286e-05, "loss": 0.6282, "step": 878 }, { "epoch": 1.3260377358490567, "grad_norm": 0.37757583489653257, "learning_rate": 3.097369893676553e-05, "loss": 0.6298, "step": 879 }, { "epoch": 1.3275471698113208, "grad_norm": 0.3501040816376274, "learning_rate": 3.094571908226077e-05, "loss": 0.6461, "step": 880 }, { "epoch": 1.329056603773585, "grad_norm": 0.4721309371418326, "learning_rate": 3.0917739227756016e-05, "loss": 0.6283, "step": 881 }, { "epoch": 1.330566037735849, "grad_norm": 0.40370139757838286, "learning_rate": 3.088975937325126e-05, "loss": 0.6533, "step": 882 }, { "epoch": 1.3320754716981131, "grad_norm": 0.4191434348168408, "learning_rate": 3.08617795187465e-05, "loss": 0.5634, "step": 883 }, { "epoch": 1.3335849056603775, "grad_norm": 0.47048915913727934, "learning_rate": 3.0833799664241747e-05, "loss": 0.6498, "step": 884 }, { "epoch": 1.3350943396226416, "grad_norm": 0.5386238703131466, "learning_rate": 3.080581980973699e-05, "loss": 0.6598, "step": 885 }, { "epoch": 1.3366037735849057, "grad_norm": 0.4046153720620124, "learning_rate": 3.077783995523223e-05, "loss": 0.6303, "step": 886 }, { "epoch": 1.3381132075471698, "grad_norm": 0.5092275981590267, "learning_rate": 3.0749860100727477e-05, "loss": 0.5994, "step": 887 }, { "epoch": 1.3396226415094339, "grad_norm": 0.4201270256233553, "learning_rate": 3.072188024622272e-05, "loss": 0.6035, "step": 888 }, { "epoch": 1.3411320754716982, "grad_norm": 0.44363066935554674, "learning_rate": 3.069390039171797e-05, "loss": 0.6149, "step": 889 }, { "epoch": 1.3426415094339623, "grad_norm": 0.45931419806615414, "learning_rate": 3.066592053721321e-05, "loss": 0.6853, "step": 890 }, { "epoch": 1.3441509433962264, "grad_norm": 0.3981069141574624, "learning_rate": 3.063794068270845e-05, "loss": 0.6445, "step": 891 }, { "epoch": 1.3456603773584905, "grad_norm": 0.4033897537692928, "learning_rate": 3.0609960828203693e-05, "loss": 0.6657, "step": 892 }, { "epoch": 1.3471698113207546, "grad_norm": 0.3858298350639289, "learning_rate": 3.058198097369894e-05, "loss": 0.6255, "step": 893 }, { "epoch": 1.348679245283019, "grad_norm": 0.47123781695477013, "learning_rate": 3.055400111919418e-05, "loss": 0.6509, "step": 894 }, { "epoch": 1.350188679245283, "grad_norm": 0.4029191937376374, "learning_rate": 3.0526021264689424e-05, "loss": 0.6651, "step": 895 }, { "epoch": 1.3516981132075472, "grad_norm": 0.388896779384108, "learning_rate": 3.049804141018467e-05, "loss": 0.6395, "step": 896 }, { "epoch": 1.3532075471698113, "grad_norm": 0.3768031336195982, "learning_rate": 3.0470061555679914e-05, "loss": 0.655, "step": 897 }, { "epoch": 1.3547169811320754, "grad_norm": 0.3984983631125007, "learning_rate": 3.0442081701175157e-05, "loss": 0.6149, "step": 898 }, { "epoch": 1.3562264150943397, "grad_norm": 0.4066202145440965, "learning_rate": 3.04141018466704e-05, "loss": 0.6486, "step": 899 }, { "epoch": 1.3577358490566038, "grad_norm": 0.44059085553416244, "learning_rate": 3.0386121992165644e-05, "loss": 0.676, "step": 900 }, { "epoch": 1.359245283018868, "grad_norm": 0.41928112497983105, "learning_rate": 3.0358142137660884e-05, "loss": 0.6166, "step": 901 }, { "epoch": 1.360754716981132, "grad_norm": 0.4484411056202494, "learning_rate": 3.0330162283156127e-05, "loss": 0.5916, "step": 902 }, { "epoch": 1.3622641509433961, "grad_norm": 0.39462759464635006, "learning_rate": 3.030218242865137e-05, "loss": 0.6252, "step": 903 }, { "epoch": 1.3637735849056605, "grad_norm": 0.3064423590338561, "learning_rate": 3.0274202574146614e-05, "loss": 0.7015, "step": 904 }, { "epoch": 1.3652830188679246, "grad_norm": 0.4187213380492406, "learning_rate": 3.024622271964186e-05, "loss": 0.6168, "step": 905 }, { "epoch": 1.3667924528301887, "grad_norm": 0.40261471242008323, "learning_rate": 3.0218242865137104e-05, "loss": 0.7055, "step": 906 }, { "epoch": 1.3683018867924528, "grad_norm": 0.3481493833030105, "learning_rate": 3.0190263010632347e-05, "loss": 0.6208, "step": 907 }, { "epoch": 1.369811320754717, "grad_norm": 0.35049655031964855, "learning_rate": 3.016228315612759e-05, "loss": 0.5781, "step": 908 }, { "epoch": 1.3713207547169812, "grad_norm": 0.42278059968973397, "learning_rate": 3.0134303301622834e-05, "loss": 0.5904, "step": 909 }, { "epoch": 1.3728301886792453, "grad_norm": 0.3445994928179493, "learning_rate": 3.0106323447118077e-05, "loss": 0.6288, "step": 910 }, { "epoch": 1.3743396226415094, "grad_norm": 0.38967053457550976, "learning_rate": 3.0078343592613317e-05, "loss": 0.6392, "step": 911 }, { "epoch": 1.3758490566037735, "grad_norm": 0.39707064143261184, "learning_rate": 3.005036373810856e-05, "loss": 0.6279, "step": 912 }, { "epoch": 1.3773584905660377, "grad_norm": 0.35444427397617145, "learning_rate": 3.0022383883603804e-05, "loss": 0.5976, "step": 913 }, { "epoch": 1.378867924528302, "grad_norm": 0.34690507904913026, "learning_rate": 2.999440402909905e-05, "loss": 0.634, "step": 914 }, { "epoch": 1.380377358490566, "grad_norm": 0.4626788655800128, "learning_rate": 2.9966424174594294e-05, "loss": 0.6583, "step": 915 }, { "epoch": 1.3818867924528302, "grad_norm": 0.3671421219021182, "learning_rate": 2.9938444320089538e-05, "loss": 0.6514, "step": 916 }, { "epoch": 1.3833962264150943, "grad_norm": 0.37784633929271194, "learning_rate": 2.991046446558478e-05, "loss": 0.601, "step": 917 }, { "epoch": 1.3849056603773584, "grad_norm": 0.4089029572046124, "learning_rate": 2.9882484611080024e-05, "loss": 0.6588, "step": 918 }, { "epoch": 1.3864150943396227, "grad_norm": 0.3455888490666691, "learning_rate": 2.9854504756575268e-05, "loss": 0.6022, "step": 919 }, { "epoch": 1.3879245283018868, "grad_norm": 0.3654741927522717, "learning_rate": 2.9826524902070514e-05, "loss": 0.6638, "step": 920 }, { "epoch": 1.389433962264151, "grad_norm": 0.3502350361285747, "learning_rate": 2.9798545047565758e-05, "loss": 0.6645, "step": 921 }, { "epoch": 1.390943396226415, "grad_norm": 0.37331731488300535, "learning_rate": 2.9770565193060994e-05, "loss": 0.6074, "step": 922 }, { "epoch": 1.3924528301886792, "grad_norm": 0.3185944385198641, "learning_rate": 2.9742585338556238e-05, "loss": 0.5764, "step": 923 }, { "epoch": 1.3939622641509435, "grad_norm": 0.37558027024168333, "learning_rate": 2.9714605484051484e-05, "loss": 0.6027, "step": 924 }, { "epoch": 1.3954716981132076, "grad_norm": 0.3494928337929872, "learning_rate": 2.9686625629546728e-05, "loss": 0.6464, "step": 925 }, { "epoch": 1.3969811320754717, "grad_norm": 0.41717609426235447, "learning_rate": 2.965864577504197e-05, "loss": 0.6136, "step": 926 }, { "epoch": 1.3984905660377358, "grad_norm": 0.3690138657188448, "learning_rate": 2.9630665920537215e-05, "loss": 0.6544, "step": 927 }, { "epoch": 1.4, "grad_norm": 0.36363869250987335, "learning_rate": 2.9602686066032458e-05, "loss": 0.6072, "step": 928 }, { "epoch": 1.4015094339622642, "grad_norm": 0.327574112494754, "learning_rate": 2.9574706211527705e-05, "loss": 0.6352, "step": 929 }, { "epoch": 1.4030188679245283, "grad_norm": 0.35727876020347676, "learning_rate": 2.9546726357022948e-05, "loss": 0.5951, "step": 930 }, { "epoch": 1.4045283018867925, "grad_norm": 0.31716305744497264, "learning_rate": 2.951874650251819e-05, "loss": 0.6039, "step": 931 }, { "epoch": 1.4060377358490566, "grad_norm": 0.33093980503518344, "learning_rate": 2.9490766648013428e-05, "loss": 0.6163, "step": 932 }, { "epoch": 1.4075471698113207, "grad_norm": 0.3861900118247219, "learning_rate": 2.9462786793508675e-05, "loss": 0.6153, "step": 933 }, { "epoch": 1.409056603773585, "grad_norm": 0.3277916345887553, "learning_rate": 2.9434806939003918e-05, "loss": 0.6284, "step": 934 }, { "epoch": 1.410566037735849, "grad_norm": 0.3780457505042723, "learning_rate": 2.940682708449916e-05, "loss": 0.6452, "step": 935 }, { "epoch": 1.4120754716981132, "grad_norm": 0.3465233045348986, "learning_rate": 2.9378847229994405e-05, "loss": 0.7393, "step": 936 }, { "epoch": 1.4135849056603773, "grad_norm": 2.5533756746713117, "learning_rate": 2.9350867375489648e-05, "loss": 0.5907, "step": 937 }, { "epoch": 1.4150943396226414, "grad_norm": 0.4089686961924739, "learning_rate": 2.932288752098489e-05, "loss": 0.6067, "step": 938 }, { "epoch": 1.4166037735849057, "grad_norm": 0.35586033724923993, "learning_rate": 2.9294907666480138e-05, "loss": 0.5488, "step": 939 }, { "epoch": 1.4181132075471699, "grad_norm": 0.4380246394519367, "learning_rate": 2.926692781197538e-05, "loss": 0.6399, "step": 940 }, { "epoch": 1.419622641509434, "grad_norm": 0.4249638666583559, "learning_rate": 2.9238947957470625e-05, "loss": 0.6233, "step": 941 }, { "epoch": 1.421132075471698, "grad_norm": 0.4153579482995771, "learning_rate": 2.9210968102965865e-05, "loss": 0.6602, "step": 942 }, { "epoch": 1.4226415094339622, "grad_norm": 0.4754922849198537, "learning_rate": 2.918298824846111e-05, "loss": 0.7016, "step": 943 }, { "epoch": 1.4241509433962265, "grad_norm": 0.3236274287786117, "learning_rate": 2.915500839395635e-05, "loss": 0.5863, "step": 944 }, { "epoch": 1.4256603773584906, "grad_norm": 0.41992181352774643, "learning_rate": 2.9127028539451595e-05, "loss": 0.6258, "step": 945 }, { "epoch": 1.4271698113207547, "grad_norm": 0.8115843394380751, "learning_rate": 2.909904868494684e-05, "loss": 0.6465, "step": 946 }, { "epoch": 1.4286792452830188, "grad_norm": 0.39731341593956065, "learning_rate": 2.9071068830442082e-05, "loss": 0.6443, "step": 947 }, { "epoch": 1.430188679245283, "grad_norm": 0.36644813089178, "learning_rate": 2.904308897593733e-05, "loss": 0.5728, "step": 948 }, { "epoch": 1.4316981132075473, "grad_norm": 0.438411822701408, "learning_rate": 2.9015109121432572e-05, "loss": 0.5963, "step": 949 }, { "epoch": 1.4332075471698114, "grad_norm": 0.38295490840225677, "learning_rate": 2.8987129266927815e-05, "loss": 0.6152, "step": 950 }, { "epoch": 1.4347169811320755, "grad_norm": 0.39618114403032584, "learning_rate": 2.895914941242306e-05, "loss": 0.6637, "step": 951 }, { "epoch": 1.4362264150943396, "grad_norm": 0.3815428588638221, "learning_rate": 2.8931169557918302e-05, "loss": 0.5864, "step": 952 }, { "epoch": 1.4377358490566037, "grad_norm": 0.3663074285581354, "learning_rate": 2.8903189703413542e-05, "loss": 0.6733, "step": 953 }, { "epoch": 1.439245283018868, "grad_norm": 0.3668247689466776, "learning_rate": 2.8875209848908785e-05, "loss": 0.6375, "step": 954 }, { "epoch": 1.440754716981132, "grad_norm": 0.3535138691082089, "learning_rate": 2.884722999440403e-05, "loss": 0.5939, "step": 955 }, { "epoch": 1.4422641509433962, "grad_norm": 0.3634202864100825, "learning_rate": 2.8819250139899272e-05, "loss": 0.6292, "step": 956 }, { "epoch": 1.4437735849056603, "grad_norm": 0.3572638644281544, "learning_rate": 2.879127028539452e-05, "loss": 0.6664, "step": 957 }, { "epoch": 1.4452830188679244, "grad_norm": 0.3508425660348941, "learning_rate": 2.8763290430889762e-05, "loss": 0.6468, "step": 958 }, { "epoch": 1.4467924528301888, "grad_norm": 0.3707956017303932, "learning_rate": 2.8735310576385005e-05, "loss": 0.6495, "step": 959 }, { "epoch": 1.4483018867924529, "grad_norm": 0.3278213890755489, "learning_rate": 2.870733072188025e-05, "loss": 0.6217, "step": 960 }, { "epoch": 1.449811320754717, "grad_norm": 0.40970965375073787, "learning_rate": 2.8679350867375492e-05, "loss": 0.6451, "step": 961 }, { "epoch": 1.451320754716981, "grad_norm": 0.3688583638649611, "learning_rate": 2.8651371012870736e-05, "loss": 0.6251, "step": 962 }, { "epoch": 1.4528301886792452, "grad_norm": 0.34179313589597693, "learning_rate": 2.8623391158365976e-05, "loss": 0.6409, "step": 963 }, { "epoch": 1.4543396226415095, "grad_norm": 0.3448261436866798, "learning_rate": 2.859541130386122e-05, "loss": 0.5905, "step": 964 }, { "epoch": 1.4558490566037736, "grad_norm": 0.33694291297212864, "learning_rate": 2.8567431449356462e-05, "loss": 0.5761, "step": 965 }, { "epoch": 1.4573584905660377, "grad_norm": 0.3118187273291105, "learning_rate": 2.8539451594851706e-05, "loss": 0.6088, "step": 966 }, { "epoch": 1.4588679245283018, "grad_norm": 0.32045778178994183, "learning_rate": 2.8511471740346952e-05, "loss": 0.6205, "step": 967 }, { "epoch": 1.460377358490566, "grad_norm": 0.45338400028752146, "learning_rate": 2.8483491885842196e-05, "loss": 0.5963, "step": 968 }, { "epoch": 1.4618867924528303, "grad_norm": 0.32689068860548115, "learning_rate": 2.845551203133744e-05, "loss": 0.5745, "step": 969 }, { "epoch": 1.4633962264150944, "grad_norm": 0.38005063017031954, "learning_rate": 2.8427532176832682e-05, "loss": 0.6426, "step": 970 }, { "epoch": 1.4649056603773585, "grad_norm": 0.3309282292367092, "learning_rate": 2.8399552322327926e-05, "loss": 0.669, "step": 971 }, { "epoch": 1.4664150943396226, "grad_norm": 1.4917946740992876, "learning_rate": 2.8371572467823173e-05, "loss": 0.6343, "step": 972 }, { "epoch": 1.4679245283018867, "grad_norm": 0.3233023167238395, "learning_rate": 2.8343592613318416e-05, "loss": 0.6077, "step": 973 }, { "epoch": 1.469433962264151, "grad_norm": 0.3262290819657421, "learning_rate": 2.8315612758813653e-05, "loss": 0.6578, "step": 974 }, { "epoch": 1.4709433962264151, "grad_norm": 0.3339350102619771, "learning_rate": 2.8287632904308896e-05, "loss": 0.621, "step": 975 }, { "epoch": 1.4724528301886792, "grad_norm": 0.3192961373970712, "learning_rate": 2.8259653049804143e-05, "loss": 0.6236, "step": 976 }, { "epoch": 1.4739622641509433, "grad_norm": 0.49845825196565774, "learning_rate": 2.8231673195299386e-05, "loss": 0.6254, "step": 977 }, { "epoch": 1.4754716981132074, "grad_norm": 0.3485802760303236, "learning_rate": 2.820369334079463e-05, "loss": 0.6046, "step": 978 }, { "epoch": 1.4769811320754718, "grad_norm": 0.3598346283166558, "learning_rate": 2.8175713486289873e-05, "loss": 0.6621, "step": 979 }, { "epoch": 1.4784905660377359, "grad_norm": 0.7079216014041246, "learning_rate": 2.8147733631785116e-05, "loss": 0.6659, "step": 980 }, { "epoch": 1.48, "grad_norm": 0.3996327762211729, "learning_rate": 2.811975377728036e-05, "loss": 0.5864, "step": 981 }, { "epoch": 1.481509433962264, "grad_norm": 0.4468257832623057, "learning_rate": 2.8091773922775606e-05, "loss": 0.629, "step": 982 }, { "epoch": 1.4830188679245282, "grad_norm": 0.35338065855143863, "learning_rate": 2.806379406827085e-05, "loss": 0.5689, "step": 983 }, { "epoch": 1.4845283018867925, "grad_norm": 0.3447521513306188, "learning_rate": 2.8035814213766086e-05, "loss": 0.5519, "step": 984 }, { "epoch": 1.4860377358490566, "grad_norm": 0.34657588388233884, "learning_rate": 2.8007834359261333e-05, "loss": 0.6423, "step": 985 }, { "epoch": 1.4875471698113207, "grad_norm": 0.34128797853204484, "learning_rate": 2.7979854504756576e-05, "loss": 0.6009, "step": 986 }, { "epoch": 1.4890566037735848, "grad_norm": 0.8214335030714001, "learning_rate": 2.795187465025182e-05, "loss": 0.5986, "step": 987 }, { "epoch": 1.490566037735849, "grad_norm": 9.613076505885713, "learning_rate": 2.7923894795747063e-05, "loss": 0.8501, "step": 988 }, { "epoch": 1.4920754716981133, "grad_norm": 0.5243283003121673, "learning_rate": 2.7895914941242306e-05, "loss": 0.6533, "step": 989 }, { "epoch": 1.4935849056603774, "grad_norm": 0.4045389908997858, "learning_rate": 2.786793508673755e-05, "loss": 0.6253, "step": 990 }, { "epoch": 1.4950943396226415, "grad_norm": 0.4281589533591875, "learning_rate": 2.7839955232232796e-05, "loss": 0.6287, "step": 991 }, { "epoch": 1.4966037735849056, "grad_norm": 0.38192093777254366, "learning_rate": 2.781197537772804e-05, "loss": 0.6681, "step": 992 }, { "epoch": 1.4981132075471697, "grad_norm": 0.47848727765965543, "learning_rate": 2.7783995523223283e-05, "loss": 0.6019, "step": 993 }, { "epoch": 1.499622641509434, "grad_norm": 0.3985883342403703, "learning_rate": 2.775601566871852e-05, "loss": 0.6599, "step": 994 }, { "epoch": 1.5011320754716981, "grad_norm": 0.3434562172845685, "learning_rate": 2.7728035814213767e-05, "loss": 0.6081, "step": 995 }, { "epoch": 1.5026415094339622, "grad_norm": 0.3980403992472545, "learning_rate": 2.770005595970901e-05, "loss": 0.6172, "step": 996 }, { "epoch": 1.5041509433962266, "grad_norm": 0.3763385571813578, "learning_rate": 2.7672076105204253e-05, "loss": 0.6792, "step": 997 }, { "epoch": 1.5056603773584905, "grad_norm": 0.40146854330428877, "learning_rate": 2.7644096250699497e-05, "loss": 0.6224, "step": 998 }, { "epoch": 1.5071698113207548, "grad_norm": 0.33301446846068716, "learning_rate": 2.761611639619474e-05, "loss": 0.6887, "step": 999 }, { "epoch": 1.5086792452830189, "grad_norm": 0.38371280593670326, "learning_rate": 2.7588136541689987e-05, "loss": 0.6132, "step": 1000 }, { "epoch": 1.510188679245283, "grad_norm": 0.37104373493288634, "learning_rate": 2.756015668718523e-05, "loss": 0.6067, "step": 1001 }, { "epoch": 1.5116981132075473, "grad_norm": 0.47232500165135083, "learning_rate": 2.7532176832680473e-05, "loss": 0.6102, "step": 1002 }, { "epoch": 1.5132075471698112, "grad_norm": 0.33043113034271476, "learning_rate": 2.7504196978175717e-05, "loss": 0.6384, "step": 1003 }, { "epoch": 1.5147169811320755, "grad_norm": 0.3448322098677679, "learning_rate": 2.747621712367096e-05, "loss": 0.6998, "step": 1004 }, { "epoch": 1.5162264150943396, "grad_norm": 0.40306936202383525, "learning_rate": 2.74482372691662e-05, "loss": 0.6676, "step": 1005 }, { "epoch": 1.5177358490566037, "grad_norm": 0.32212746145926247, "learning_rate": 2.7420257414661444e-05, "loss": 0.7054, "step": 1006 }, { "epoch": 1.519245283018868, "grad_norm": 0.3630795836083071, "learning_rate": 2.7392277560156687e-05, "loss": 0.6069, "step": 1007 }, { "epoch": 1.520754716981132, "grad_norm": 0.35892977901686324, "learning_rate": 2.736429770565193e-05, "loss": 0.6303, "step": 1008 }, { "epoch": 1.5222641509433963, "grad_norm": 0.34033834989241846, "learning_rate": 2.7336317851147174e-05, "loss": 0.6073, "step": 1009 }, { "epoch": 1.5237735849056604, "grad_norm": 0.3606987139401938, "learning_rate": 2.730833799664242e-05, "loss": 0.6533, "step": 1010 }, { "epoch": 1.5252830188679245, "grad_norm": 0.35197301530590785, "learning_rate": 2.7280358142137664e-05, "loss": 0.6774, "step": 1011 }, { "epoch": 1.5267924528301888, "grad_norm": 0.3409953188778291, "learning_rate": 2.7252378287632907e-05, "loss": 0.6111, "step": 1012 }, { "epoch": 1.5283018867924527, "grad_norm": 0.35710307374963385, "learning_rate": 2.722439843312815e-05, "loss": 0.5887, "step": 1013 }, { "epoch": 1.529811320754717, "grad_norm": 0.39361169119324246, "learning_rate": 2.7196418578623394e-05, "loss": 0.6824, "step": 1014 }, { "epoch": 1.5313207547169811, "grad_norm": 0.42089198638548375, "learning_rate": 2.7168438724118634e-05, "loss": 0.6084, "step": 1015 }, { "epoch": 1.5328301886792453, "grad_norm": 0.4002037847566263, "learning_rate": 2.7140458869613877e-05, "loss": 0.6164, "step": 1016 }, { "epoch": 1.5343396226415096, "grad_norm": 0.4279739458874451, "learning_rate": 2.711247901510912e-05, "loss": 0.602, "step": 1017 }, { "epoch": 1.5358490566037735, "grad_norm": 0.3741065695840471, "learning_rate": 2.7084499160604364e-05, "loss": 0.5819, "step": 1018 }, { "epoch": 1.5373584905660378, "grad_norm": 0.4279718365040865, "learning_rate": 2.705651930609961e-05, "loss": 0.6296, "step": 1019 }, { "epoch": 1.538867924528302, "grad_norm": 0.448203149422696, "learning_rate": 2.7028539451594854e-05, "loss": 0.5985, "step": 1020 }, { "epoch": 1.540377358490566, "grad_norm": 0.39251867440111504, "learning_rate": 2.7000559597090097e-05, "loss": 0.6152, "step": 1021 }, { "epoch": 1.5418867924528303, "grad_norm": 0.5318193655500686, "learning_rate": 2.697257974258534e-05, "loss": 0.6533, "step": 1022 }, { "epoch": 1.5433962264150942, "grad_norm": 0.36282379983998825, "learning_rate": 2.6944599888080584e-05, "loss": 0.6422, "step": 1023 }, { "epoch": 1.5449056603773585, "grad_norm": 0.4679464718781621, "learning_rate": 2.6916620033575827e-05, "loss": 0.6914, "step": 1024 }, { "epoch": 1.5464150943396227, "grad_norm": 0.390523648842573, "learning_rate": 2.6888640179071067e-05, "loss": 0.6562, "step": 1025 }, { "epoch": 1.5479245283018868, "grad_norm": 0.3704983255661872, "learning_rate": 2.686066032456631e-05, "loss": 0.6257, "step": 1026 }, { "epoch": 1.549433962264151, "grad_norm": 0.41154051577412576, "learning_rate": 2.6832680470061554e-05, "loss": 0.6028, "step": 1027 }, { "epoch": 1.550943396226415, "grad_norm": 0.33250241430063915, "learning_rate": 2.68047006155568e-05, "loss": 0.6491, "step": 1028 }, { "epoch": 1.5524528301886793, "grad_norm": 0.4012914420070519, "learning_rate": 2.6776720761052044e-05, "loss": 0.6053, "step": 1029 }, { "epoch": 1.5539622641509434, "grad_norm": 0.47506656031425826, "learning_rate": 2.6748740906547288e-05, "loss": 0.6044, "step": 1030 }, { "epoch": 1.5554716981132075, "grad_norm": 0.3707491589866621, "learning_rate": 2.672076105204253e-05, "loss": 0.6703, "step": 1031 }, { "epoch": 1.5569811320754718, "grad_norm": 0.34186133320917916, "learning_rate": 2.6692781197537774e-05, "loss": 0.6596, "step": 1032 }, { "epoch": 1.5584905660377357, "grad_norm": 0.3491021187362257, "learning_rate": 2.6664801343033018e-05, "loss": 0.5844, "step": 1033 }, { "epoch": 1.56, "grad_norm": 0.7155210370076859, "learning_rate": 2.6636821488528264e-05, "loss": 0.6393, "step": 1034 }, { "epoch": 1.5615094339622642, "grad_norm": 0.37905853694030756, "learning_rate": 2.6608841634023508e-05, "loss": 0.5804, "step": 1035 }, { "epoch": 1.5630188679245283, "grad_norm": 0.39249010794322703, "learning_rate": 2.6580861779518744e-05, "loss": 0.6622, "step": 1036 }, { "epoch": 1.5645283018867926, "grad_norm": 0.3177813349439448, "learning_rate": 2.6552881925013988e-05, "loss": 0.6893, "step": 1037 }, { "epoch": 1.5660377358490565, "grad_norm": 0.3578675379157616, "learning_rate": 2.6524902070509235e-05, "loss": 0.6743, "step": 1038 }, { "epoch": 1.5675471698113208, "grad_norm": 0.38432047726232615, "learning_rate": 2.6496922216004478e-05, "loss": 0.5848, "step": 1039 }, { "epoch": 1.569056603773585, "grad_norm": 0.38664999715982123, "learning_rate": 2.646894236149972e-05, "loss": 0.6501, "step": 1040 }, { "epoch": 1.570566037735849, "grad_norm": 0.3030876699883648, "learning_rate": 2.6440962506994965e-05, "loss": 0.6048, "step": 1041 }, { "epoch": 1.5720754716981133, "grad_norm": 0.3282480223581648, "learning_rate": 2.6412982652490208e-05, "loss": 0.5784, "step": 1042 }, { "epoch": 1.5735849056603772, "grad_norm": 0.34287712921209795, "learning_rate": 2.638500279798545e-05, "loss": 0.5587, "step": 1043 }, { "epoch": 1.5750943396226416, "grad_norm": 0.4249512336319463, "learning_rate": 2.6357022943480698e-05, "loss": 0.6872, "step": 1044 }, { "epoch": 1.5766037735849057, "grad_norm": 0.36483078091450444, "learning_rate": 2.632904308897594e-05, "loss": 0.6272, "step": 1045 }, { "epoch": 1.5781132075471698, "grad_norm": 0.33642660623413323, "learning_rate": 2.6301063234471178e-05, "loss": 0.6152, "step": 1046 }, { "epoch": 1.579622641509434, "grad_norm": 0.3795901995942625, "learning_rate": 2.6273083379966425e-05, "loss": 0.6457, "step": 1047 }, { "epoch": 1.581132075471698, "grad_norm": 0.4999474857747963, "learning_rate": 2.6245103525461668e-05, "loss": 0.6492, "step": 1048 }, { "epoch": 1.5826415094339623, "grad_norm": 0.31486204696092474, "learning_rate": 2.621712367095691e-05, "loss": 0.6266, "step": 1049 }, { "epoch": 1.5841509433962264, "grad_norm": 0.3742304773869301, "learning_rate": 2.6189143816452155e-05, "loss": 0.6116, "step": 1050 }, { "epoch": 1.5856603773584905, "grad_norm": 0.41017165219333346, "learning_rate": 2.6161163961947398e-05, "loss": 0.6209, "step": 1051 }, { "epoch": 1.5871698113207549, "grad_norm": 0.32510313559158954, "learning_rate": 2.613318410744264e-05, "loss": 0.5938, "step": 1052 }, { "epoch": 1.5886792452830187, "grad_norm": 0.34975228019540705, "learning_rate": 2.610520425293789e-05, "loss": 0.6052, "step": 1053 }, { "epoch": 1.590188679245283, "grad_norm": 0.3593828761751314, "learning_rate": 2.6077224398433132e-05, "loss": 0.5994, "step": 1054 }, { "epoch": 1.5916981132075472, "grad_norm": 0.3379735940847511, "learning_rate": 2.6049244543928375e-05, "loss": 0.6682, "step": 1055 }, { "epoch": 1.5932075471698113, "grad_norm": 0.3688041258843097, "learning_rate": 2.602126468942362e-05, "loss": 0.7183, "step": 1056 }, { "epoch": 1.5947169811320756, "grad_norm": 0.4216449339373789, "learning_rate": 2.599328483491886e-05, "loss": 0.7043, "step": 1057 }, { "epoch": 1.5962264150943395, "grad_norm": 2.267511774259029, "learning_rate": 2.5965304980414102e-05, "loss": 0.683, "step": 1058 }, { "epoch": 1.5977358490566038, "grad_norm": 0.3895472450306408, "learning_rate": 2.5937325125909345e-05, "loss": 0.6855, "step": 1059 }, { "epoch": 1.599245283018868, "grad_norm": 0.41188597998599974, "learning_rate": 2.590934527140459e-05, "loss": 0.6083, "step": 1060 }, { "epoch": 1.600754716981132, "grad_norm": 0.3059560204490495, "learning_rate": 2.5881365416899832e-05, "loss": 0.5571, "step": 1061 }, { "epoch": 1.6022641509433964, "grad_norm": 0.34622860668786465, "learning_rate": 2.585338556239508e-05, "loss": 0.5771, "step": 1062 }, { "epoch": 1.6037735849056602, "grad_norm": 0.34479433863653974, "learning_rate": 2.5825405707890322e-05, "loss": 0.6081, "step": 1063 }, { "epoch": 1.6052830188679246, "grad_norm": 0.36272238390017086, "learning_rate": 2.5797425853385565e-05, "loss": 0.6959, "step": 1064 }, { "epoch": 1.6067924528301887, "grad_norm": 0.4094790107793089, "learning_rate": 2.576944599888081e-05, "loss": 0.6309, "step": 1065 }, { "epoch": 1.6083018867924528, "grad_norm": 0.3788756955199532, "learning_rate": 2.5741466144376052e-05, "loss": 0.5994, "step": 1066 }, { "epoch": 1.6098113207547171, "grad_norm": 0.363138476827497, "learning_rate": 2.5713486289871292e-05, "loss": 0.6657, "step": 1067 }, { "epoch": 1.611320754716981, "grad_norm": 0.35600650897268843, "learning_rate": 2.5685506435366535e-05, "loss": 0.6238, "step": 1068 }, { "epoch": 1.6128301886792453, "grad_norm": 0.31529869654986054, "learning_rate": 2.565752658086178e-05, "loss": 0.6157, "step": 1069 }, { "epoch": 1.6143396226415094, "grad_norm": 0.41343618537042476, "learning_rate": 2.5629546726357022e-05, "loss": 0.6847, "step": 1070 }, { "epoch": 1.6158490566037735, "grad_norm": 0.36948590263079245, "learning_rate": 2.560156687185227e-05, "loss": 0.6373, "step": 1071 }, { "epoch": 1.6173584905660379, "grad_norm": 0.3405631397524817, "learning_rate": 2.5573587017347512e-05, "loss": 0.6454, "step": 1072 }, { "epoch": 1.6188679245283017, "grad_norm": 0.3876899509134273, "learning_rate": 2.5545607162842756e-05, "loss": 0.6897, "step": 1073 }, { "epoch": 1.620377358490566, "grad_norm": 0.36648572255209627, "learning_rate": 2.5517627308338e-05, "loss": 0.6366, "step": 1074 }, { "epoch": 1.6218867924528302, "grad_norm": 0.39363149190822344, "learning_rate": 2.5489647453833242e-05, "loss": 0.6312, "step": 1075 }, { "epoch": 1.6233962264150943, "grad_norm": 0.393083017888767, "learning_rate": 2.5461667599328486e-05, "loss": 0.621, "step": 1076 }, { "epoch": 1.6249056603773586, "grad_norm": 0.42789295095166247, "learning_rate": 2.5433687744823726e-05, "loss": 0.6328, "step": 1077 }, { "epoch": 1.6264150943396225, "grad_norm": 0.34143614581342263, "learning_rate": 2.540570789031897e-05, "loss": 0.5833, "step": 1078 }, { "epoch": 1.6279245283018868, "grad_norm": 0.48993095092983346, "learning_rate": 2.5377728035814212e-05, "loss": 0.6499, "step": 1079 }, { "epoch": 1.629433962264151, "grad_norm": 0.38482007687414094, "learning_rate": 2.5349748181309456e-05, "loss": 0.624, "step": 1080 }, { "epoch": 1.630943396226415, "grad_norm": 0.3620296499170073, "learning_rate": 2.5321768326804703e-05, "loss": 0.6196, "step": 1081 }, { "epoch": 1.6324528301886794, "grad_norm": 0.4413925417638725, "learning_rate": 2.5293788472299946e-05, "loss": 0.6562, "step": 1082 }, { "epoch": 1.6339622641509433, "grad_norm": 0.31728101683770593, "learning_rate": 2.526580861779519e-05, "loss": 0.5931, "step": 1083 }, { "epoch": 1.6354716981132076, "grad_norm": 0.3242208210115639, "learning_rate": 2.5237828763290433e-05, "loss": 0.6428, "step": 1084 }, { "epoch": 1.6369811320754717, "grad_norm": 0.34313826598204494, "learning_rate": 2.5209848908785676e-05, "loss": 0.5892, "step": 1085 }, { "epoch": 1.6384905660377358, "grad_norm": 0.27873747077218364, "learning_rate": 2.518186905428092e-05, "loss": 0.6435, "step": 1086 }, { "epoch": 1.6400000000000001, "grad_norm": 0.34391194671727837, "learning_rate": 2.5153889199776166e-05, "loss": 0.6208, "step": 1087 }, { "epoch": 1.641509433962264, "grad_norm": 0.3202174999860832, "learning_rate": 2.5125909345271403e-05, "loss": 0.5982, "step": 1088 }, { "epoch": 1.6430188679245283, "grad_norm": 0.353421412519148, "learning_rate": 2.5097929490766646e-05, "loss": 0.6558, "step": 1089 }, { "epoch": 1.6445283018867924, "grad_norm": 0.33590448835443065, "learning_rate": 2.5069949636261893e-05, "loss": 0.5849, "step": 1090 }, { "epoch": 1.6460377358490565, "grad_norm": 0.3049009352630597, "learning_rate": 2.5041969781757136e-05, "loss": 0.6268, "step": 1091 }, { "epoch": 1.6475471698113209, "grad_norm": 0.32814946592178634, "learning_rate": 2.501398992725238e-05, "loss": 0.571, "step": 1092 }, { "epoch": 1.6490566037735848, "grad_norm": 0.32851218281151046, "learning_rate": 2.4986010072747623e-05, "loss": 0.5919, "step": 1093 }, { "epoch": 1.650566037735849, "grad_norm": 0.29866330007475594, "learning_rate": 2.4958030218242866e-05, "loss": 0.609, "step": 1094 }, { "epoch": 1.6520754716981132, "grad_norm": 0.2903575786430782, "learning_rate": 2.493005036373811e-05, "loss": 0.633, "step": 1095 }, { "epoch": 1.6535849056603773, "grad_norm": 0.34536143070473535, "learning_rate": 2.4902070509233353e-05, "loss": 0.6005, "step": 1096 }, { "epoch": 1.6550943396226416, "grad_norm": 0.3026599205345073, "learning_rate": 2.4874090654728596e-05, "loss": 0.6478, "step": 1097 }, { "epoch": 1.6566037735849055, "grad_norm": 0.306175601002009, "learning_rate": 2.484611080022384e-05, "loss": 0.6195, "step": 1098 }, { "epoch": 1.6581132075471698, "grad_norm": 0.3710617396392141, "learning_rate": 2.4818130945719083e-05, "loss": 0.6228, "step": 1099 }, { "epoch": 1.659622641509434, "grad_norm": 0.3377466953849946, "learning_rate": 2.4790151091214326e-05, "loss": 0.6511, "step": 1100 }, { "epoch": 1.661132075471698, "grad_norm": 0.31549413830814466, "learning_rate": 2.476217123670957e-05, "loss": 0.5633, "step": 1101 }, { "epoch": 1.6626415094339624, "grad_norm": 0.357754789091578, "learning_rate": 2.4734191382204813e-05, "loss": 0.6337, "step": 1102 }, { "epoch": 1.6641509433962263, "grad_norm": 0.3306548434162944, "learning_rate": 2.4706211527700057e-05, "loss": 0.5843, "step": 1103 }, { "epoch": 1.6656603773584906, "grad_norm": 6.054734215990575, "learning_rate": 2.46782316731953e-05, "loss": 0.661, "step": 1104 }, { "epoch": 1.6671698113207547, "grad_norm": 0.4115492900895262, "learning_rate": 2.4650251818690547e-05, "loss": 0.6434, "step": 1105 }, { "epoch": 1.6686792452830188, "grad_norm": 0.822362998347998, "learning_rate": 2.4622271964185787e-05, "loss": 0.6846, "step": 1106 }, { "epoch": 1.6701886792452831, "grad_norm": 0.4448253668967885, "learning_rate": 2.459429210968103e-05, "loss": 0.5628, "step": 1107 }, { "epoch": 1.671698113207547, "grad_norm": 0.3770953555568863, "learning_rate": 2.4566312255176273e-05, "loss": 0.6269, "step": 1108 }, { "epoch": 1.6732075471698113, "grad_norm": 0.5299343272830751, "learning_rate": 2.4538332400671517e-05, "loss": 0.6237, "step": 1109 }, { "epoch": 1.6747169811320755, "grad_norm": 0.3702282688776482, "learning_rate": 2.4510352546166763e-05, "loss": 0.6586, "step": 1110 }, { "epoch": 1.6762264150943396, "grad_norm": 0.39024405698123815, "learning_rate": 2.4482372691662003e-05, "loss": 0.6429, "step": 1111 }, { "epoch": 1.677735849056604, "grad_norm": 0.4537391331752708, "learning_rate": 2.4454392837157247e-05, "loss": 0.639, "step": 1112 }, { "epoch": 1.6792452830188678, "grad_norm": 0.3149708264349129, "learning_rate": 2.442641298265249e-05, "loss": 0.5799, "step": 1113 }, { "epoch": 1.680754716981132, "grad_norm": 0.332009587191511, "learning_rate": 2.4398433128147733e-05, "loss": 0.5921, "step": 1114 }, { "epoch": 1.6822641509433962, "grad_norm": 0.4987660993726321, "learning_rate": 2.437045327364298e-05, "loss": 0.5617, "step": 1115 }, { "epoch": 1.6837735849056603, "grad_norm": 0.38823515270339287, "learning_rate": 2.434247341913822e-05, "loss": 0.6943, "step": 1116 }, { "epoch": 1.6852830188679246, "grad_norm": 0.41948680248507986, "learning_rate": 2.4314493564633464e-05, "loss": 0.6714, "step": 1117 }, { "epoch": 1.6867924528301885, "grad_norm": 0.41536448849439084, "learning_rate": 2.4286513710128707e-05, "loss": 0.6236, "step": 1118 }, { "epoch": 1.6883018867924529, "grad_norm": 0.38203854076076366, "learning_rate": 2.4258533855623954e-05, "loss": 0.6081, "step": 1119 }, { "epoch": 1.689811320754717, "grad_norm": 0.3167935649962668, "learning_rate": 2.4230554001119197e-05, "loss": 0.6062, "step": 1120 }, { "epoch": 1.691320754716981, "grad_norm": 0.39343182435379037, "learning_rate": 2.420257414661444e-05, "loss": 0.6464, "step": 1121 }, { "epoch": 1.6928301886792454, "grad_norm": 0.38741191325073565, "learning_rate": 2.417459429210968e-05, "loss": 0.5858, "step": 1122 }, { "epoch": 1.6943396226415093, "grad_norm": 0.3110251540082128, "learning_rate": 2.4146614437604924e-05, "loss": 0.6742, "step": 1123 }, { "epoch": 1.6958490566037736, "grad_norm": 0.4205404270555864, "learning_rate": 2.411863458310017e-05, "loss": 0.6817, "step": 1124 }, { "epoch": 1.6973584905660377, "grad_norm": 0.3370144854328815, "learning_rate": 2.4090654728595414e-05, "loss": 0.6306, "step": 1125 }, { "epoch": 1.6988679245283018, "grad_norm": 0.31428186192021673, "learning_rate": 2.4062674874090657e-05, "loss": 0.6701, "step": 1126 }, { "epoch": 1.7003773584905661, "grad_norm": 0.35546961013819645, "learning_rate": 2.4034695019585897e-05, "loss": 0.5598, "step": 1127 }, { "epoch": 1.70188679245283, "grad_norm": 0.30842915313942293, "learning_rate": 2.400671516508114e-05, "loss": 0.6555, "step": 1128 }, { "epoch": 1.7033962264150944, "grad_norm": 0.3503526140590797, "learning_rate": 2.3978735310576387e-05, "loss": 0.6268, "step": 1129 }, { "epoch": 1.7049056603773585, "grad_norm": 0.3525131623141097, "learning_rate": 2.395075545607163e-05, "loss": 0.651, "step": 1130 }, { "epoch": 1.7064150943396226, "grad_norm": 0.362462685268756, "learning_rate": 2.3922775601566874e-05, "loss": 0.654, "step": 1131 }, { "epoch": 1.707924528301887, "grad_norm": 0.35270111632126117, "learning_rate": 2.3894795747062114e-05, "loss": 0.6304, "step": 1132 }, { "epoch": 1.7094339622641508, "grad_norm": 0.35642789633544897, "learning_rate": 2.386681589255736e-05, "loss": 0.569, "step": 1133 }, { "epoch": 1.7109433962264151, "grad_norm": 0.3548672747435073, "learning_rate": 2.3838836038052604e-05, "loss": 0.6078, "step": 1134 }, { "epoch": 1.7124528301886792, "grad_norm": 0.41883631687319944, "learning_rate": 2.3810856183547848e-05, "loss": 0.6137, "step": 1135 }, { "epoch": 1.7139622641509433, "grad_norm": 0.37255769585734255, "learning_rate": 2.378287632904309e-05, "loss": 0.6341, "step": 1136 }, { "epoch": 1.7154716981132077, "grad_norm": 0.343650289950691, "learning_rate": 2.375489647453833e-05, "loss": 0.6816, "step": 1137 }, { "epoch": 1.7169811320754715, "grad_norm": 0.3135936860965841, "learning_rate": 2.3726916620033578e-05, "loss": 0.661, "step": 1138 }, { "epoch": 1.7184905660377359, "grad_norm": 0.3354145369379278, "learning_rate": 2.369893676552882e-05, "loss": 0.6131, "step": 1139 }, { "epoch": 1.72, "grad_norm": 0.37591403645106586, "learning_rate": 2.3670956911024064e-05, "loss": 0.7752, "step": 1140 }, { "epoch": 1.721509433962264, "grad_norm": 3.276557779806758, "learning_rate": 2.3642977056519308e-05, "loss": 0.6548, "step": 1141 }, { "epoch": 1.7230188679245284, "grad_norm": 0.40078418503283786, "learning_rate": 2.361499720201455e-05, "loss": 0.6305, "step": 1142 }, { "epoch": 1.7245283018867923, "grad_norm": 0.3490281170396962, "learning_rate": 2.3587017347509794e-05, "loss": 0.6477, "step": 1143 }, { "epoch": 1.7260377358490566, "grad_norm": 0.3130044965355929, "learning_rate": 2.3559037493005038e-05, "loss": 0.6318, "step": 1144 }, { "epoch": 1.7275471698113207, "grad_norm": 0.31341978043560415, "learning_rate": 2.353105763850028e-05, "loss": 0.5936, "step": 1145 }, { "epoch": 1.7290566037735848, "grad_norm": 0.4007373637031972, "learning_rate": 2.3503077783995524e-05, "loss": 0.6289, "step": 1146 }, { "epoch": 1.7305660377358492, "grad_norm": 0.3145194497545859, "learning_rate": 2.3475097929490768e-05, "loss": 0.5794, "step": 1147 }, { "epoch": 1.732075471698113, "grad_norm": 0.41907955688569665, "learning_rate": 2.344711807498601e-05, "loss": 0.6204, "step": 1148 }, { "epoch": 1.7335849056603774, "grad_norm": 0.3368506672709832, "learning_rate": 2.3419138220481255e-05, "loss": 0.6453, "step": 1149 }, { "epoch": 1.7350943396226415, "grad_norm": 0.3578725045779005, "learning_rate": 2.3391158365976498e-05, "loss": 0.6131, "step": 1150 }, { "epoch": 1.7366037735849056, "grad_norm": 0.387646182019032, "learning_rate": 2.336317851147174e-05, "loss": 0.6591, "step": 1151 }, { "epoch": 1.73811320754717, "grad_norm": 0.370813360320474, "learning_rate": 2.3335198656966985e-05, "loss": 0.6132, "step": 1152 }, { "epoch": 1.7396226415094338, "grad_norm": 0.35325542219207184, "learning_rate": 2.3307218802462228e-05, "loss": 0.6152, "step": 1153 }, { "epoch": 1.7411320754716981, "grad_norm": 0.34775995134307486, "learning_rate": 2.327923894795747e-05, "loss": 0.696, "step": 1154 }, { "epoch": 1.7426415094339622, "grad_norm": 0.431893921598768, "learning_rate": 2.3251259093452715e-05, "loss": 0.5584, "step": 1155 }, { "epoch": 1.7441509433962263, "grad_norm": 0.4818685295387096, "learning_rate": 2.3223279238947958e-05, "loss": 0.648, "step": 1156 }, { "epoch": 1.7456603773584907, "grad_norm": 0.3148571492339726, "learning_rate": 2.31952993844432e-05, "loss": 0.6053, "step": 1157 }, { "epoch": 1.7471698113207546, "grad_norm": 0.49474292238819984, "learning_rate": 2.3167319529938445e-05, "loss": 0.6756, "step": 1158 }, { "epoch": 1.7486792452830189, "grad_norm": 0.42970659132808886, "learning_rate": 2.3139339675433688e-05, "loss": 0.6378, "step": 1159 }, { "epoch": 1.750188679245283, "grad_norm": 0.3467646221311692, "learning_rate": 2.311135982092893e-05, "loss": 0.6767, "step": 1160 }, { "epoch": 1.751698113207547, "grad_norm": 0.49298106645190726, "learning_rate": 2.3083379966424175e-05, "loss": 0.6482, "step": 1161 }, { "epoch": 1.7532075471698114, "grad_norm": 0.38860132343453263, "learning_rate": 2.305540011191942e-05, "loss": 0.6828, "step": 1162 }, { "epoch": 1.7547169811320755, "grad_norm": 0.36904627662475137, "learning_rate": 2.302742025741466e-05, "loss": 0.5228, "step": 1163 }, { "epoch": 1.7562264150943396, "grad_norm": 0.3699438829613781, "learning_rate": 2.2999440402909905e-05, "loss": 0.7172, "step": 1164 }, { "epoch": 1.7577358490566037, "grad_norm": 0.46842274714832544, "learning_rate": 2.297146054840515e-05, "loss": 0.6039, "step": 1165 }, { "epoch": 1.7592452830188678, "grad_norm": 0.33529915395896437, "learning_rate": 2.2943480693900392e-05, "loss": 0.6698, "step": 1166 }, { "epoch": 1.7607547169811322, "grad_norm": 1.0369116423976208, "learning_rate": 2.291550083939564e-05, "loss": 0.632, "step": 1167 }, { "epoch": 1.7622641509433963, "grad_norm": 0.40116881234876345, "learning_rate": 2.288752098489088e-05, "loss": 0.5547, "step": 1168 }, { "epoch": 1.7637735849056604, "grad_norm": 0.32276429823331587, "learning_rate": 2.2859541130386122e-05, "loss": 0.6588, "step": 1169 }, { "epoch": 1.7652830188679245, "grad_norm": 0.43292042576567075, "learning_rate": 2.2831561275881365e-05, "loss": 0.6191, "step": 1170 }, { "epoch": 1.7667924528301886, "grad_norm": 0.3272559617175695, "learning_rate": 2.280358142137661e-05, "loss": 0.6573, "step": 1171 }, { "epoch": 1.768301886792453, "grad_norm": 0.3315275623212996, "learning_rate": 2.2775601566871855e-05, "loss": 0.5584, "step": 1172 }, { "epoch": 1.769811320754717, "grad_norm": 0.3839260631685633, "learning_rate": 2.27476217123671e-05, "loss": 0.6137, "step": 1173 }, { "epoch": 1.7713207547169811, "grad_norm": 0.29910749329288777, "learning_rate": 2.271964185786234e-05, "loss": 0.593, "step": 1174 }, { "epoch": 1.7728301886792452, "grad_norm": 0.3966476953266179, "learning_rate": 2.2691662003357582e-05, "loss": 0.5881, "step": 1175 }, { "epoch": 1.7743396226415094, "grad_norm": 0.3834221903344162, "learning_rate": 2.266368214885283e-05, "loss": 0.652, "step": 1176 }, { "epoch": 1.7758490566037737, "grad_norm": 0.3086183820074408, "learning_rate": 2.2635702294348072e-05, "loss": 0.6265, "step": 1177 }, { "epoch": 1.7773584905660378, "grad_norm": 0.3760096804332004, "learning_rate": 2.2607722439843315e-05, "loss": 0.5765, "step": 1178 }, { "epoch": 1.778867924528302, "grad_norm": 0.28869156048512906, "learning_rate": 2.2579742585338555e-05, "loss": 0.6032, "step": 1179 }, { "epoch": 1.780377358490566, "grad_norm": 0.3723280148627601, "learning_rate": 2.25517627308338e-05, "loss": 0.6364, "step": 1180 }, { "epoch": 1.78188679245283, "grad_norm": 0.3073245665980812, "learning_rate": 2.2523782876329046e-05, "loss": 0.5801, "step": 1181 }, { "epoch": 1.7833962264150944, "grad_norm": 0.30275210444327105, "learning_rate": 2.249580302182429e-05, "loss": 0.6272, "step": 1182 }, { "epoch": 1.7849056603773585, "grad_norm": 0.3482396962287078, "learning_rate": 2.2467823167319532e-05, "loss": 0.6292, "step": 1183 }, { "epoch": 1.7864150943396226, "grad_norm": 0.3036844211927974, "learning_rate": 2.2439843312814772e-05, "loss": 0.6301, "step": 1184 }, { "epoch": 1.7879245283018868, "grad_norm": 0.36892257197797645, "learning_rate": 2.2411863458310016e-05, "loss": 0.601, "step": 1185 }, { "epoch": 1.7894339622641509, "grad_norm": 0.30478398813561736, "learning_rate": 2.2383883603805262e-05, "loss": 0.5778, "step": 1186 }, { "epoch": 1.7909433962264152, "grad_norm": 0.41357451178016175, "learning_rate": 2.2355903749300506e-05, "loss": 0.6415, "step": 1187 }, { "epoch": 1.7924528301886793, "grad_norm": 0.40936383915996677, "learning_rate": 2.232792389479575e-05, "loss": 0.6318, "step": 1188 }, { "epoch": 1.7939622641509434, "grad_norm": 0.3305974308413721, "learning_rate": 2.229994404029099e-05, "loss": 0.6276, "step": 1189 }, { "epoch": 1.7954716981132075, "grad_norm": 0.44756079605901494, "learning_rate": 2.2271964185786236e-05, "loss": 0.5627, "step": 1190 }, { "epoch": 1.7969811320754716, "grad_norm": 0.32668591688032894, "learning_rate": 2.224398433128148e-05, "loss": 0.5553, "step": 1191 }, { "epoch": 1.798490566037736, "grad_norm": 0.4056153389625477, "learning_rate": 2.2216004476776723e-05, "loss": 0.6964, "step": 1192 }, { "epoch": 1.8, "grad_norm": 0.36423242716401244, "learning_rate": 2.2188024622271966e-05, "loss": 0.6385, "step": 1193 }, { "epoch": 1.8015094339622642, "grad_norm": 0.3714533894188977, "learning_rate": 2.2160044767767206e-05, "loss": 0.5732, "step": 1194 }, { "epoch": 1.8030188679245283, "grad_norm": 0.4092859262603853, "learning_rate": 2.2132064913262453e-05, "loss": 0.6767, "step": 1195 }, { "epoch": 1.8045283018867924, "grad_norm": 0.4137797947792045, "learning_rate": 2.2104085058757696e-05, "loss": 0.5931, "step": 1196 }, { "epoch": 1.8060377358490567, "grad_norm": 0.34744989842603025, "learning_rate": 2.207610520425294e-05, "loss": 0.6201, "step": 1197 }, { "epoch": 1.8075471698113208, "grad_norm": 0.3331997614705039, "learning_rate": 2.2048125349748183e-05, "loss": 0.6613, "step": 1198 }, { "epoch": 1.809056603773585, "grad_norm": 0.511957469574061, "learning_rate": 2.2020145495243423e-05, "loss": 0.5911, "step": 1199 }, { "epoch": 1.810566037735849, "grad_norm": 0.33360941658253085, "learning_rate": 2.199216564073867e-05, "loss": 0.5849, "step": 1200 }, { "epoch": 1.8120754716981131, "grad_norm": 0.3741495659072703, "learning_rate": 2.1964185786233913e-05, "loss": 0.609, "step": 1201 }, { "epoch": 1.8135849056603774, "grad_norm": 0.4742678582136245, "learning_rate": 2.1936205931729156e-05, "loss": 0.6193, "step": 1202 }, { "epoch": 1.8150943396226416, "grad_norm": 0.44917206739099247, "learning_rate": 2.19082260772244e-05, "loss": 0.692, "step": 1203 }, { "epoch": 1.8166037735849057, "grad_norm": 0.394176326005825, "learning_rate": 2.1880246222719643e-05, "loss": 0.6738, "step": 1204 }, { "epoch": 1.8181132075471698, "grad_norm": 0.43066185484760017, "learning_rate": 2.1852266368214886e-05, "loss": 0.642, "step": 1205 }, { "epoch": 1.8196226415094339, "grad_norm": 0.3418450949122748, "learning_rate": 2.182428651371013e-05, "loss": 0.6076, "step": 1206 }, { "epoch": 1.8211320754716982, "grad_norm": 0.3652282110342225, "learning_rate": 2.1796306659205373e-05, "loss": 0.6063, "step": 1207 }, { "epoch": 1.8226415094339623, "grad_norm": 0.30250644712093494, "learning_rate": 2.1768326804700616e-05, "loss": 0.5631, "step": 1208 }, { "epoch": 1.8241509433962264, "grad_norm": 0.3288737231812076, "learning_rate": 2.174034695019586e-05, "loss": 0.592, "step": 1209 }, { "epoch": 1.8256603773584905, "grad_norm": 0.33567232845354483, "learning_rate": 2.1712367095691103e-05, "loss": 0.6278, "step": 1210 }, { "epoch": 1.8271698113207546, "grad_norm": 0.3277915063716108, "learning_rate": 2.1684387241186346e-05, "loss": 0.6181, "step": 1211 }, { "epoch": 1.828679245283019, "grad_norm": 0.3265756953441019, "learning_rate": 2.165640738668159e-05, "loss": 0.6163, "step": 1212 }, { "epoch": 1.830188679245283, "grad_norm": 0.36101523587088497, "learning_rate": 2.1628427532176833e-05, "loss": 0.6528, "step": 1213 }, { "epoch": 1.8316981132075472, "grad_norm": 0.3559998541778918, "learning_rate": 2.1600447677672077e-05, "loss": 0.5998, "step": 1214 }, { "epoch": 1.8332075471698113, "grad_norm": 0.293141053328167, "learning_rate": 2.157246782316732e-05, "loss": 0.5898, "step": 1215 }, { "epoch": 1.8347169811320754, "grad_norm": 0.29122255758943166, "learning_rate": 2.1544487968662563e-05, "loss": 0.6354, "step": 1216 }, { "epoch": 1.8362264150943397, "grad_norm": 1.045084927109783, "learning_rate": 2.1516508114157807e-05, "loss": 0.5911, "step": 1217 }, { "epoch": 1.8377358490566038, "grad_norm": 0.31757940698589965, "learning_rate": 2.148852825965305e-05, "loss": 0.6768, "step": 1218 }, { "epoch": 1.839245283018868, "grad_norm": 0.32235934503323127, "learning_rate": 2.1460548405148297e-05, "loss": 0.6418, "step": 1219 }, { "epoch": 1.840754716981132, "grad_norm": 0.3428069649816916, "learning_rate": 2.1432568550643537e-05, "loss": 0.6437, "step": 1220 }, { "epoch": 1.8422641509433961, "grad_norm": 0.3059598449926168, "learning_rate": 2.140458869613878e-05, "loss": 0.6399, "step": 1221 }, { "epoch": 1.8437735849056605, "grad_norm": 0.29285459882489406, "learning_rate": 2.1376608841634023e-05, "loss": 0.6112, "step": 1222 }, { "epoch": 1.8452830188679246, "grad_norm": 1.0373182604259723, "learning_rate": 2.1348628987129267e-05, "loss": 0.6441, "step": 1223 }, { "epoch": 1.8467924528301887, "grad_norm": 0.2824305236439785, "learning_rate": 2.1320649132624514e-05, "loss": 0.6141, "step": 1224 }, { "epoch": 1.8483018867924528, "grad_norm": 0.40148675247914556, "learning_rate": 2.1292669278119754e-05, "loss": 0.6118, "step": 1225 }, { "epoch": 1.8498113207547169, "grad_norm": 0.2981810451237384, "learning_rate": 2.1264689423614997e-05, "loss": 0.6731, "step": 1226 }, { "epoch": 1.8513207547169812, "grad_norm": 2.584492019717478, "learning_rate": 2.123670956911024e-05, "loss": 0.616, "step": 1227 }, { "epoch": 1.8528301886792453, "grad_norm": 0.3591861226985177, "learning_rate": 2.1208729714605484e-05, "loss": 0.6236, "step": 1228 }, { "epoch": 1.8543396226415094, "grad_norm": 0.3095222847651958, "learning_rate": 2.118074986010073e-05, "loss": 0.6523, "step": 1229 }, { "epoch": 1.8558490566037738, "grad_norm": 0.3424147285170496, "learning_rate": 2.1152770005595974e-05, "loss": 0.6396, "step": 1230 }, { "epoch": 1.8573584905660376, "grad_norm": 0.314080508242106, "learning_rate": 2.1124790151091214e-05, "loss": 0.5968, "step": 1231 }, { "epoch": 1.858867924528302, "grad_norm": 0.29525316784574474, "learning_rate": 2.1096810296586457e-05, "loss": 0.6179, "step": 1232 }, { "epoch": 1.860377358490566, "grad_norm": 0.3262826338661236, "learning_rate": 2.1068830442081704e-05, "loss": 0.6361, "step": 1233 }, { "epoch": 1.8618867924528302, "grad_norm": 0.30483158690049045, "learning_rate": 2.1040850587576947e-05, "loss": 0.5972, "step": 1234 }, { "epoch": 1.8633962264150945, "grad_norm": 0.2998154483089429, "learning_rate": 2.101287073307219e-05, "loss": 0.6319, "step": 1235 }, { "epoch": 1.8649056603773584, "grad_norm": 0.3132929823968501, "learning_rate": 2.098489087856743e-05, "loss": 0.6272, "step": 1236 }, { "epoch": 1.8664150943396227, "grad_norm": 0.3644256019767862, "learning_rate": 2.0956911024062674e-05, "loss": 0.6275, "step": 1237 }, { "epoch": 1.8679245283018868, "grad_norm": 0.3619907633958512, "learning_rate": 2.092893116955792e-05, "loss": 0.6324, "step": 1238 }, { "epoch": 1.869433962264151, "grad_norm": 0.30644024116691077, "learning_rate": 2.0900951315053164e-05, "loss": 0.6487, "step": 1239 }, { "epoch": 1.8709433962264153, "grad_norm": 0.32101407266001664, "learning_rate": 2.0872971460548407e-05, "loss": 0.5942, "step": 1240 }, { "epoch": 1.8724528301886791, "grad_norm": 0.3574797990980977, "learning_rate": 2.0844991606043647e-05, "loss": 0.6058, "step": 1241 }, { "epoch": 1.8739622641509435, "grad_norm": 0.2731432397008649, "learning_rate": 2.081701175153889e-05, "loss": 0.5838, "step": 1242 }, { "epoch": 1.8754716981132076, "grad_norm": 0.3081971583676486, "learning_rate": 2.0789031897034137e-05, "loss": 0.5985, "step": 1243 }, { "epoch": 1.8769811320754717, "grad_norm": 0.4638755248661489, "learning_rate": 2.076105204252938e-05, "loss": 0.5448, "step": 1244 }, { "epoch": 1.878490566037736, "grad_norm": 0.36428569342156447, "learning_rate": 2.0733072188024624e-05, "loss": 0.6677, "step": 1245 }, { "epoch": 1.88, "grad_norm": 0.32794651307072614, "learning_rate": 2.0705092333519864e-05, "loss": 0.6155, "step": 1246 }, { "epoch": 1.8815094339622642, "grad_norm": 0.817788237288649, "learning_rate": 2.067711247901511e-05, "loss": 0.6651, "step": 1247 }, { "epoch": 1.8830188679245283, "grad_norm": 0.32900653987481365, "learning_rate": 2.0649132624510354e-05, "loss": 0.6534, "step": 1248 }, { "epoch": 1.8845283018867924, "grad_norm": 0.354259188328193, "learning_rate": 2.0621152770005598e-05, "loss": 0.5931, "step": 1249 }, { "epoch": 1.8860377358490568, "grad_norm": 0.3280985985620716, "learning_rate": 2.059317291550084e-05, "loss": 0.6111, "step": 1250 }, { "epoch": 1.8875471698113206, "grad_norm": 0.3875058797193474, "learning_rate": 2.056519306099608e-05, "loss": 0.5952, "step": 1251 }, { "epoch": 1.889056603773585, "grad_norm": 0.37671901941800245, "learning_rate": 2.0537213206491328e-05, "loss": 0.6522, "step": 1252 }, { "epoch": 1.890566037735849, "grad_norm": 0.4128531980352486, "learning_rate": 2.050923335198657e-05, "loss": 0.6864, "step": 1253 }, { "epoch": 1.8920754716981132, "grad_norm": 0.3164903698692737, "learning_rate": 2.0481253497481814e-05, "loss": 0.5493, "step": 1254 }, { "epoch": 1.8935849056603775, "grad_norm": 0.3960462076340831, "learning_rate": 2.0453273642977058e-05, "loss": 0.5976, "step": 1255 }, { "epoch": 1.8950943396226414, "grad_norm": 0.35480881312131624, "learning_rate": 2.04252937884723e-05, "loss": 0.5845, "step": 1256 }, { "epoch": 1.8966037735849057, "grad_norm": 0.34522057536500367, "learning_rate": 2.0397313933967545e-05, "loss": 0.6556, "step": 1257 }, { "epoch": 1.8981132075471698, "grad_norm": 0.337296443279804, "learning_rate": 2.0369334079462788e-05, "loss": 0.6814, "step": 1258 }, { "epoch": 1.899622641509434, "grad_norm": 0.2805168811056917, "learning_rate": 2.034135422495803e-05, "loss": 0.6272, "step": 1259 }, { "epoch": 1.9011320754716983, "grad_norm": 0.3449624394911455, "learning_rate": 2.0313374370453275e-05, "loss": 0.6474, "step": 1260 }, { "epoch": 1.9026415094339622, "grad_norm": 0.3280572211846723, "learning_rate": 2.0285394515948518e-05, "loss": 0.6347, "step": 1261 }, { "epoch": 1.9041509433962265, "grad_norm": 0.35714732927573933, "learning_rate": 2.025741466144376e-05, "loss": 0.5295, "step": 1262 }, { "epoch": 1.9056603773584906, "grad_norm": 0.35049276342588254, "learning_rate": 2.0229434806939005e-05, "loss": 0.586, "step": 1263 }, { "epoch": 1.9071698113207547, "grad_norm": 0.36879482492260923, "learning_rate": 2.0201454952434248e-05, "loss": 0.5682, "step": 1264 }, { "epoch": 1.908679245283019, "grad_norm": 0.300565774847212, "learning_rate": 2.017347509792949e-05, "loss": 0.6407, "step": 1265 }, { "epoch": 1.910188679245283, "grad_norm": 0.29997892836529605, "learning_rate": 2.0145495243424735e-05, "loss": 0.5913, "step": 1266 }, { "epoch": 1.9116981132075472, "grad_norm": 0.3222923618904655, "learning_rate": 2.0117515388919978e-05, "loss": 0.6426, "step": 1267 }, { "epoch": 1.9132075471698113, "grad_norm": 0.34653857778460406, "learning_rate": 2.008953553441522e-05, "loss": 0.629, "step": 1268 }, { "epoch": 1.9147169811320754, "grad_norm": 0.36204978834352647, "learning_rate": 2.0061555679910465e-05, "loss": 0.6626, "step": 1269 }, { "epoch": 1.9162264150943398, "grad_norm": 0.30703816406770273, "learning_rate": 2.0033575825405708e-05, "loss": 0.6269, "step": 1270 }, { "epoch": 1.9177358490566037, "grad_norm": 0.45526307662284454, "learning_rate": 2.000559597090095e-05, "loss": 0.6394, "step": 1271 }, { "epoch": 1.919245283018868, "grad_norm": 0.3428934663134498, "learning_rate": 1.9977616116396195e-05, "loss": 0.5814, "step": 1272 }, { "epoch": 1.920754716981132, "grad_norm": 0.3483198311610776, "learning_rate": 1.994963626189144e-05, "loss": 0.6114, "step": 1273 }, { "epoch": 1.9222641509433962, "grad_norm": 0.4806434783200171, "learning_rate": 1.9921656407386682e-05, "loss": 0.6376, "step": 1274 }, { "epoch": 1.9237735849056605, "grad_norm": 0.3746332153159746, "learning_rate": 1.9893676552881925e-05, "loss": 0.6103, "step": 1275 }, { "epoch": 1.9252830188679244, "grad_norm": 0.330995548312209, "learning_rate": 1.9865696698377172e-05, "loss": 0.5888, "step": 1276 }, { "epoch": 1.9267924528301887, "grad_norm": 0.33991119023837807, "learning_rate": 1.9837716843872412e-05, "loss": 0.6318, "step": 1277 }, { "epoch": 1.9283018867924528, "grad_norm": 0.7848019038955028, "learning_rate": 1.9809736989367655e-05, "loss": 0.6823, "step": 1278 }, { "epoch": 1.929811320754717, "grad_norm": 0.36581755414914946, "learning_rate": 1.97817571348629e-05, "loss": 0.631, "step": 1279 }, { "epoch": 1.9313207547169813, "grad_norm": 0.3516181876894466, "learning_rate": 1.9753777280358142e-05, "loss": 0.623, "step": 1280 }, { "epoch": 1.9328301886792452, "grad_norm": 0.34076006303037526, "learning_rate": 1.972579742585339e-05, "loss": 0.6362, "step": 1281 }, { "epoch": 1.9343396226415095, "grad_norm": 0.38953097373298856, "learning_rate": 1.9697817571348632e-05, "loss": 0.5828, "step": 1282 }, { "epoch": 1.9358490566037736, "grad_norm": 0.30915722554617087, "learning_rate": 1.9669837716843872e-05, "loss": 0.6231, "step": 1283 }, { "epoch": 1.9373584905660377, "grad_norm": 0.4125242098883407, "learning_rate": 1.9641857862339115e-05, "loss": 0.6137, "step": 1284 }, { "epoch": 1.938867924528302, "grad_norm": 0.306469481947522, "learning_rate": 1.961387800783436e-05, "loss": 0.6435, "step": 1285 }, { "epoch": 1.940377358490566, "grad_norm": 0.3902186395516134, "learning_rate": 1.9585898153329605e-05, "loss": 0.5786, "step": 1286 }, { "epoch": 1.9418867924528302, "grad_norm": 0.42033083017228917, "learning_rate": 1.955791829882485e-05, "loss": 0.6479, "step": 1287 }, { "epoch": 1.9433962264150944, "grad_norm": 0.376407214561354, "learning_rate": 1.952993844432009e-05, "loss": 0.5883, "step": 1288 }, { "epoch": 1.9449056603773585, "grad_norm": 0.3254588344744923, "learning_rate": 1.9501958589815332e-05, "loss": 0.6866, "step": 1289 }, { "epoch": 1.9464150943396228, "grad_norm": 0.5212042890070583, "learning_rate": 1.947397873531058e-05, "loss": 0.667, "step": 1290 }, { "epoch": 1.9479245283018867, "grad_norm": 0.48459611656974566, "learning_rate": 1.9445998880805822e-05, "loss": 0.5971, "step": 1291 }, { "epoch": 1.949433962264151, "grad_norm": 0.3517856389474132, "learning_rate": 1.9418019026301066e-05, "loss": 0.5731, "step": 1292 }, { "epoch": 1.950943396226415, "grad_norm": 0.37784085824797087, "learning_rate": 1.9390039171796306e-05, "loss": 0.6935, "step": 1293 }, { "epoch": 1.9524528301886792, "grad_norm": 0.4003678989850066, "learning_rate": 1.936205931729155e-05, "loss": 0.5787, "step": 1294 }, { "epoch": 1.9539622641509435, "grad_norm": 0.38383200350522656, "learning_rate": 1.9334079462786796e-05, "loss": 0.5821, "step": 1295 }, { "epoch": 1.9554716981132074, "grad_norm": 0.38567119387350274, "learning_rate": 1.930609960828204e-05, "loss": 0.6339, "step": 1296 }, { "epoch": 1.9569811320754718, "grad_norm": 0.3505634513365029, "learning_rate": 1.9278119753777282e-05, "loss": 0.6306, "step": 1297 }, { "epoch": 1.9584905660377359, "grad_norm": 0.36433000051048686, "learning_rate": 1.9250139899272522e-05, "loss": 0.6311, "step": 1298 }, { "epoch": 1.96, "grad_norm": 0.32889466881528095, "learning_rate": 1.9222160044767766e-05, "loss": 0.5904, "step": 1299 }, { "epoch": 1.9615094339622643, "grad_norm": 0.31496527251241535, "learning_rate": 1.9194180190263013e-05, "loss": 0.628, "step": 1300 }, { "epoch": 1.9630188679245282, "grad_norm": 3.183835781238465, "learning_rate": 1.9166200335758256e-05, "loss": 0.6688, "step": 1301 }, { "epoch": 1.9645283018867925, "grad_norm": 2.11988404901896, "learning_rate": 1.91382204812535e-05, "loss": 0.5751, "step": 1302 }, { "epoch": 1.9660377358490566, "grad_norm": 0.39102169211455845, "learning_rate": 1.911024062674874e-05, "loss": 0.6105, "step": 1303 }, { "epoch": 1.9675471698113207, "grad_norm": 0.3356373344184646, "learning_rate": 1.9082260772243986e-05, "loss": 0.6691, "step": 1304 }, { "epoch": 1.969056603773585, "grad_norm": 0.33936850902738813, "learning_rate": 1.905428091773923e-05, "loss": 0.5818, "step": 1305 }, { "epoch": 1.970566037735849, "grad_norm": 0.3125127190755324, "learning_rate": 1.9026301063234473e-05, "loss": 0.6903, "step": 1306 }, { "epoch": 1.9720754716981133, "grad_norm": 0.37220176903618524, "learning_rate": 1.8998321208729716e-05, "loss": 0.6372, "step": 1307 }, { "epoch": 1.9735849056603774, "grad_norm": 0.333323524072569, "learning_rate": 1.897034135422496e-05, "loss": 0.5685, "step": 1308 }, { "epoch": 1.9750943396226415, "grad_norm": 0.30576012281180814, "learning_rate": 1.8942361499720203e-05, "loss": 0.6038, "step": 1309 }, { "epoch": 1.9766037735849058, "grad_norm": 0.3945529751240445, "learning_rate": 1.8914381645215446e-05, "loss": 0.6457, "step": 1310 }, { "epoch": 1.9781132075471697, "grad_norm": 0.2774316641209328, "learning_rate": 1.888640179071069e-05, "loss": 0.6567, "step": 1311 }, { "epoch": 1.979622641509434, "grad_norm": 0.35117826532025936, "learning_rate": 1.8858421936205933e-05, "loss": 0.6308, "step": 1312 }, { "epoch": 1.9811320754716981, "grad_norm": 0.3419364217067856, "learning_rate": 1.8830442081701176e-05, "loss": 0.6162, "step": 1313 }, { "epoch": 1.9826415094339622, "grad_norm": 0.36693025313968536, "learning_rate": 1.880246222719642e-05, "loss": 0.634, "step": 1314 }, { "epoch": 1.9841509433962266, "grad_norm": 0.3183259269090567, "learning_rate": 1.8774482372691663e-05, "loss": 0.5759, "step": 1315 }, { "epoch": 1.9856603773584904, "grad_norm": 2.9550667184423682, "learning_rate": 1.8746502518186906e-05, "loss": 0.712, "step": 1316 }, { "epoch": 1.9871698113207548, "grad_norm": 0.36436478637302866, "learning_rate": 1.871852266368215e-05, "loss": 0.6046, "step": 1317 }, { "epoch": 1.9886792452830189, "grad_norm": 0.3513792552765972, "learning_rate": 1.8690542809177393e-05, "loss": 0.6677, "step": 1318 }, { "epoch": 1.990188679245283, "grad_norm": 0.32038155032867643, "learning_rate": 1.8662562954672636e-05, "loss": 0.6549, "step": 1319 }, { "epoch": 1.9916981132075473, "grad_norm": 0.31445105810605595, "learning_rate": 1.863458310016788e-05, "loss": 0.6003, "step": 1320 }, { "epoch": 1.9932075471698112, "grad_norm": 0.4328766003560714, "learning_rate": 1.8606603245663123e-05, "loss": 0.6476, "step": 1321 }, { "epoch": 1.9947169811320755, "grad_norm": 0.32805033665845823, "learning_rate": 1.8578623391158366e-05, "loss": 0.6121, "step": 1322 }, { "epoch": 1.9962264150943396, "grad_norm": 0.3016162458656436, "learning_rate": 1.855064353665361e-05, "loss": 0.5952, "step": 1323 }, { "epoch": 1.9977358490566037, "grad_norm": 0.3131440934220455, "learning_rate": 1.8522663682148853e-05, "loss": 0.5831, "step": 1324 }, { "epoch": 1.999245283018868, "grad_norm": 0.29401127063820814, "learning_rate": 1.8494683827644097e-05, "loss": 0.5822, "step": 1325 }, { "epoch": 2.0, "grad_norm": 0.49733230922246346, "learning_rate": 1.846670397313934e-05, "loss": 0.6198, "step": 1326 }, { "epoch": 2.0015094339622643, "grad_norm": 0.35646209321907746, "learning_rate": 1.8438724118634583e-05, "loss": 0.4999, "step": 1327 }, { "epoch": 2.003018867924528, "grad_norm": 0.3981125944090188, "learning_rate": 1.8410744264129827e-05, "loss": 0.4977, "step": 1328 }, { "epoch": 2.0045283018867925, "grad_norm": 0.3736032043806837, "learning_rate": 1.838276440962507e-05, "loss": 0.541, "step": 1329 }, { "epoch": 2.0060377358490564, "grad_norm": 0.30212541569145873, "learning_rate": 1.8354784555120313e-05, "loss": 0.4858, "step": 1330 }, { "epoch": 2.0075471698113208, "grad_norm": 0.3073556357585776, "learning_rate": 1.8326804700615557e-05, "loss": 0.5015, "step": 1331 }, { "epoch": 2.009056603773585, "grad_norm": 0.34113812336109905, "learning_rate": 1.82988248461108e-05, "loss": 0.5172, "step": 1332 }, { "epoch": 2.010566037735849, "grad_norm": 4.487795810249564, "learning_rate": 1.8270844991606047e-05, "loss": 0.5514, "step": 1333 }, { "epoch": 2.0120754716981133, "grad_norm": 0.42900541776859996, "learning_rate": 1.824286513710129e-05, "loss": 0.4934, "step": 1334 }, { "epoch": 2.013584905660377, "grad_norm": 3.065346721241103, "learning_rate": 1.821488528259653e-05, "loss": 0.5091, "step": 1335 }, { "epoch": 2.0150943396226415, "grad_norm": 0.40144469758266016, "learning_rate": 1.8186905428091774e-05, "loss": 0.5088, "step": 1336 }, { "epoch": 2.016603773584906, "grad_norm": 0.40845248070738877, "learning_rate": 1.8158925573587017e-05, "loss": 0.5574, "step": 1337 }, { "epoch": 2.0181132075471697, "grad_norm": 3.392575632261363, "learning_rate": 1.8130945719082264e-05, "loss": 0.5533, "step": 1338 }, { "epoch": 2.019622641509434, "grad_norm": 0.45092263682500694, "learning_rate": 1.8102965864577507e-05, "loss": 0.5609, "step": 1339 }, { "epoch": 2.021132075471698, "grad_norm": 0.37318211497573206, "learning_rate": 1.8074986010072747e-05, "loss": 0.5026, "step": 1340 }, { "epoch": 2.0226415094339623, "grad_norm": 0.34400471516145126, "learning_rate": 1.804700615556799e-05, "loss": 0.4826, "step": 1341 }, { "epoch": 2.0241509433962266, "grad_norm": 0.3500144022243311, "learning_rate": 1.8019026301063234e-05, "loss": 0.4663, "step": 1342 }, { "epoch": 2.0256603773584905, "grad_norm": 0.34048486473138423, "learning_rate": 1.799104644655848e-05, "loss": 0.4835, "step": 1343 }, { "epoch": 2.027169811320755, "grad_norm": 0.33541429775955595, "learning_rate": 1.7963066592053724e-05, "loss": 0.5427, "step": 1344 }, { "epoch": 2.0286792452830187, "grad_norm": 0.3600896985895227, "learning_rate": 1.7935086737548964e-05, "loss": 0.516, "step": 1345 }, { "epoch": 2.030188679245283, "grad_norm": 0.3429326727185839, "learning_rate": 1.7907106883044207e-05, "loss": 0.5144, "step": 1346 }, { "epoch": 2.0316981132075473, "grad_norm": 0.31543021450047887, "learning_rate": 1.7879127028539454e-05, "loss": 0.5449, "step": 1347 }, { "epoch": 2.0332075471698112, "grad_norm": 0.27664831389978795, "learning_rate": 1.7851147174034697e-05, "loss": 0.4857, "step": 1348 }, { "epoch": 2.0347169811320756, "grad_norm": 0.32371945804571334, "learning_rate": 1.782316731952994e-05, "loss": 0.5512, "step": 1349 }, { "epoch": 2.0362264150943394, "grad_norm": 0.3123789070192699, "learning_rate": 1.779518746502518e-05, "loss": 0.5339, "step": 1350 }, { "epoch": 2.0377358490566038, "grad_norm": 0.3018082555969256, "learning_rate": 1.7767207610520424e-05, "loss": 0.5477, "step": 1351 }, { "epoch": 2.039245283018868, "grad_norm": 0.330825753840397, "learning_rate": 1.773922775601567e-05, "loss": 0.5226, "step": 1352 }, { "epoch": 2.040754716981132, "grad_norm": 0.3064556391674686, "learning_rate": 1.7711247901510914e-05, "loss": 0.5009, "step": 1353 }, { "epoch": 2.0422641509433963, "grad_norm": 0.3279356419954503, "learning_rate": 1.7683268047006157e-05, "loss": 0.5007, "step": 1354 }, { "epoch": 2.04377358490566, "grad_norm": 0.3562043309709785, "learning_rate": 1.7655288192501397e-05, "loss": 0.4928, "step": 1355 }, { "epoch": 2.0452830188679245, "grad_norm": 0.2992958064668747, "learning_rate": 1.762730833799664e-05, "loss": 0.513, "step": 1356 }, { "epoch": 2.046792452830189, "grad_norm": 0.36710409137930194, "learning_rate": 1.7599328483491888e-05, "loss": 0.5065, "step": 1357 }, { "epoch": 2.0483018867924527, "grad_norm": 0.3134601371847729, "learning_rate": 1.757134862898713e-05, "loss": 0.4913, "step": 1358 }, { "epoch": 2.049811320754717, "grad_norm": 0.27689129271088303, "learning_rate": 1.7543368774482374e-05, "loss": 0.4884, "step": 1359 }, { "epoch": 2.051320754716981, "grad_norm": 0.3714023246669663, "learning_rate": 1.7515388919977618e-05, "loss": 0.5384, "step": 1360 }, { "epoch": 2.0528301886792453, "grad_norm": 0.2847785796252443, "learning_rate": 1.748740906547286e-05, "loss": 0.503, "step": 1361 }, { "epoch": 2.0543396226415096, "grad_norm": 0.31030468078337337, "learning_rate": 1.7459429210968104e-05, "loss": 0.5054, "step": 1362 }, { "epoch": 2.0558490566037735, "grad_norm": 0.2888211469303781, "learning_rate": 1.7431449356463348e-05, "loss": 0.5146, "step": 1363 }, { "epoch": 2.057358490566038, "grad_norm": 0.3059017261234641, "learning_rate": 1.740346950195859e-05, "loss": 0.5527, "step": 1364 }, { "epoch": 2.0588679245283017, "grad_norm": 0.2880026611275554, "learning_rate": 1.7375489647453834e-05, "loss": 0.5871, "step": 1365 }, { "epoch": 2.060377358490566, "grad_norm": 0.3435683903553664, "learning_rate": 1.7347509792949078e-05, "loss": 0.4751, "step": 1366 }, { "epoch": 2.0618867924528304, "grad_norm": 0.2577466623620189, "learning_rate": 1.731952993844432e-05, "loss": 0.5072, "step": 1367 }, { "epoch": 2.0633962264150942, "grad_norm": 0.5951239656292413, "learning_rate": 1.7291550083939565e-05, "loss": 0.5067, "step": 1368 }, { "epoch": 2.0649056603773586, "grad_norm": 0.3091376294357986, "learning_rate": 1.7263570229434808e-05, "loss": 0.5628, "step": 1369 }, { "epoch": 2.0664150943396224, "grad_norm": 0.3448984710683876, "learning_rate": 1.723559037493005e-05, "loss": 0.5315, "step": 1370 }, { "epoch": 2.0679245283018868, "grad_norm": 0.2883682281289665, "learning_rate": 1.7207610520425295e-05, "loss": 0.4979, "step": 1371 }, { "epoch": 2.069433962264151, "grad_norm": 0.3162892758521297, "learning_rate": 1.7179630665920538e-05, "loss": 0.5156, "step": 1372 }, { "epoch": 2.070943396226415, "grad_norm": 9.124432005211863, "learning_rate": 1.715165081141578e-05, "loss": 0.6299, "step": 1373 }, { "epoch": 2.0724528301886793, "grad_norm": 0.3426920377467487, "learning_rate": 1.7123670956911025e-05, "loss": 0.5106, "step": 1374 }, { "epoch": 2.073962264150943, "grad_norm": 0.3252637771238837, "learning_rate": 1.7095691102406268e-05, "loss": 0.465, "step": 1375 }, { "epoch": 2.0754716981132075, "grad_norm": 0.27946931242846335, "learning_rate": 1.706771124790151e-05, "loss": 0.5466, "step": 1376 }, { "epoch": 2.076981132075472, "grad_norm": 0.2950006671302379, "learning_rate": 1.7039731393396755e-05, "loss": 0.5233, "step": 1377 }, { "epoch": 2.0784905660377357, "grad_norm": 0.3071238734202378, "learning_rate": 1.7011751538891998e-05, "loss": 0.4996, "step": 1378 }, { "epoch": 2.08, "grad_norm": 0.29000161966129556, "learning_rate": 1.698377168438724e-05, "loss": 0.5456, "step": 1379 }, { "epoch": 2.081509433962264, "grad_norm": 0.3638090048005039, "learning_rate": 1.6955791829882485e-05, "loss": 0.5342, "step": 1380 }, { "epoch": 2.0830188679245283, "grad_norm": 0.3221376007630064, "learning_rate": 1.6927811975377728e-05, "loss": 0.506, "step": 1381 }, { "epoch": 2.0845283018867926, "grad_norm": 0.2882987076412252, "learning_rate": 1.689983212087297e-05, "loss": 0.5291, "step": 1382 }, { "epoch": 2.0860377358490565, "grad_norm": 0.28696681372347427, "learning_rate": 1.6871852266368215e-05, "loss": 0.4838, "step": 1383 }, { "epoch": 2.087547169811321, "grad_norm": 0.38854977880255626, "learning_rate": 1.684387241186346e-05, "loss": 0.5473, "step": 1384 }, { "epoch": 2.0890566037735847, "grad_norm": 0.29092276215854995, "learning_rate": 1.6815892557358702e-05, "loss": 0.4947, "step": 1385 }, { "epoch": 2.090566037735849, "grad_norm": 0.3087900936237078, "learning_rate": 1.678791270285395e-05, "loss": 0.4894, "step": 1386 }, { "epoch": 2.0920754716981134, "grad_norm": 0.3447103919070566, "learning_rate": 1.675993284834919e-05, "loss": 0.4929, "step": 1387 }, { "epoch": 2.0935849056603772, "grad_norm": 0.2823804807438117, "learning_rate": 1.6731952993844432e-05, "loss": 0.5266, "step": 1388 }, { "epoch": 2.0950943396226416, "grad_norm": 2.227830939004405, "learning_rate": 1.6703973139339675e-05, "loss": 0.4667, "step": 1389 }, { "epoch": 2.0966037735849055, "grad_norm": 0.36027609216088624, "learning_rate": 1.6675993284834922e-05, "loss": 0.5089, "step": 1390 }, { "epoch": 2.09811320754717, "grad_norm": 0.30939324253732786, "learning_rate": 1.6648013430330165e-05, "loss": 0.509, "step": 1391 }, { "epoch": 2.099622641509434, "grad_norm": 0.2940056905558936, "learning_rate": 1.6620033575825405e-05, "loss": 0.5088, "step": 1392 }, { "epoch": 2.101132075471698, "grad_norm": 0.2906000756858416, "learning_rate": 1.659205372132065e-05, "loss": 0.496, "step": 1393 }, { "epoch": 2.1026415094339623, "grad_norm": 0.2869459547310367, "learning_rate": 1.6564073866815892e-05, "loss": 0.5428, "step": 1394 }, { "epoch": 2.104150943396226, "grad_norm": 0.2999230365705766, "learning_rate": 1.653609401231114e-05, "loss": 0.4922, "step": 1395 }, { "epoch": 2.1056603773584905, "grad_norm": 0.2962964554482559, "learning_rate": 1.6508114157806382e-05, "loss": 0.507, "step": 1396 }, { "epoch": 2.107169811320755, "grad_norm": 0.2795439894392488, "learning_rate": 1.6480134303301622e-05, "loss": 0.5061, "step": 1397 }, { "epoch": 2.1086792452830188, "grad_norm": 0.31442101897336405, "learning_rate": 1.6452154448796865e-05, "loss": 0.5429, "step": 1398 }, { "epoch": 2.110188679245283, "grad_norm": 0.29732567961373857, "learning_rate": 1.642417459429211e-05, "loss": 0.5197, "step": 1399 }, { "epoch": 2.111698113207547, "grad_norm": 0.2869692773071042, "learning_rate": 1.6396194739787356e-05, "loss": 0.5009, "step": 1400 }, { "epoch": 2.1132075471698113, "grad_norm": 0.31504920038578266, "learning_rate": 1.63682148852826e-05, "loss": 0.5181, "step": 1401 }, { "epoch": 2.1147169811320756, "grad_norm": 0.31546568417414367, "learning_rate": 1.634023503077784e-05, "loss": 0.4921, "step": 1402 }, { "epoch": 2.1162264150943395, "grad_norm": 0.5023731953716996, "learning_rate": 1.6312255176273082e-05, "loss": 0.5294, "step": 1403 }, { "epoch": 2.117735849056604, "grad_norm": 0.2997062635797708, "learning_rate": 1.628427532176833e-05, "loss": 0.5035, "step": 1404 }, { "epoch": 2.1192452830188677, "grad_norm": 6.197559051784822, "learning_rate": 1.6256295467263572e-05, "loss": 1.1389, "step": 1405 }, { "epoch": 2.120754716981132, "grad_norm": 0.37051838525151753, "learning_rate": 1.6228315612758816e-05, "loss": 0.4735, "step": 1406 }, { "epoch": 2.1222641509433964, "grad_norm": 0.34610400457497426, "learning_rate": 1.6200335758254056e-05, "loss": 0.5028, "step": 1407 }, { "epoch": 2.1237735849056603, "grad_norm": 0.29036868542009187, "learning_rate": 1.61723559037493e-05, "loss": 0.5034, "step": 1408 }, { "epoch": 2.1252830188679246, "grad_norm": 0.2800323997242073, "learning_rate": 1.6144376049244546e-05, "loss": 0.502, "step": 1409 }, { "epoch": 2.1267924528301885, "grad_norm": 0.38872365691156985, "learning_rate": 1.611639619473979e-05, "loss": 0.5192, "step": 1410 }, { "epoch": 2.128301886792453, "grad_norm": 0.3370959225184659, "learning_rate": 1.6088416340235033e-05, "loss": 0.6035, "step": 1411 }, { "epoch": 2.129811320754717, "grad_norm": 0.27435668970687543, "learning_rate": 1.6060436485730273e-05, "loss": 0.51, "step": 1412 }, { "epoch": 2.131320754716981, "grad_norm": 1.7820096334751305, "learning_rate": 1.6032456631225516e-05, "loss": 0.5151, "step": 1413 }, { "epoch": 2.1328301886792453, "grad_norm": 0.383674413890331, "learning_rate": 1.6004476776720763e-05, "loss": 0.5579, "step": 1414 }, { "epoch": 2.1343396226415092, "grad_norm": 0.328537968912913, "learning_rate": 1.5976496922216006e-05, "loss": 0.495, "step": 1415 }, { "epoch": 2.1358490566037736, "grad_norm": 0.3243298723276784, "learning_rate": 1.594851706771125e-05, "loss": 0.5264, "step": 1416 }, { "epoch": 2.137358490566038, "grad_norm": 0.32700388956318677, "learning_rate": 1.5920537213206493e-05, "loss": 0.4996, "step": 1417 }, { "epoch": 2.1388679245283018, "grad_norm": 0.3322839065657285, "learning_rate": 1.5892557358701736e-05, "loss": 0.5223, "step": 1418 }, { "epoch": 2.140377358490566, "grad_norm": 0.3004552967389372, "learning_rate": 1.586457750419698e-05, "loss": 0.5064, "step": 1419 }, { "epoch": 2.14188679245283, "grad_norm": 0.5763848814349466, "learning_rate": 1.5836597649692223e-05, "loss": 0.5033, "step": 1420 }, { "epoch": 2.1433962264150943, "grad_norm": 0.32960695702396287, "learning_rate": 1.5808617795187466e-05, "loss": 0.5373, "step": 1421 }, { "epoch": 2.1449056603773586, "grad_norm": 0.3107640748412959, "learning_rate": 1.578063794068271e-05, "loss": 0.5038, "step": 1422 }, { "epoch": 2.1464150943396225, "grad_norm": 0.30299740916002305, "learning_rate": 1.5752658086177953e-05, "loss": 0.4905, "step": 1423 }, { "epoch": 2.147924528301887, "grad_norm": 0.3175005717166028, "learning_rate": 1.5724678231673196e-05, "loss": 0.5247, "step": 1424 }, { "epoch": 2.149433962264151, "grad_norm": 0.38504686240959507, "learning_rate": 1.569669837716844e-05, "loss": 0.4848, "step": 1425 }, { "epoch": 2.150943396226415, "grad_norm": 0.29861363410964187, "learning_rate": 1.5668718522663683e-05, "loss": 0.5272, "step": 1426 }, { "epoch": 2.1524528301886794, "grad_norm": 0.3048000067062521, "learning_rate": 1.5640738668158926e-05, "loss": 0.523, "step": 1427 }, { "epoch": 2.1539622641509433, "grad_norm": 0.29199245961301584, "learning_rate": 1.561275881365417e-05, "loss": 0.4737, "step": 1428 }, { "epoch": 2.1554716981132076, "grad_norm": 0.324024535729314, "learning_rate": 1.5584778959149413e-05, "loss": 0.5564, "step": 1429 }, { "epoch": 2.1569811320754715, "grad_norm": 0.2706042276628025, "learning_rate": 1.5556799104644656e-05, "loss": 0.4991, "step": 1430 }, { "epoch": 2.158490566037736, "grad_norm": 0.31292251156779505, "learning_rate": 1.55288192501399e-05, "loss": 0.5495, "step": 1431 }, { "epoch": 2.16, "grad_norm": 0.28435967603261125, "learning_rate": 1.5500839395635143e-05, "loss": 0.5318, "step": 1432 }, { "epoch": 2.161509433962264, "grad_norm": 0.28290949197621923, "learning_rate": 1.5472859541130387e-05, "loss": 0.475, "step": 1433 }, { "epoch": 2.1630188679245284, "grad_norm": 0.27419716811272765, "learning_rate": 1.544487968662563e-05, "loss": 0.512, "step": 1434 }, { "epoch": 2.1645283018867927, "grad_norm": 0.33229780717562507, "learning_rate": 1.5416899832120873e-05, "loss": 0.5253, "step": 1435 }, { "epoch": 2.1660377358490566, "grad_norm": 0.2635435711478846, "learning_rate": 1.5388919977616117e-05, "loss": 0.4922, "step": 1436 }, { "epoch": 2.167547169811321, "grad_norm": 0.3354287803738388, "learning_rate": 1.536094012311136e-05, "loss": 0.5607, "step": 1437 }, { "epoch": 2.169056603773585, "grad_norm": 0.2982433296429279, "learning_rate": 1.5332960268606603e-05, "loss": 0.53, "step": 1438 }, { "epoch": 2.170566037735849, "grad_norm": 0.2744678686241482, "learning_rate": 1.5304980414101847e-05, "loss": 0.4821, "step": 1439 }, { "epoch": 2.172075471698113, "grad_norm": 0.30144942338602565, "learning_rate": 1.527700055959709e-05, "loss": 0.5545, "step": 1440 }, { "epoch": 2.1735849056603773, "grad_norm": 0.2890165107307089, "learning_rate": 1.5249020705092335e-05, "loss": 0.4918, "step": 1441 }, { "epoch": 2.1750943396226416, "grad_norm": 0.27397501117313733, "learning_rate": 1.5221040850587578e-05, "loss": 0.5342, "step": 1442 }, { "epoch": 2.1766037735849055, "grad_norm": 0.2948193831715891, "learning_rate": 1.5193060996082822e-05, "loss": 0.493, "step": 1443 }, { "epoch": 2.17811320754717, "grad_norm": 0.31953210199754906, "learning_rate": 1.5165081141578064e-05, "loss": 0.5324, "step": 1444 }, { "epoch": 2.179622641509434, "grad_norm": 0.34514765305750156, "learning_rate": 1.5137101287073307e-05, "loss": 0.4684, "step": 1445 }, { "epoch": 2.181132075471698, "grad_norm": 0.30888012716661545, "learning_rate": 1.5109121432568552e-05, "loss": 0.568, "step": 1446 }, { "epoch": 2.1826415094339624, "grad_norm": 0.31339189400441597, "learning_rate": 1.5081141578063795e-05, "loss": 0.5351, "step": 1447 }, { "epoch": 2.1841509433962263, "grad_norm": 0.2850461935374051, "learning_rate": 1.5053161723559039e-05, "loss": 0.4853, "step": 1448 }, { "epoch": 2.1856603773584906, "grad_norm": 0.2827499266089179, "learning_rate": 1.502518186905428e-05, "loss": 0.5234, "step": 1449 }, { "epoch": 2.1871698113207545, "grad_norm": 0.3015886512697336, "learning_rate": 1.4997202014549525e-05, "loss": 0.5494, "step": 1450 }, { "epoch": 2.188679245283019, "grad_norm": 0.2849817847705387, "learning_rate": 1.4969222160044769e-05, "loss": 0.5304, "step": 1451 }, { "epoch": 2.190188679245283, "grad_norm": 0.40838242894053084, "learning_rate": 1.4941242305540012e-05, "loss": 0.5282, "step": 1452 }, { "epoch": 2.191698113207547, "grad_norm": 0.3487341672894736, "learning_rate": 1.4913262451035257e-05, "loss": 0.5272, "step": 1453 }, { "epoch": 2.1932075471698114, "grad_norm": 0.28796501873449454, "learning_rate": 1.4885282596530497e-05, "loss": 0.4951, "step": 1454 }, { "epoch": 2.1947169811320757, "grad_norm": 0.2684130709861942, "learning_rate": 1.4857302742025742e-05, "loss": 0.4948, "step": 1455 }, { "epoch": 2.1962264150943396, "grad_norm": 0.27600668574894804, "learning_rate": 1.4829322887520986e-05, "loss": 0.4908, "step": 1456 }, { "epoch": 2.197735849056604, "grad_norm": 0.2588348566679071, "learning_rate": 1.4801343033016229e-05, "loss": 0.491, "step": 1457 }, { "epoch": 2.199245283018868, "grad_norm": 0.2987400337756697, "learning_rate": 1.4773363178511474e-05, "loss": 0.5446, "step": 1458 }, { "epoch": 2.200754716981132, "grad_norm": 0.25621718687856143, "learning_rate": 1.4745383324006714e-05, "loss": 0.4764, "step": 1459 }, { "epoch": 2.202264150943396, "grad_norm": 0.7981297027382572, "learning_rate": 1.4717403469501959e-05, "loss": 0.4999, "step": 1460 }, { "epoch": 2.2037735849056603, "grad_norm": 0.2965666251899607, "learning_rate": 1.4689423614997202e-05, "loss": 0.535, "step": 1461 }, { "epoch": 2.2052830188679247, "grad_norm": 0.2804521739277098, "learning_rate": 1.4661443760492446e-05, "loss": 0.4918, "step": 1462 }, { "epoch": 2.2067924528301885, "grad_norm": 0.276142140919494, "learning_rate": 1.463346390598769e-05, "loss": 0.4702, "step": 1463 }, { "epoch": 2.208301886792453, "grad_norm": 0.2820984265166286, "learning_rate": 1.4605484051482932e-05, "loss": 0.4939, "step": 1464 }, { "epoch": 2.209811320754717, "grad_norm": 0.7192982932338521, "learning_rate": 1.4577504196978176e-05, "loss": 0.5609, "step": 1465 }, { "epoch": 2.211320754716981, "grad_norm": 0.2937186576584232, "learning_rate": 1.454952434247342e-05, "loss": 0.5353, "step": 1466 }, { "epoch": 2.2128301886792454, "grad_norm": 0.31277570600962407, "learning_rate": 1.4521544487968664e-05, "loss": 0.4838, "step": 1467 }, { "epoch": 2.2143396226415093, "grad_norm": 0.3055256840051491, "learning_rate": 1.4493564633463908e-05, "loss": 0.5433, "step": 1468 }, { "epoch": 2.2158490566037736, "grad_norm": 0.31560711080139303, "learning_rate": 1.4465584778959151e-05, "loss": 0.5337, "step": 1469 }, { "epoch": 2.2173584905660375, "grad_norm": 0.29251628786569595, "learning_rate": 1.4437604924454393e-05, "loss": 0.5144, "step": 1470 }, { "epoch": 2.218867924528302, "grad_norm": 0.3049591268418919, "learning_rate": 1.4409625069949636e-05, "loss": 0.5322, "step": 1471 }, { "epoch": 2.220377358490566, "grad_norm": 0.3094888112301788, "learning_rate": 1.4381645215444881e-05, "loss": 0.5531, "step": 1472 }, { "epoch": 2.22188679245283, "grad_norm": 0.272447846375912, "learning_rate": 1.4353665360940124e-05, "loss": 0.4913, "step": 1473 }, { "epoch": 2.2233962264150944, "grad_norm": 0.3123501475624154, "learning_rate": 1.4325685506435368e-05, "loss": 0.5543, "step": 1474 }, { "epoch": 2.2249056603773587, "grad_norm": 0.272171738262368, "learning_rate": 1.429770565193061e-05, "loss": 0.5062, "step": 1475 }, { "epoch": 2.2264150943396226, "grad_norm": 0.34071458995993736, "learning_rate": 1.4269725797425853e-05, "loss": 0.4612, "step": 1476 }, { "epoch": 2.227924528301887, "grad_norm": 0.3207863817121142, "learning_rate": 1.4241745942921098e-05, "loss": 0.5399, "step": 1477 }, { "epoch": 2.229433962264151, "grad_norm": 0.3312395334585876, "learning_rate": 1.4213766088416341e-05, "loss": 0.5551, "step": 1478 }, { "epoch": 2.230943396226415, "grad_norm": 0.3078119559299569, "learning_rate": 1.4185786233911586e-05, "loss": 0.5399, "step": 1479 }, { "epoch": 2.232452830188679, "grad_norm": 0.3677416007012839, "learning_rate": 1.4157806379406826e-05, "loss": 0.4692, "step": 1480 }, { "epoch": 2.2339622641509433, "grad_norm": 0.29294788473317285, "learning_rate": 1.4129826524902071e-05, "loss": 0.5189, "step": 1481 }, { "epoch": 2.2354716981132077, "grad_norm": 0.2967231058765099, "learning_rate": 1.4101846670397315e-05, "loss": 0.4883, "step": 1482 }, { "epoch": 2.2369811320754716, "grad_norm": 0.28070084795251554, "learning_rate": 1.4073866815892558e-05, "loss": 0.5129, "step": 1483 }, { "epoch": 2.238490566037736, "grad_norm": 0.27025440257233807, "learning_rate": 1.4045886961387803e-05, "loss": 0.5129, "step": 1484 }, { "epoch": 2.24, "grad_norm": 0.3278016709506503, "learning_rate": 1.4017907106883043e-05, "loss": 0.5552, "step": 1485 }, { "epoch": 2.241509433962264, "grad_norm": 0.2677486192645805, "learning_rate": 1.3989927252378288e-05, "loss": 0.505, "step": 1486 }, { "epoch": 2.2430188679245284, "grad_norm": 0.27640060062320465, "learning_rate": 1.3961947397873532e-05, "loss": 0.5228, "step": 1487 }, { "epoch": 2.2445283018867923, "grad_norm": 0.32687355578285887, "learning_rate": 1.3933967543368775e-05, "loss": 0.5329, "step": 1488 }, { "epoch": 2.2460377358490566, "grad_norm": 0.28290727880482414, "learning_rate": 1.390598768886402e-05, "loss": 0.5653, "step": 1489 }, { "epoch": 2.2475471698113205, "grad_norm": 0.2945089980775346, "learning_rate": 1.387800783435926e-05, "loss": 0.5558, "step": 1490 }, { "epoch": 2.249056603773585, "grad_norm": 0.24883863133395934, "learning_rate": 1.3850027979854505e-05, "loss": 0.4777, "step": 1491 }, { "epoch": 2.250566037735849, "grad_norm": 0.2864405466136228, "learning_rate": 1.3822048125349748e-05, "loss": 0.5316, "step": 1492 }, { "epoch": 2.252075471698113, "grad_norm": 0.29305064929033103, "learning_rate": 1.3794068270844993e-05, "loss": 0.5115, "step": 1493 }, { "epoch": 2.2535849056603774, "grad_norm": 0.2657764273856441, "learning_rate": 1.3766088416340237e-05, "loss": 0.496, "step": 1494 }, { "epoch": 2.2550943396226417, "grad_norm": 0.29966606901721604, "learning_rate": 1.373810856183548e-05, "loss": 0.5369, "step": 1495 }, { "epoch": 2.2566037735849056, "grad_norm": 0.32608189723105613, "learning_rate": 1.3710128707330722e-05, "loss": 0.5373, "step": 1496 }, { "epoch": 2.25811320754717, "grad_norm": 0.2698321482337212, "learning_rate": 1.3682148852825965e-05, "loss": 0.5642, "step": 1497 }, { "epoch": 2.259622641509434, "grad_norm": 0.2862557953990325, "learning_rate": 1.365416899832121e-05, "loss": 0.5419, "step": 1498 }, { "epoch": 2.261132075471698, "grad_norm": 0.2610202337731504, "learning_rate": 1.3626189143816454e-05, "loss": 0.4617, "step": 1499 }, { "epoch": 2.262641509433962, "grad_norm": 0.2756743186352737, "learning_rate": 1.3598209289311697e-05, "loss": 0.5105, "step": 1500 }, { "epoch": 2.2641509433962264, "grad_norm": 0.318550644447325, "learning_rate": 1.3570229434806939e-05, "loss": 0.5459, "step": 1501 }, { "epoch": 2.2656603773584907, "grad_norm": 0.27546152249243994, "learning_rate": 1.3542249580302182e-05, "loss": 0.5206, "step": 1502 }, { "epoch": 2.2671698113207546, "grad_norm": 0.29381799157195004, "learning_rate": 1.3514269725797427e-05, "loss": 0.4987, "step": 1503 }, { "epoch": 2.268679245283019, "grad_norm": 0.295754784857654, "learning_rate": 1.348628987129267e-05, "loss": 0.5064, "step": 1504 }, { "epoch": 2.2701886792452832, "grad_norm": 0.2918834331491631, "learning_rate": 1.3458310016787914e-05, "loss": 0.571, "step": 1505 }, { "epoch": 2.271698113207547, "grad_norm": 0.256431144636905, "learning_rate": 1.3430330162283155e-05, "loss": 0.4934, "step": 1506 }, { "epoch": 2.2732075471698114, "grad_norm": 0.2836824188188069, "learning_rate": 1.34023503077784e-05, "loss": 0.5082, "step": 1507 }, { "epoch": 2.2747169811320753, "grad_norm": 0.29318207704716076, "learning_rate": 1.3374370453273644e-05, "loss": 0.5, "step": 1508 }, { "epoch": 2.2762264150943397, "grad_norm": 0.271941956506678, "learning_rate": 1.3346390598768887e-05, "loss": 0.5325, "step": 1509 }, { "epoch": 2.2777358490566035, "grad_norm": 0.2855337211319555, "learning_rate": 1.3318410744264132e-05, "loss": 0.4959, "step": 1510 }, { "epoch": 2.279245283018868, "grad_norm": 0.2949051630826, "learning_rate": 1.3290430889759372e-05, "loss": 0.5202, "step": 1511 }, { "epoch": 2.280754716981132, "grad_norm": 0.27050450657637826, "learning_rate": 1.3262451035254617e-05, "loss": 0.5023, "step": 1512 }, { "epoch": 2.282264150943396, "grad_norm": 0.291946921230413, "learning_rate": 1.323447118074986e-05, "loss": 0.5177, "step": 1513 }, { "epoch": 2.2837735849056604, "grad_norm": 0.27140194708974613, "learning_rate": 1.3206491326245104e-05, "loss": 0.4802, "step": 1514 }, { "epoch": 2.2852830188679247, "grad_norm": 0.28632708768258713, "learning_rate": 1.3178511471740349e-05, "loss": 0.5031, "step": 1515 }, { "epoch": 2.2867924528301886, "grad_norm": 0.2653378913781436, "learning_rate": 1.3150531617235589e-05, "loss": 0.5348, "step": 1516 }, { "epoch": 2.288301886792453, "grad_norm": 0.3416254176656584, "learning_rate": 1.3122551762730834e-05, "loss": 0.5115, "step": 1517 }, { "epoch": 2.289811320754717, "grad_norm": 0.3132158552150094, "learning_rate": 1.3094571908226077e-05, "loss": 0.5507, "step": 1518 }, { "epoch": 2.291320754716981, "grad_norm": 0.2587415321407086, "learning_rate": 1.306659205372132e-05, "loss": 0.471, "step": 1519 }, { "epoch": 2.292830188679245, "grad_norm": 0.2813684595304604, "learning_rate": 1.3038612199216566e-05, "loss": 0.5033, "step": 1520 }, { "epoch": 2.2943396226415094, "grad_norm": 0.34086813878117894, "learning_rate": 1.301063234471181e-05, "loss": 0.5173, "step": 1521 }, { "epoch": 2.2958490566037737, "grad_norm": 0.29897003760539104, "learning_rate": 1.2982652490207051e-05, "loss": 0.4927, "step": 1522 }, { "epoch": 2.2973584905660376, "grad_norm": 0.3875603639423126, "learning_rate": 1.2954672635702294e-05, "loss": 0.5581, "step": 1523 }, { "epoch": 2.298867924528302, "grad_norm": 0.2862074867727448, "learning_rate": 1.292669278119754e-05, "loss": 0.4862, "step": 1524 }, { "epoch": 2.3003773584905662, "grad_norm": 0.30868691166763873, "learning_rate": 1.2898712926692783e-05, "loss": 0.5272, "step": 1525 }, { "epoch": 2.30188679245283, "grad_norm": 0.29790893971786336, "learning_rate": 1.2870733072188026e-05, "loss": 0.5266, "step": 1526 }, { "epoch": 2.3033962264150944, "grad_norm": 0.2782090398414401, "learning_rate": 1.2842753217683268e-05, "loss": 0.5462, "step": 1527 }, { "epoch": 2.3049056603773583, "grad_norm": 0.2854745590256956, "learning_rate": 1.2814773363178511e-05, "loss": 0.4805, "step": 1528 }, { "epoch": 2.3064150943396227, "grad_norm": 0.3113454727191614, "learning_rate": 1.2786793508673756e-05, "loss": 0.4902, "step": 1529 }, { "epoch": 2.3079245283018865, "grad_norm": 0.25928474454077677, "learning_rate": 1.2758813654169e-05, "loss": 0.4906, "step": 1530 }, { "epoch": 2.309433962264151, "grad_norm": 0.26698105372178027, "learning_rate": 1.2730833799664243e-05, "loss": 0.5508, "step": 1531 }, { "epoch": 2.310943396226415, "grad_norm": 0.26530252126413506, "learning_rate": 1.2702853945159485e-05, "loss": 0.5375, "step": 1532 }, { "epoch": 2.312452830188679, "grad_norm": 0.32824150490138687, "learning_rate": 1.2674874090654728e-05, "loss": 0.5316, "step": 1533 }, { "epoch": 2.3139622641509434, "grad_norm": 0.2616604148241911, "learning_rate": 1.2646894236149973e-05, "loss": 0.4693, "step": 1534 }, { "epoch": 2.3154716981132077, "grad_norm": 0.2937527365395457, "learning_rate": 1.2618914381645216e-05, "loss": 0.5329, "step": 1535 }, { "epoch": 2.3169811320754716, "grad_norm": 0.28549646821620334, "learning_rate": 1.259093452714046e-05, "loss": 0.5417, "step": 1536 }, { "epoch": 2.318490566037736, "grad_norm": 0.29605947776580616, "learning_rate": 1.2562954672635701e-05, "loss": 0.5188, "step": 1537 }, { "epoch": 2.32, "grad_norm": 0.28290540315403917, "learning_rate": 1.2534974818130946e-05, "loss": 0.4817, "step": 1538 }, { "epoch": 2.321509433962264, "grad_norm": 0.28346799307216214, "learning_rate": 1.250699496362619e-05, "loss": 0.5291, "step": 1539 }, { "epoch": 2.323018867924528, "grad_norm": 0.3025578648182897, "learning_rate": 1.2479015109121433e-05, "loss": 0.5549, "step": 1540 }, { "epoch": 2.3245283018867924, "grad_norm": 0.28767635571955014, "learning_rate": 1.2451035254616676e-05, "loss": 0.5334, "step": 1541 }, { "epoch": 2.3260377358490567, "grad_norm": 0.31251235625172796, "learning_rate": 1.242305540011192e-05, "loss": 0.5222, "step": 1542 }, { "epoch": 2.3275471698113206, "grad_norm": 0.3280951350127823, "learning_rate": 1.2395075545607163e-05, "loss": 0.5397, "step": 1543 }, { "epoch": 2.329056603773585, "grad_norm": 0.313854810082151, "learning_rate": 1.2367095691102407e-05, "loss": 0.5483, "step": 1544 }, { "epoch": 2.3305660377358492, "grad_norm": 0.2670608264932217, "learning_rate": 1.233911583659765e-05, "loss": 0.5254, "step": 1545 }, { "epoch": 2.332075471698113, "grad_norm": 0.2719825460240819, "learning_rate": 1.2311135982092893e-05, "loss": 0.5459, "step": 1546 }, { "epoch": 2.3335849056603775, "grad_norm": 0.31088912379982053, "learning_rate": 1.2283156127588137e-05, "loss": 0.5487, "step": 1547 }, { "epoch": 2.3350943396226413, "grad_norm": 0.27061842798171537, "learning_rate": 1.2255176273083382e-05, "loss": 0.4935, "step": 1548 }, { "epoch": 2.3366037735849057, "grad_norm": 0.2831678674322194, "learning_rate": 1.2227196418578623e-05, "loss": 0.5351, "step": 1549 }, { "epoch": 2.33811320754717, "grad_norm": 0.49601884749761044, "learning_rate": 1.2199216564073867e-05, "loss": 0.5137, "step": 1550 }, { "epoch": 2.339622641509434, "grad_norm": 0.27586411056769244, "learning_rate": 1.217123670956911e-05, "loss": 0.5412, "step": 1551 }, { "epoch": 2.341132075471698, "grad_norm": 0.29114042947531105, "learning_rate": 1.2143256855064353e-05, "loss": 0.5458, "step": 1552 }, { "epoch": 2.342641509433962, "grad_norm": 0.2689747977811522, "learning_rate": 1.2115277000559599e-05, "loss": 0.5441, "step": 1553 }, { "epoch": 2.3441509433962264, "grad_norm": 0.261267562119711, "learning_rate": 1.208729714605484e-05, "loss": 0.4996, "step": 1554 }, { "epoch": 2.3456603773584908, "grad_norm": 0.2630661794726856, "learning_rate": 1.2059317291550085e-05, "loss": 0.4945, "step": 1555 }, { "epoch": 2.3471698113207546, "grad_norm": 0.2714886371024407, "learning_rate": 1.2031337437045329e-05, "loss": 0.5419, "step": 1556 }, { "epoch": 2.348679245283019, "grad_norm": 0.27322996123705456, "learning_rate": 1.200335758254057e-05, "loss": 0.5344, "step": 1557 }, { "epoch": 2.350188679245283, "grad_norm": 0.27446564225924486, "learning_rate": 1.1975377728035815e-05, "loss": 0.4914, "step": 1558 }, { "epoch": 2.351698113207547, "grad_norm": 0.2786018549729709, "learning_rate": 1.1947397873531057e-05, "loss": 0.5524, "step": 1559 }, { "epoch": 2.3532075471698115, "grad_norm": 0.2744379654868362, "learning_rate": 1.1919418019026302e-05, "loss": 0.5057, "step": 1560 }, { "epoch": 2.3547169811320754, "grad_norm": 0.29383230395925714, "learning_rate": 1.1891438164521545e-05, "loss": 0.4995, "step": 1561 }, { "epoch": 2.3562264150943397, "grad_norm": 0.27567156486489924, "learning_rate": 1.1863458310016789e-05, "loss": 0.5059, "step": 1562 }, { "epoch": 2.3577358490566036, "grad_norm": 0.2965755545135667, "learning_rate": 1.1835478455512032e-05, "loss": 0.5183, "step": 1563 }, { "epoch": 2.359245283018868, "grad_norm": 0.2942882557087409, "learning_rate": 1.1807498601007276e-05, "loss": 0.4978, "step": 1564 }, { "epoch": 2.3607547169811323, "grad_norm": 0.3612767923459085, "learning_rate": 1.1779518746502519e-05, "loss": 0.5482, "step": 1565 }, { "epoch": 2.362264150943396, "grad_norm": 0.37068931043688397, "learning_rate": 1.1751538891997762e-05, "loss": 0.5417, "step": 1566 }, { "epoch": 2.3637735849056605, "grad_norm": 0.3009994319645029, "learning_rate": 1.1723559037493006e-05, "loss": 0.5241, "step": 1567 }, { "epoch": 2.3652830188679244, "grad_norm": 0.2982051870389813, "learning_rate": 1.1695579182988249e-05, "loss": 0.5138, "step": 1568 }, { "epoch": 2.3667924528301887, "grad_norm": 0.273290340770851, "learning_rate": 1.1667599328483492e-05, "loss": 0.5425, "step": 1569 }, { "epoch": 2.368301886792453, "grad_norm": 0.30521472956958234, "learning_rate": 1.1639619473978736e-05, "loss": 0.503, "step": 1570 }, { "epoch": 2.369811320754717, "grad_norm": 0.2733183011713467, "learning_rate": 1.1611639619473979e-05, "loss": 0.4991, "step": 1571 }, { "epoch": 2.3713207547169812, "grad_norm": 0.2700309326901011, "learning_rate": 1.1583659764969222e-05, "loss": 0.4871, "step": 1572 }, { "epoch": 2.372830188679245, "grad_norm": 0.3165698192789348, "learning_rate": 1.1555679910464466e-05, "loss": 0.5259, "step": 1573 }, { "epoch": 2.3743396226415094, "grad_norm": 0.32213328390083884, "learning_rate": 1.152770005595971e-05, "loss": 0.4876, "step": 1574 }, { "epoch": 2.3758490566037738, "grad_norm": 0.3212920255017883, "learning_rate": 1.1499720201454953e-05, "loss": 0.5213, "step": 1575 }, { "epoch": 2.3773584905660377, "grad_norm": 0.28550611630396816, "learning_rate": 1.1471740346950196e-05, "loss": 0.5245, "step": 1576 }, { "epoch": 2.378867924528302, "grad_norm": 0.33575387480012625, "learning_rate": 1.144376049244544e-05, "loss": 0.5325, "step": 1577 }, { "epoch": 2.380377358490566, "grad_norm": 0.3209791568084635, "learning_rate": 1.1415780637940683e-05, "loss": 0.5203, "step": 1578 }, { "epoch": 2.38188679245283, "grad_norm": 0.3068339238371063, "learning_rate": 1.1387800783435928e-05, "loss": 0.5432, "step": 1579 }, { "epoch": 2.3833962264150945, "grad_norm": 0.29288109611440577, "learning_rate": 1.135982092893117e-05, "loss": 0.4908, "step": 1580 }, { "epoch": 2.3849056603773584, "grad_norm": 0.2823808739473944, "learning_rate": 1.1331841074426414e-05, "loss": 0.5247, "step": 1581 }, { "epoch": 2.3864150943396227, "grad_norm": 0.2970923159171053, "learning_rate": 1.1303861219921658e-05, "loss": 0.5034, "step": 1582 }, { "epoch": 2.3879245283018866, "grad_norm": 0.2811796099376647, "learning_rate": 1.12758813654169e-05, "loss": 0.513, "step": 1583 }, { "epoch": 2.389433962264151, "grad_norm": 0.28383899311229177, "learning_rate": 1.1247901510912144e-05, "loss": 0.4913, "step": 1584 }, { "epoch": 2.3909433962264153, "grad_norm": 0.2965229723431742, "learning_rate": 1.1219921656407386e-05, "loss": 0.5565, "step": 1585 }, { "epoch": 2.392452830188679, "grad_norm": 0.28613474578656095, "learning_rate": 1.1191941801902631e-05, "loss": 0.5035, "step": 1586 }, { "epoch": 2.3939622641509435, "grad_norm": 0.333638319152662, "learning_rate": 1.1163961947397875e-05, "loss": 0.5502, "step": 1587 }, { "epoch": 2.3954716981132074, "grad_norm": 0.2762683785182854, "learning_rate": 1.1135982092893118e-05, "loss": 0.4957, "step": 1588 }, { "epoch": 2.3969811320754717, "grad_norm": 0.2841965209758776, "learning_rate": 1.1108002238388361e-05, "loss": 0.5323, "step": 1589 }, { "epoch": 2.398490566037736, "grad_norm": 0.30355635904838757, "learning_rate": 1.1080022383883603e-05, "loss": 0.5362, "step": 1590 }, { "epoch": 2.4, "grad_norm": 0.2613635701106073, "learning_rate": 1.1052042529378848e-05, "loss": 0.5163, "step": 1591 }, { "epoch": 2.4015094339622642, "grad_norm": 0.616566657926194, "learning_rate": 1.1024062674874091e-05, "loss": 0.5233, "step": 1592 }, { "epoch": 2.403018867924528, "grad_norm": 0.2693893123464824, "learning_rate": 1.0996082820369335e-05, "loss": 0.5075, "step": 1593 }, { "epoch": 2.4045283018867925, "grad_norm": 0.27548371010084427, "learning_rate": 1.0968102965864578e-05, "loss": 0.5055, "step": 1594 }, { "epoch": 2.406037735849057, "grad_norm": 0.2781662462502358, "learning_rate": 1.0940123111359821e-05, "loss": 0.5539, "step": 1595 }, { "epoch": 2.4075471698113207, "grad_norm": 0.30912389736469736, "learning_rate": 1.0912143256855065e-05, "loss": 0.5981, "step": 1596 }, { "epoch": 2.409056603773585, "grad_norm": 0.28742057157673107, "learning_rate": 1.0884163402350308e-05, "loss": 0.4773, "step": 1597 }, { "epoch": 2.410566037735849, "grad_norm": 0.2544732632143813, "learning_rate": 1.0856183547845552e-05, "loss": 0.4747, "step": 1598 }, { "epoch": 2.412075471698113, "grad_norm": 0.2542075611364493, "learning_rate": 1.0828203693340795e-05, "loss": 0.4653, "step": 1599 }, { "epoch": 2.4135849056603775, "grad_norm": 0.28973359121279074, "learning_rate": 1.0800223838836038e-05, "loss": 0.5229, "step": 1600 }, { "epoch": 2.4150943396226414, "grad_norm": 0.36347700962188983, "learning_rate": 1.0772243984331282e-05, "loss": 0.5372, "step": 1601 }, { "epoch": 2.4166037735849057, "grad_norm": 0.27163267397417085, "learning_rate": 1.0744264129826525e-05, "loss": 0.5022, "step": 1602 }, { "epoch": 2.4181132075471696, "grad_norm": 3.40159737994843, "learning_rate": 1.0716284275321768e-05, "loss": 0.5791, "step": 1603 }, { "epoch": 2.419622641509434, "grad_norm": 0.3024980193028065, "learning_rate": 1.0688304420817012e-05, "loss": 0.5413, "step": 1604 }, { "epoch": 2.4211320754716983, "grad_norm": 0.3191908802604592, "learning_rate": 1.0660324566312257e-05, "loss": 0.5505, "step": 1605 }, { "epoch": 2.422641509433962, "grad_norm": 0.34247393903147477, "learning_rate": 1.0632344711807498e-05, "loss": 0.5156, "step": 1606 }, { "epoch": 2.4241509433962265, "grad_norm": 0.30555621465573174, "learning_rate": 1.0604364857302742e-05, "loss": 0.5164, "step": 1607 }, { "epoch": 2.4256603773584904, "grad_norm": 0.28883349420538895, "learning_rate": 1.0576385002797987e-05, "loss": 0.5196, "step": 1608 }, { "epoch": 2.4271698113207547, "grad_norm": 0.29370588202466086, "learning_rate": 1.0548405148293229e-05, "loss": 0.4992, "step": 1609 }, { "epoch": 2.428679245283019, "grad_norm": 0.2992724879265412, "learning_rate": 1.0520425293788474e-05, "loss": 0.5085, "step": 1610 }, { "epoch": 2.430188679245283, "grad_norm": 0.32945758350811966, "learning_rate": 1.0492445439283715e-05, "loss": 0.552, "step": 1611 }, { "epoch": 2.4316981132075473, "grad_norm": 0.2682269673076967, "learning_rate": 1.046446558477896e-05, "loss": 0.5156, "step": 1612 }, { "epoch": 2.433207547169811, "grad_norm": 0.28647715743947366, "learning_rate": 1.0436485730274204e-05, "loss": 0.4993, "step": 1613 }, { "epoch": 2.4347169811320755, "grad_norm": 0.2744939023949034, "learning_rate": 1.0408505875769445e-05, "loss": 0.5253, "step": 1614 }, { "epoch": 2.43622641509434, "grad_norm": 0.35250397665004307, "learning_rate": 1.038052602126469e-05, "loss": 0.5396, "step": 1615 }, { "epoch": 2.4377358490566037, "grad_norm": 0.3123726343205459, "learning_rate": 1.0352546166759932e-05, "loss": 0.5134, "step": 1616 }, { "epoch": 2.439245283018868, "grad_norm": 0.2764691102160436, "learning_rate": 1.0324566312255177e-05, "loss": 0.5091, "step": 1617 }, { "epoch": 2.440754716981132, "grad_norm": 0.2814627012575424, "learning_rate": 1.029658645775042e-05, "loss": 0.489, "step": 1618 }, { "epoch": 2.442264150943396, "grad_norm": 0.39565086571218255, "learning_rate": 1.0268606603245664e-05, "loss": 0.5338, "step": 1619 }, { "epoch": 2.4437735849056605, "grad_norm": 0.3089370742923853, "learning_rate": 1.0240626748740907e-05, "loss": 0.5511, "step": 1620 }, { "epoch": 2.4452830188679244, "grad_norm": 0.27889082409535804, "learning_rate": 1.021264689423615e-05, "loss": 0.4989, "step": 1621 }, { "epoch": 2.4467924528301888, "grad_norm": 0.2801027230431505, "learning_rate": 1.0184667039731394e-05, "loss": 0.5151, "step": 1622 }, { "epoch": 2.4483018867924526, "grad_norm": 0.2864728776570272, "learning_rate": 1.0156687185226637e-05, "loss": 0.5427, "step": 1623 }, { "epoch": 2.449811320754717, "grad_norm": 0.2870692848584596, "learning_rate": 1.012870733072188e-05, "loss": 0.5044, "step": 1624 }, { "epoch": 2.4513207547169813, "grad_norm": 0.27728518831547827, "learning_rate": 1.0100727476217124e-05, "loss": 0.5263, "step": 1625 }, { "epoch": 2.452830188679245, "grad_norm": 0.28108945164897814, "learning_rate": 1.0072747621712367e-05, "loss": 0.5278, "step": 1626 }, { "epoch": 2.4543396226415095, "grad_norm": 0.281028089568473, "learning_rate": 1.004476776720761e-05, "loss": 0.526, "step": 1627 }, { "epoch": 2.4558490566037734, "grad_norm": 0.29992491532026355, "learning_rate": 1.0016787912702854e-05, "loss": 0.5543, "step": 1628 }, { "epoch": 2.4573584905660377, "grad_norm": 0.2446925829790535, "learning_rate": 9.988808058198097e-06, "loss": 0.4827, "step": 1629 }, { "epoch": 2.458867924528302, "grad_norm": 0.29289490874253116, "learning_rate": 9.960828203693341e-06, "loss": 0.5348, "step": 1630 }, { "epoch": 2.460377358490566, "grad_norm": 0.3010609740305946, "learning_rate": 9.932848349188586e-06, "loss": 0.5043, "step": 1631 }, { "epoch": 2.4618867924528303, "grad_norm": 0.28789020180830055, "learning_rate": 9.904868494683828e-06, "loss": 0.5147, "step": 1632 }, { "epoch": 2.463396226415094, "grad_norm": 0.26705365000795894, "learning_rate": 9.876888640179071e-06, "loss": 0.5287, "step": 1633 }, { "epoch": 2.4649056603773585, "grad_norm": 0.2798599523967274, "learning_rate": 9.848908785674316e-06, "loss": 0.4933, "step": 1634 }, { "epoch": 2.466415094339623, "grad_norm": 0.2733384516616599, "learning_rate": 9.820928931169558e-06, "loss": 0.5029, "step": 1635 }, { "epoch": 2.4679245283018867, "grad_norm": 0.29643592834837407, "learning_rate": 9.792949076664803e-06, "loss": 0.5158, "step": 1636 }, { "epoch": 2.469433962264151, "grad_norm": 0.28526464289507686, "learning_rate": 9.764969222160044e-06, "loss": 0.55, "step": 1637 }, { "epoch": 2.470943396226415, "grad_norm": 0.26357930501597526, "learning_rate": 9.73698936765529e-06, "loss": 0.4749, "step": 1638 }, { "epoch": 2.4724528301886792, "grad_norm": 0.27127677302581304, "learning_rate": 9.709009513150533e-06, "loss": 0.4896, "step": 1639 }, { "epoch": 2.4739622641509436, "grad_norm": 0.3258038736676753, "learning_rate": 9.681029658645774e-06, "loss": 0.5453, "step": 1640 }, { "epoch": 2.4754716981132074, "grad_norm": 0.2721708903510538, "learning_rate": 9.65304980414102e-06, "loss": 0.5088, "step": 1641 }, { "epoch": 2.4769811320754718, "grad_norm": 0.25031552664361656, "learning_rate": 9.625069949636261e-06, "loss": 0.4432, "step": 1642 }, { "epoch": 2.4784905660377357, "grad_norm": 0.27170615085235295, "learning_rate": 9.597090095131506e-06, "loss": 0.5401, "step": 1643 }, { "epoch": 2.48, "grad_norm": 0.25077059492786125, "learning_rate": 9.56911024062675e-06, "loss": 0.5061, "step": 1644 }, { "epoch": 2.4815094339622643, "grad_norm": 0.4381733597945453, "learning_rate": 9.541130386121993e-06, "loss": 0.5121, "step": 1645 }, { "epoch": 2.483018867924528, "grad_norm": 0.30828080858038703, "learning_rate": 9.513150531617236e-06, "loss": 0.4767, "step": 1646 }, { "epoch": 2.4845283018867925, "grad_norm": 0.2679178849366846, "learning_rate": 9.48517067711248e-06, "loss": 0.5115, "step": 1647 }, { "epoch": 2.486037735849057, "grad_norm": 0.25498782051352026, "learning_rate": 9.457190822607723e-06, "loss": 0.4925, "step": 1648 }, { "epoch": 2.4875471698113207, "grad_norm": 0.2712046865589304, "learning_rate": 9.429210968102966e-06, "loss": 0.5239, "step": 1649 }, { "epoch": 2.489056603773585, "grad_norm": 0.2841429645079444, "learning_rate": 9.40123111359821e-06, "loss": 0.5038, "step": 1650 }, { "epoch": 2.490566037735849, "grad_norm": 0.2922250090980482, "learning_rate": 9.373251259093453e-06, "loss": 0.5323, "step": 1651 }, { "epoch": 2.4920754716981133, "grad_norm": 0.3490130576354896, "learning_rate": 9.345271404588697e-06, "loss": 0.5219, "step": 1652 }, { "epoch": 2.493584905660377, "grad_norm": 0.2911807875623749, "learning_rate": 9.31729155008394e-06, "loss": 0.5356, "step": 1653 }, { "epoch": 2.4950943396226415, "grad_norm": 0.26818440305605495, "learning_rate": 9.289311695579183e-06, "loss": 0.4687, "step": 1654 }, { "epoch": 2.496603773584906, "grad_norm": 0.2629570306134703, "learning_rate": 9.261331841074427e-06, "loss": 0.5058, "step": 1655 }, { "epoch": 2.4981132075471697, "grad_norm": 0.27394341115092274, "learning_rate": 9.23335198656967e-06, "loss": 0.5189, "step": 1656 }, { "epoch": 2.499622641509434, "grad_norm": 0.31719664729022873, "learning_rate": 9.205372132064913e-06, "loss": 0.5479, "step": 1657 }, { "epoch": 2.5011320754716984, "grad_norm": 0.28948224769873077, "learning_rate": 9.177392277560157e-06, "loss": 0.5195, "step": 1658 }, { "epoch": 2.5026415094339622, "grad_norm": 0.27872732971809544, "learning_rate": 9.1494124230554e-06, "loss": 0.4954, "step": 1659 }, { "epoch": 2.5041509433962266, "grad_norm": 0.2972680704163209, "learning_rate": 9.121432568550645e-06, "loss": 0.5492, "step": 1660 }, { "epoch": 2.5056603773584905, "grad_norm": 0.258478206277154, "learning_rate": 9.093452714045887e-06, "loss": 0.4811, "step": 1661 }, { "epoch": 2.507169811320755, "grad_norm": 0.27135076957243554, "learning_rate": 9.065472859541132e-06, "loss": 0.5241, "step": 1662 }, { "epoch": 2.5086792452830187, "grad_norm": 0.25049127152785716, "learning_rate": 9.037493005036374e-06, "loss": 0.5322, "step": 1663 }, { "epoch": 2.510188679245283, "grad_norm": 0.2414933347105079, "learning_rate": 9.009513150531617e-06, "loss": 0.5065, "step": 1664 }, { "epoch": 2.5116981132075473, "grad_norm": 0.26905976255414465, "learning_rate": 8.981533296026862e-06, "loss": 0.5204, "step": 1665 }, { "epoch": 2.513207547169811, "grad_norm": 0.2574263160663239, "learning_rate": 8.953553441522104e-06, "loss": 0.4995, "step": 1666 }, { "epoch": 2.5147169811320755, "grad_norm": 0.2412927664002052, "learning_rate": 8.925573587017349e-06, "loss": 0.5189, "step": 1667 }, { "epoch": 2.51622641509434, "grad_norm": 0.2724697409186211, "learning_rate": 8.89759373251259e-06, "loss": 0.4707, "step": 1668 }, { "epoch": 2.5177358490566037, "grad_norm": 0.2649030391477551, "learning_rate": 8.869613878007835e-06, "loss": 0.5068, "step": 1669 }, { "epoch": 2.519245283018868, "grad_norm": 1.937782774908197, "learning_rate": 8.841634023503079e-06, "loss": 0.4939, "step": 1670 }, { "epoch": 2.520754716981132, "grad_norm": 0.26438563805690896, "learning_rate": 8.81365416899832e-06, "loss": 0.5031, "step": 1671 }, { "epoch": 2.5222641509433963, "grad_norm": 0.254419310430134, "learning_rate": 8.785674314493565e-06, "loss": 0.5183, "step": 1672 }, { "epoch": 2.52377358490566, "grad_norm": 0.27032143419025384, "learning_rate": 8.757694459988809e-06, "loss": 0.4891, "step": 1673 }, { "epoch": 2.5252830188679245, "grad_norm": 0.2714305046812353, "learning_rate": 8.729714605484052e-06, "loss": 0.5136, "step": 1674 }, { "epoch": 2.526792452830189, "grad_norm": 0.27145056603532564, "learning_rate": 8.701734750979296e-06, "loss": 0.5275, "step": 1675 }, { "epoch": 2.5283018867924527, "grad_norm": 0.2636503580959262, "learning_rate": 8.673754896474539e-06, "loss": 0.5661, "step": 1676 }, { "epoch": 2.529811320754717, "grad_norm": 0.2862829068236602, "learning_rate": 8.645775041969782e-06, "loss": 0.5069, "step": 1677 }, { "epoch": 2.5313207547169814, "grad_norm": 0.2629202015341607, "learning_rate": 8.617795187465026e-06, "loss": 0.5216, "step": 1678 }, { "epoch": 2.5328301886792453, "grad_norm": 0.2512663214374365, "learning_rate": 8.589815332960269e-06, "loss": 0.4779, "step": 1679 }, { "epoch": 2.5343396226415096, "grad_norm": 0.2582674957787444, "learning_rate": 8.561835478455512e-06, "loss": 0.5012, "step": 1680 }, { "epoch": 2.5358490566037735, "grad_norm": 0.26493998715379696, "learning_rate": 8.533855623950756e-06, "loss": 0.5263, "step": 1681 }, { "epoch": 2.537358490566038, "grad_norm": 0.2451699637847086, "learning_rate": 8.505875769445999e-06, "loss": 0.4737, "step": 1682 }, { "epoch": 2.5388679245283017, "grad_norm": 0.2646435180201724, "learning_rate": 8.477895914941242e-06, "loss": 0.5208, "step": 1683 }, { "epoch": 2.540377358490566, "grad_norm": 0.2582198344619996, "learning_rate": 8.449916060436486e-06, "loss": 0.4994, "step": 1684 }, { "epoch": 2.5418867924528303, "grad_norm": 0.24851559763189898, "learning_rate": 8.42193620593173e-06, "loss": 0.4999, "step": 1685 }, { "epoch": 2.543396226415094, "grad_norm": 0.2554578588043926, "learning_rate": 8.393956351426974e-06, "loss": 0.5173, "step": 1686 }, { "epoch": 2.5449056603773585, "grad_norm": 0.29270235106198766, "learning_rate": 8.365976496922216e-06, "loss": 0.5509, "step": 1687 }, { "epoch": 2.546415094339623, "grad_norm": 0.2521719040156708, "learning_rate": 8.337996642417461e-06, "loss": 0.4927, "step": 1688 }, { "epoch": 2.5479245283018868, "grad_norm": 0.24926253146252536, "learning_rate": 8.310016787912703e-06, "loss": 0.5119, "step": 1689 }, { "epoch": 2.549433962264151, "grad_norm": 0.24764507855189977, "learning_rate": 8.282036933407946e-06, "loss": 0.5169, "step": 1690 }, { "epoch": 2.550943396226415, "grad_norm": 0.2846935555038941, "learning_rate": 8.254057078903191e-06, "loss": 0.5303, "step": 1691 }, { "epoch": 2.5524528301886793, "grad_norm": 1.3298175058900719, "learning_rate": 8.226077224398433e-06, "loss": 0.4987, "step": 1692 }, { "epoch": 2.553962264150943, "grad_norm": 0.26585384992525096, "learning_rate": 8.198097369893678e-06, "loss": 0.533, "step": 1693 }, { "epoch": 2.5554716981132075, "grad_norm": 0.26892517763510493, "learning_rate": 8.17011751538892e-06, "loss": 0.5505, "step": 1694 }, { "epoch": 2.556981132075472, "grad_norm": 0.27708427633239685, "learning_rate": 8.142137660884165e-06, "loss": 0.4835, "step": 1695 }, { "epoch": 2.5584905660377357, "grad_norm": 0.25860083912017584, "learning_rate": 8.114157806379408e-06, "loss": 0.4909, "step": 1696 }, { "epoch": 2.56, "grad_norm": 0.28186533953501614, "learning_rate": 8.08617795187465e-06, "loss": 0.5547, "step": 1697 }, { "epoch": 2.5615094339622644, "grad_norm": 0.3569206208752698, "learning_rate": 8.058198097369895e-06, "loss": 0.5023, "step": 1698 }, { "epoch": 2.5630188679245283, "grad_norm": 0.26678040300011296, "learning_rate": 8.030218242865136e-06, "loss": 0.514, "step": 1699 }, { "epoch": 2.5645283018867926, "grad_norm": 0.27519779245794745, "learning_rate": 8.002238388360381e-06, "loss": 0.499, "step": 1700 }, { "epoch": 2.5660377358490565, "grad_norm": 0.28184342859209355, "learning_rate": 7.974258533855625e-06, "loss": 0.5079, "step": 1701 }, { "epoch": 2.567547169811321, "grad_norm": 0.2696907607563735, "learning_rate": 7.946278679350868e-06, "loss": 0.5349, "step": 1702 }, { "epoch": 2.5690566037735847, "grad_norm": 0.34249892968895884, "learning_rate": 7.918298824846111e-06, "loss": 0.4951, "step": 1703 }, { "epoch": 2.570566037735849, "grad_norm": 0.28104386328396547, "learning_rate": 7.890318970341355e-06, "loss": 0.4748, "step": 1704 }, { "epoch": 2.5720754716981133, "grad_norm": 0.27841856070995974, "learning_rate": 7.862339115836598e-06, "loss": 0.5164, "step": 1705 }, { "epoch": 2.5735849056603772, "grad_norm": 0.26103373360378673, "learning_rate": 7.834359261331841e-06, "loss": 0.5196, "step": 1706 }, { "epoch": 2.5750943396226416, "grad_norm": 0.28226689926781434, "learning_rate": 7.806379406827085e-06, "loss": 0.5471, "step": 1707 }, { "epoch": 2.576603773584906, "grad_norm": 0.260925718946685, "learning_rate": 7.778399552322328e-06, "loss": 0.5245, "step": 1708 }, { "epoch": 2.5781132075471698, "grad_norm": 0.2651170853166187, "learning_rate": 7.750419697817572e-06, "loss": 0.5191, "step": 1709 }, { "epoch": 2.579622641509434, "grad_norm": 0.2878754716574323, "learning_rate": 7.722439843312815e-06, "loss": 0.5276, "step": 1710 }, { "epoch": 2.581132075471698, "grad_norm": 0.38149999629802744, "learning_rate": 7.694459988808058e-06, "loss": 0.5063, "step": 1711 }, { "epoch": 2.5826415094339623, "grad_norm": 0.25769713900097513, "learning_rate": 7.666480134303302e-06, "loss": 0.4829, "step": 1712 }, { "epoch": 2.584150943396226, "grad_norm": 0.2537097287006656, "learning_rate": 7.638500279798545e-06, "loss": 0.5452, "step": 1713 }, { "epoch": 2.5856603773584905, "grad_norm": 0.26625886535127746, "learning_rate": 7.610520425293789e-06, "loss": 0.4987, "step": 1714 }, { "epoch": 2.587169811320755, "grad_norm": 0.27567634014604986, "learning_rate": 7.582540570789032e-06, "loss": 0.5292, "step": 1715 }, { "epoch": 2.5886792452830187, "grad_norm": 0.2745735348694907, "learning_rate": 7.554560716284276e-06, "loss": 0.5124, "step": 1716 }, { "epoch": 2.590188679245283, "grad_norm": 0.27869253594934607, "learning_rate": 7.526580861779519e-06, "loss": 0.5011, "step": 1717 }, { "epoch": 2.5916981132075474, "grad_norm": 0.26083708585981563, "learning_rate": 7.498601007274763e-06, "loss": 0.5063, "step": 1718 }, { "epoch": 2.5932075471698113, "grad_norm": 0.25898145180183996, "learning_rate": 7.470621152770006e-06, "loss": 0.5263, "step": 1719 }, { "epoch": 2.5947169811320756, "grad_norm": 0.25116990389413063, "learning_rate": 7.4426412982652486e-06, "loss": 0.5004, "step": 1720 }, { "epoch": 2.5962264150943395, "grad_norm": 0.26255445723201376, "learning_rate": 7.414661443760493e-06, "loss": 0.4588, "step": 1721 }, { "epoch": 2.597735849056604, "grad_norm": 0.29059302191645914, "learning_rate": 7.386681589255737e-06, "loss": 0.5715, "step": 1722 }, { "epoch": 2.5992452830188677, "grad_norm": 0.259169324744869, "learning_rate": 7.3587017347509795e-06, "loss": 0.5066, "step": 1723 }, { "epoch": 2.600754716981132, "grad_norm": 0.2550956884217296, "learning_rate": 7.330721880246223e-06, "loss": 0.4929, "step": 1724 }, { "epoch": 2.6022641509433964, "grad_norm": 0.26605119100893077, "learning_rate": 7.302742025741466e-06, "loss": 0.5148, "step": 1725 }, { "epoch": 2.6037735849056602, "grad_norm": 0.2839684129223697, "learning_rate": 7.27476217123671e-06, "loss": 0.5379, "step": 1726 }, { "epoch": 2.6052830188679246, "grad_norm": 0.24671746161838828, "learning_rate": 7.246782316731954e-06, "loss": 0.5344, "step": 1727 }, { "epoch": 2.606792452830189, "grad_norm": 0.26950728167251553, "learning_rate": 7.218802462227196e-06, "loss": 0.5269, "step": 1728 }, { "epoch": 2.608301886792453, "grad_norm": 0.30410413396696345, "learning_rate": 7.1908226077224405e-06, "loss": 0.5198, "step": 1729 }, { "epoch": 2.609811320754717, "grad_norm": 0.3012042476084446, "learning_rate": 7.162842753217684e-06, "loss": 0.5261, "step": 1730 }, { "epoch": 2.611320754716981, "grad_norm": 0.25613913625539647, "learning_rate": 7.134862898712926e-06, "loss": 0.5445, "step": 1731 }, { "epoch": 2.6128301886792453, "grad_norm": 0.24351065578422196, "learning_rate": 7.106883044208171e-06, "loss": 0.4737, "step": 1732 }, { "epoch": 2.614339622641509, "grad_norm": 0.27310531472703253, "learning_rate": 7.078903189703413e-06, "loss": 0.5484, "step": 1733 }, { "epoch": 2.6158490566037735, "grad_norm": 0.27115505717512334, "learning_rate": 7.050923335198657e-06, "loss": 0.5029, "step": 1734 }, { "epoch": 2.617358490566038, "grad_norm": 0.3029677136637512, "learning_rate": 7.0229434806939016e-06, "loss": 0.5418, "step": 1735 }, { "epoch": 2.6188679245283017, "grad_norm": 0.27908867772659823, "learning_rate": 6.994963626189144e-06, "loss": 0.5047, "step": 1736 }, { "epoch": 2.620377358490566, "grad_norm": 0.26058238891559354, "learning_rate": 6.9669837716843874e-06, "loss": 0.4792, "step": 1737 }, { "epoch": 2.6218867924528304, "grad_norm": 0.2675541705765002, "learning_rate": 6.93900391717963e-06, "loss": 0.5317, "step": 1738 }, { "epoch": 2.6233962264150943, "grad_norm": 0.27497640444418847, "learning_rate": 6.911024062674874e-06, "loss": 0.5155, "step": 1739 }, { "epoch": 2.6249056603773586, "grad_norm": 0.2744416701414958, "learning_rate": 6.883044208170118e-06, "loss": 0.5331, "step": 1740 }, { "epoch": 2.6264150943396225, "grad_norm": 0.2601570495959637, "learning_rate": 6.855064353665361e-06, "loss": 0.5181, "step": 1741 }, { "epoch": 2.627924528301887, "grad_norm": 0.5745643045304843, "learning_rate": 6.827084499160605e-06, "loss": 0.543, "step": 1742 }, { "epoch": 2.6294339622641507, "grad_norm": 0.2511923804891276, "learning_rate": 6.7991046446558485e-06, "loss": 0.5087, "step": 1743 }, { "epoch": 2.630943396226415, "grad_norm": 0.28546674221830076, "learning_rate": 6.771124790151091e-06, "loss": 0.5444, "step": 1744 }, { "epoch": 2.6324528301886794, "grad_norm": 0.28540740645360846, "learning_rate": 6.743144935646335e-06, "loss": 0.5114, "step": 1745 }, { "epoch": 2.6339622641509433, "grad_norm": 0.2930716219200601, "learning_rate": 6.715165081141578e-06, "loss": 0.4819, "step": 1746 }, { "epoch": 2.6354716981132076, "grad_norm": 0.25044896688250096, "learning_rate": 6.687185226636822e-06, "loss": 0.5132, "step": 1747 }, { "epoch": 2.636981132075472, "grad_norm": 0.27658711363810984, "learning_rate": 6.659205372132066e-06, "loss": 0.5184, "step": 1748 }, { "epoch": 2.638490566037736, "grad_norm": 1.2715522760039857, "learning_rate": 6.631225517627309e-06, "loss": 0.4769, "step": 1749 }, { "epoch": 2.64, "grad_norm": 0.2606537545098945, "learning_rate": 6.603245663122552e-06, "loss": 0.5156, "step": 1750 }, { "epoch": 2.641509433962264, "grad_norm": 0.25476970965477846, "learning_rate": 6.5752658086177945e-06, "loss": 0.5238, "step": 1751 }, { "epoch": 2.6430188679245283, "grad_norm": 0.2760824398454106, "learning_rate": 6.547285954113039e-06, "loss": 0.4519, "step": 1752 }, { "epoch": 2.644528301886792, "grad_norm": 0.2552261697445966, "learning_rate": 6.519306099608283e-06, "loss": 0.5238, "step": 1753 }, { "epoch": 2.6460377358490565, "grad_norm": 0.2505994172404276, "learning_rate": 6.4913262451035254e-06, "loss": 0.4914, "step": 1754 }, { "epoch": 2.647547169811321, "grad_norm": 0.2622458535402264, "learning_rate": 6.46334639059877e-06, "loss": 0.5338, "step": 1755 }, { "epoch": 2.6490566037735848, "grad_norm": 0.2678028391066474, "learning_rate": 6.435366536094013e-06, "loss": 0.5065, "step": 1756 }, { "epoch": 2.650566037735849, "grad_norm": 0.2484890494798296, "learning_rate": 6.4073866815892555e-06, "loss": 0.4955, "step": 1757 }, { "epoch": 2.6520754716981134, "grad_norm": 0.2400998987991155, "learning_rate": 6.3794068270845e-06, "loss": 0.479, "step": 1758 }, { "epoch": 2.6535849056603773, "grad_norm": 0.32685735246268977, "learning_rate": 6.351426972579742e-06, "loss": 0.538, "step": 1759 }, { "epoch": 2.6550943396226416, "grad_norm": 0.2603240897335215, "learning_rate": 6.3234471180749865e-06, "loss": 0.498, "step": 1760 }, { "epoch": 2.6566037735849055, "grad_norm": 0.26860424402573896, "learning_rate": 6.29546726357023e-06, "loss": 0.5033, "step": 1761 }, { "epoch": 2.65811320754717, "grad_norm": 0.27530236656485146, "learning_rate": 6.267487409065473e-06, "loss": 0.4868, "step": 1762 }, { "epoch": 2.6596226415094337, "grad_norm": 0.24655990905403674, "learning_rate": 6.2395075545607166e-06, "loss": 0.4981, "step": 1763 }, { "epoch": 2.661132075471698, "grad_norm": 0.2620387036360828, "learning_rate": 6.21152770005596e-06, "loss": 0.5074, "step": 1764 }, { "epoch": 2.6626415094339624, "grad_norm": 0.2680227306633879, "learning_rate": 6.183547845551203e-06, "loss": 0.5265, "step": 1765 }, { "epoch": 2.6641509433962263, "grad_norm": 0.30502521082126244, "learning_rate": 6.155567991046447e-06, "loss": 0.5369, "step": 1766 }, { "epoch": 2.6656603773584906, "grad_norm": 0.2600029610076752, "learning_rate": 6.127588136541691e-06, "loss": 0.496, "step": 1767 }, { "epoch": 2.667169811320755, "grad_norm": 0.31160692938753265, "learning_rate": 6.099608282036933e-06, "loss": 0.5235, "step": 1768 }, { "epoch": 2.668679245283019, "grad_norm": 0.25597688257703527, "learning_rate": 6.071628427532177e-06, "loss": 0.5042, "step": 1769 }, { "epoch": 2.670188679245283, "grad_norm": 0.2713273198602419, "learning_rate": 6.04364857302742e-06, "loss": 0.5259, "step": 1770 }, { "epoch": 2.671698113207547, "grad_norm": 0.2665123339756921, "learning_rate": 6.015668718522664e-06, "loss": 0.4882, "step": 1771 }, { "epoch": 2.6732075471698113, "grad_norm": 0.25296669006459144, "learning_rate": 5.987688864017908e-06, "loss": 0.5413, "step": 1772 }, { "epoch": 2.6747169811320752, "grad_norm": 0.2429367748155575, "learning_rate": 5.959709009513151e-06, "loss": 0.518, "step": 1773 }, { "epoch": 2.6762264150943396, "grad_norm": 0.23850623638622218, "learning_rate": 5.931729155008394e-06, "loss": 0.4992, "step": 1774 }, { "epoch": 2.677735849056604, "grad_norm": 0.2749471413882996, "learning_rate": 5.903749300503638e-06, "loss": 0.5218, "step": 1775 }, { "epoch": 2.6792452830188678, "grad_norm": 0.27456504003042126, "learning_rate": 5.875769445998881e-06, "loss": 0.471, "step": 1776 }, { "epoch": 2.680754716981132, "grad_norm": 6.661889115901275, "learning_rate": 5.8477895914941245e-06, "loss": 0.8345, "step": 1777 }, { "epoch": 2.6822641509433964, "grad_norm": 0.3348543340219312, "learning_rate": 5.819809736989368e-06, "loss": 0.529, "step": 1778 }, { "epoch": 2.6837735849056603, "grad_norm": 0.4156712628420884, "learning_rate": 5.791829882484611e-06, "loss": 0.5019, "step": 1779 }, { "epoch": 2.6852830188679246, "grad_norm": 0.3163158572906154, "learning_rate": 5.763850027979855e-06, "loss": 0.5285, "step": 1780 }, { "epoch": 2.6867924528301885, "grad_norm": 0.2574033108522731, "learning_rate": 5.735870173475098e-06, "loss": 0.5036, "step": 1781 }, { "epoch": 2.688301886792453, "grad_norm": 0.2579216297103016, "learning_rate": 5.707890318970341e-06, "loss": 0.5149, "step": 1782 }, { "epoch": 2.6898113207547167, "grad_norm": 0.2566957267530941, "learning_rate": 5.679910464465585e-06, "loss": 0.5001, "step": 1783 }, { "epoch": 2.691320754716981, "grad_norm": 0.2440800355297943, "learning_rate": 5.651930609960829e-06, "loss": 0.4883, "step": 1784 }, { "epoch": 2.6928301886792454, "grad_norm": 0.2474228533524819, "learning_rate": 5.623950755456072e-06, "loss": 0.51, "step": 1785 }, { "epoch": 2.6943396226415093, "grad_norm": 0.2482868350432357, "learning_rate": 5.595970900951316e-06, "loss": 0.5291, "step": 1786 }, { "epoch": 2.6958490566037736, "grad_norm": 0.23259366850007013, "learning_rate": 5.567991046446559e-06, "loss": 0.4855, "step": 1787 }, { "epoch": 2.697358490566038, "grad_norm": 0.2460435686988639, "learning_rate": 5.5400111919418015e-06, "loss": 0.5084, "step": 1788 }, { "epoch": 2.698867924528302, "grad_norm": 0.26038359055504035, "learning_rate": 5.512031337437046e-06, "loss": 0.5132, "step": 1789 }, { "epoch": 2.700377358490566, "grad_norm": 0.28067308819480147, "learning_rate": 5.484051482932289e-06, "loss": 0.567, "step": 1790 }, { "epoch": 2.70188679245283, "grad_norm": 0.2471009110626984, "learning_rate": 5.456071628427532e-06, "loss": 0.516, "step": 1791 }, { "epoch": 2.7033962264150944, "grad_norm": 0.26721610477068264, "learning_rate": 5.428091773922776e-06, "loss": 0.5093, "step": 1792 }, { "epoch": 2.7049056603773582, "grad_norm": 0.2498621325235465, "learning_rate": 5.400111919418019e-06, "loss": 0.4712, "step": 1793 }, { "epoch": 2.7064150943396226, "grad_norm": 0.25672107157026186, "learning_rate": 5.3721320649132625e-06, "loss": 0.4877, "step": 1794 }, { "epoch": 2.707924528301887, "grad_norm": 0.22665414380543533, "learning_rate": 5.344152210408506e-06, "loss": 0.4315, "step": 1795 }, { "epoch": 2.709433962264151, "grad_norm": 0.24408552016267313, "learning_rate": 5.316172355903749e-06, "loss": 0.5067, "step": 1796 }, { "epoch": 2.710943396226415, "grad_norm": 0.24060193772424485, "learning_rate": 5.2881925013989934e-06, "loss": 0.4887, "step": 1797 }, { "epoch": 2.7124528301886794, "grad_norm": 0.2626077977831592, "learning_rate": 5.260212646894237e-06, "loss": 0.5321, "step": 1798 }, { "epoch": 2.7139622641509433, "grad_norm": 0.26635629306405656, "learning_rate": 5.23223279238948e-06, "loss": 0.4984, "step": 1799 }, { "epoch": 2.7154716981132077, "grad_norm": 0.27439116564294674, "learning_rate": 5.204252937884723e-06, "loss": 0.5383, "step": 1800 }, { "epoch": 2.7169811320754715, "grad_norm": 0.27336316693975615, "learning_rate": 5.176273083379966e-06, "loss": 0.4991, "step": 1801 }, { "epoch": 2.718490566037736, "grad_norm": 0.25078792605060657, "learning_rate": 5.14829322887521e-06, "loss": 0.4746, "step": 1802 }, { "epoch": 2.7199999999999998, "grad_norm": 0.2579469072246058, "learning_rate": 5.120313374370454e-06, "loss": 0.528, "step": 1803 }, { "epoch": 2.721509433962264, "grad_norm": 0.3410333484332067, "learning_rate": 5.092333519865697e-06, "loss": 0.4824, "step": 1804 }, { "epoch": 2.7230188679245284, "grad_norm": 0.34511657220725606, "learning_rate": 5.06435366536094e-06, "loss": 0.4832, "step": 1805 }, { "epoch": 2.7245283018867923, "grad_norm": 0.23917828132388028, "learning_rate": 5.036373810856184e-06, "loss": 0.5131, "step": 1806 }, { "epoch": 2.7260377358490566, "grad_norm": 0.250710939200575, "learning_rate": 5.008393956351427e-06, "loss": 0.5168, "step": 1807 }, { "epoch": 2.727547169811321, "grad_norm": 0.25569387237719443, "learning_rate": 4.9804141018466704e-06, "loss": 0.5251, "step": 1808 }, { "epoch": 2.729056603773585, "grad_norm": 0.2595212700539186, "learning_rate": 4.952434247341914e-06, "loss": 0.4766, "step": 1809 }, { "epoch": 2.730566037735849, "grad_norm": 0.3827330155254612, "learning_rate": 4.924454392837158e-06, "loss": 0.5103, "step": 1810 }, { "epoch": 2.732075471698113, "grad_norm": 0.24296190605611276, "learning_rate": 4.896474538332401e-06, "loss": 0.4906, "step": 1811 }, { "epoch": 2.7335849056603774, "grad_norm": 0.4549554603421421, "learning_rate": 4.868494683827645e-06, "loss": 0.4825, "step": 1812 }, { "epoch": 2.7350943396226413, "grad_norm": 0.27193474503030435, "learning_rate": 4.840514829322887e-06, "loss": 0.4838, "step": 1813 }, { "epoch": 2.7366037735849056, "grad_norm": 0.2563880809210197, "learning_rate": 4.812534974818131e-06, "loss": 0.5064, "step": 1814 }, { "epoch": 2.73811320754717, "grad_norm": 0.2548907465195013, "learning_rate": 4.784555120313375e-06, "loss": 0.5096, "step": 1815 }, { "epoch": 2.739622641509434, "grad_norm": 0.2535545744550968, "learning_rate": 4.756575265808618e-06, "loss": 0.5075, "step": 1816 }, { "epoch": 2.741132075471698, "grad_norm": 0.2538458515041986, "learning_rate": 4.7285954113038615e-06, "loss": 0.5233, "step": 1817 }, { "epoch": 2.7426415094339625, "grad_norm": 0.2449829091378601, "learning_rate": 4.700615556799105e-06, "loss": 0.4963, "step": 1818 }, { "epoch": 2.7441509433962263, "grad_norm": 0.2583453664244264, "learning_rate": 4.672635702294348e-06, "loss": 0.5075, "step": 1819 }, { "epoch": 2.7456603773584907, "grad_norm": 0.24305960423400333, "learning_rate": 4.644655847789592e-06, "loss": 0.5265, "step": 1820 }, { "epoch": 2.7471698113207546, "grad_norm": 0.24200712952451542, "learning_rate": 4.616675993284835e-06, "loss": 0.4775, "step": 1821 }, { "epoch": 2.748679245283019, "grad_norm": 0.28201193390392365, "learning_rate": 4.588696138780078e-06, "loss": 0.4837, "step": 1822 }, { "epoch": 2.7501886792452828, "grad_norm": 0.26394748624366776, "learning_rate": 4.5607162842753226e-06, "loss": 0.5334, "step": 1823 }, { "epoch": 2.751698113207547, "grad_norm": 0.24720195678002388, "learning_rate": 4.532736429770566e-06, "loss": 0.5021, "step": 1824 }, { "epoch": 2.7532075471698114, "grad_norm": 0.30947073261476093, "learning_rate": 4.5047565752658084e-06, "loss": 0.5546, "step": 1825 }, { "epoch": 2.7547169811320753, "grad_norm": 0.2988805165162394, "learning_rate": 4.476776720761052e-06, "loss": 0.5042, "step": 1826 }, { "epoch": 2.7562264150943396, "grad_norm": 0.2462987932828045, "learning_rate": 4.448796866256295e-06, "loss": 0.5357, "step": 1827 }, { "epoch": 2.757735849056604, "grad_norm": 0.2415826373536316, "learning_rate": 4.420817011751539e-06, "loss": 0.4834, "step": 1828 }, { "epoch": 2.759245283018868, "grad_norm": 0.2535333115257667, "learning_rate": 4.392837157246783e-06, "loss": 0.5412, "step": 1829 }, { "epoch": 2.760754716981132, "grad_norm": 0.24596768950408637, "learning_rate": 4.364857302742026e-06, "loss": 0.5119, "step": 1830 }, { "epoch": 2.7622641509433965, "grad_norm": 0.24845301619797933, "learning_rate": 4.3368774482372695e-06, "loss": 0.5117, "step": 1831 }, { "epoch": 2.7637735849056604, "grad_norm": 2.3475646101301897, "learning_rate": 4.308897593732513e-06, "loss": 0.509, "step": 1832 }, { "epoch": 2.7652830188679243, "grad_norm": 0.2544934651097907, "learning_rate": 4.280917739227756e-06, "loss": 0.5145, "step": 1833 }, { "epoch": 2.7667924528301886, "grad_norm": 0.2634896263966863, "learning_rate": 4.2529378847229995e-06, "loss": 0.5485, "step": 1834 }, { "epoch": 2.768301886792453, "grad_norm": 0.25651084274796315, "learning_rate": 4.224958030218243e-06, "loss": 0.5134, "step": 1835 }, { "epoch": 2.769811320754717, "grad_norm": 0.2646923542784634, "learning_rate": 4.196978175713487e-06, "loss": 0.5164, "step": 1836 }, { "epoch": 2.771320754716981, "grad_norm": 0.26044964960565437, "learning_rate": 4.1689983212087305e-06, "loss": 0.5161, "step": 1837 }, { "epoch": 2.7728301886792455, "grad_norm": 0.231525130140077, "learning_rate": 4.141018466703973e-06, "loss": 0.4982, "step": 1838 }, { "epoch": 2.7743396226415094, "grad_norm": 0.25942703847568144, "learning_rate": 4.113038612199216e-06, "loss": 0.5105, "step": 1839 }, { "epoch": 2.7758490566037737, "grad_norm": 0.26605983694964946, "learning_rate": 4.08505875769446e-06, "loss": 0.5111, "step": 1840 }, { "epoch": 2.777358490566038, "grad_norm": 0.24395856253509673, "learning_rate": 4.057078903189704e-06, "loss": 0.4747, "step": 1841 }, { "epoch": 2.778867924528302, "grad_norm": 0.26184813710048754, "learning_rate": 4.029099048684947e-06, "loss": 0.5157, "step": 1842 }, { "epoch": 2.7803773584905658, "grad_norm": 0.26268875875244085, "learning_rate": 4.001119194180191e-06, "loss": 0.5598, "step": 1843 }, { "epoch": 2.78188679245283, "grad_norm": 0.23009142240299044, "learning_rate": 3.973139339675434e-06, "loss": 0.4734, "step": 1844 }, { "epoch": 2.7833962264150944, "grad_norm": 0.2458756232178251, "learning_rate": 3.945159485170677e-06, "loss": 0.4777, "step": 1845 }, { "epoch": 2.7849056603773583, "grad_norm": 0.24794152608284992, "learning_rate": 3.917179630665921e-06, "loss": 0.4422, "step": 1846 }, { "epoch": 2.7864150943396226, "grad_norm": 0.24897274231650662, "learning_rate": 3.889199776161164e-06, "loss": 0.5042, "step": 1847 }, { "epoch": 2.787924528301887, "grad_norm": 0.26060926813065055, "learning_rate": 3.8612199216564075e-06, "loss": 0.4873, "step": 1848 }, { "epoch": 2.789433962264151, "grad_norm": 0.23848484427152364, "learning_rate": 3.833240067151651e-06, "loss": 0.5234, "step": 1849 }, { "epoch": 2.790943396226415, "grad_norm": 0.250353041090764, "learning_rate": 3.8052602126468946e-06, "loss": 0.5231, "step": 1850 }, { "epoch": 2.7924528301886795, "grad_norm": 0.24066679801616733, "learning_rate": 3.777280358142138e-06, "loss": 0.5212, "step": 1851 }, { "epoch": 2.7939622641509434, "grad_norm": 0.2600941674746975, "learning_rate": 3.7493005036373813e-06, "loss": 0.5084, "step": 1852 }, { "epoch": 2.7954716981132073, "grad_norm": 0.2510555438696239, "learning_rate": 3.7213206491326243e-06, "loss": 0.5511, "step": 1853 }, { "epoch": 2.7969811320754716, "grad_norm": 0.24160412240772206, "learning_rate": 3.6933407946278685e-06, "loss": 0.5095, "step": 1854 }, { "epoch": 2.798490566037736, "grad_norm": 0.2748568322627208, "learning_rate": 3.6653609401231114e-06, "loss": 0.5496, "step": 1855 }, { "epoch": 2.8, "grad_norm": 0.2584358586285045, "learning_rate": 3.637381085618355e-06, "loss": 0.5463, "step": 1856 }, { "epoch": 2.801509433962264, "grad_norm": 0.2498517036300968, "learning_rate": 3.609401231113598e-06, "loss": 0.5415, "step": 1857 }, { "epoch": 2.8030188679245285, "grad_norm": 0.24524690085464163, "learning_rate": 3.581421376608842e-06, "loss": 0.5375, "step": 1858 }, { "epoch": 2.8045283018867924, "grad_norm": 0.2769513128398129, "learning_rate": 3.5534415221040853e-06, "loss": 0.552, "step": 1859 }, { "epoch": 2.8060377358490567, "grad_norm": 0.2506846446168906, "learning_rate": 3.5254616675993287e-06, "loss": 0.5459, "step": 1860 }, { "epoch": 2.807547169811321, "grad_norm": 0.2554286512869799, "learning_rate": 3.497481813094572e-06, "loss": 0.4944, "step": 1861 }, { "epoch": 2.809056603773585, "grad_norm": 0.23601391002584438, "learning_rate": 3.469501958589815e-06, "loss": 0.4922, "step": 1862 }, { "epoch": 2.810566037735849, "grad_norm": 0.26302780246456897, "learning_rate": 3.441522104085059e-06, "loss": 0.5044, "step": 1863 }, { "epoch": 2.812075471698113, "grad_norm": 0.2577873934898228, "learning_rate": 3.4135422495803025e-06, "loss": 0.5334, "step": 1864 }, { "epoch": 2.8135849056603774, "grad_norm": 0.23924590337032847, "learning_rate": 3.3855623950755455e-06, "loss": 0.5229, "step": 1865 }, { "epoch": 2.8150943396226413, "grad_norm": 0.24147365730794057, "learning_rate": 3.357582540570789e-06, "loss": 0.5122, "step": 1866 }, { "epoch": 2.8166037735849057, "grad_norm": 0.2327138561482277, "learning_rate": 3.329602686066033e-06, "loss": 0.4637, "step": 1867 }, { "epoch": 2.81811320754717, "grad_norm": 0.2750595632862016, "learning_rate": 3.301622831561276e-06, "loss": 0.4651, "step": 1868 }, { "epoch": 2.819622641509434, "grad_norm": 0.25380188643044477, "learning_rate": 3.2736429770565194e-06, "loss": 0.5683, "step": 1869 }, { "epoch": 2.821132075471698, "grad_norm": 0.24310252434490498, "learning_rate": 3.2456631225517627e-06, "loss": 0.5079, "step": 1870 }, { "epoch": 2.8226415094339625, "grad_norm": 0.22968025157830738, "learning_rate": 3.2176832680470065e-06, "loss": 0.4756, "step": 1871 }, { "epoch": 2.8241509433962264, "grad_norm": 0.23406333934374154, "learning_rate": 3.18970341354225e-06, "loss": 0.508, "step": 1872 }, { "epoch": 2.8256603773584903, "grad_norm": 3.331648704661487, "learning_rate": 3.1617235590374932e-06, "loss": 0.5314, "step": 1873 }, { "epoch": 2.8271698113207546, "grad_norm": 0.2695647874070789, "learning_rate": 3.1337437045327366e-06, "loss": 0.5471, "step": 1874 }, { "epoch": 2.828679245283019, "grad_norm": 0.25565203431365335, "learning_rate": 3.10576385002798e-06, "loss": 0.4956, "step": 1875 }, { "epoch": 2.830188679245283, "grad_norm": 0.24812316022229003, "learning_rate": 3.0777839955232233e-06, "loss": 0.5063, "step": 1876 }, { "epoch": 2.831698113207547, "grad_norm": 0.2512898425536453, "learning_rate": 3.0498041410184667e-06, "loss": 0.5132, "step": 1877 }, { "epoch": 2.8332075471698115, "grad_norm": 0.25905997230865613, "learning_rate": 3.02182428651371e-06, "loss": 0.536, "step": 1878 }, { "epoch": 2.8347169811320754, "grad_norm": 0.2527084751717376, "learning_rate": 2.993844432008954e-06, "loss": 0.5058, "step": 1879 }, { "epoch": 2.8362264150943397, "grad_norm": 0.25014337600940817, "learning_rate": 2.965864577504197e-06, "loss": 0.5141, "step": 1880 }, { "epoch": 2.837735849056604, "grad_norm": 0.26771202447798464, "learning_rate": 2.9378847229994406e-06, "loss": 0.4997, "step": 1881 }, { "epoch": 2.839245283018868, "grad_norm": 0.3542671459269332, "learning_rate": 2.909904868494684e-06, "loss": 0.5118, "step": 1882 }, { "epoch": 2.840754716981132, "grad_norm": 0.29850083758472223, "learning_rate": 2.8819250139899277e-06, "loss": 0.5432, "step": 1883 }, { "epoch": 2.842264150943396, "grad_norm": 0.23709430578888918, "learning_rate": 2.8539451594851706e-06, "loss": 0.5221, "step": 1884 }, { "epoch": 2.8437735849056605, "grad_norm": 0.26493114426048814, "learning_rate": 2.8259653049804144e-06, "loss": 0.5103, "step": 1885 }, { "epoch": 2.8452830188679243, "grad_norm": 0.23960026530705572, "learning_rate": 2.797985450475658e-06, "loss": 0.4965, "step": 1886 }, { "epoch": 2.8467924528301887, "grad_norm": 0.251820373185035, "learning_rate": 2.7700055959709007e-06, "loss": 0.512, "step": 1887 }, { "epoch": 2.848301886792453, "grad_norm": 0.2716135014490187, "learning_rate": 2.7420257414661445e-06, "loss": 0.5373, "step": 1888 }, { "epoch": 2.849811320754717, "grad_norm": 0.3050812952561557, "learning_rate": 2.714045886961388e-06, "loss": 0.4955, "step": 1889 }, { "epoch": 2.851320754716981, "grad_norm": 0.23801738006267384, "learning_rate": 2.6860660324566312e-06, "loss": 0.4919, "step": 1890 }, { "epoch": 2.8528301886792455, "grad_norm": 0.23926038110207012, "learning_rate": 2.6580861779518746e-06, "loss": 0.486, "step": 1891 }, { "epoch": 2.8543396226415094, "grad_norm": 0.24625086888055978, "learning_rate": 2.6301063234471184e-06, "loss": 0.563, "step": 1892 }, { "epoch": 2.8558490566037738, "grad_norm": 0.25559463779086583, "learning_rate": 2.6021264689423613e-06, "loss": 0.4613, "step": 1893 }, { "epoch": 2.8573584905660376, "grad_norm": 0.2437794999089875, "learning_rate": 2.574146614437605e-06, "loss": 0.5394, "step": 1894 }, { "epoch": 2.858867924528302, "grad_norm": 0.24054866492710766, "learning_rate": 2.5461667599328485e-06, "loss": 0.5187, "step": 1895 }, { "epoch": 2.860377358490566, "grad_norm": 0.24376318441166045, "learning_rate": 2.518186905428092e-06, "loss": 0.4887, "step": 1896 }, { "epoch": 2.86188679245283, "grad_norm": 0.2410895191336604, "learning_rate": 2.4902070509233352e-06, "loss": 0.4956, "step": 1897 }, { "epoch": 2.8633962264150945, "grad_norm": 0.2389979905294272, "learning_rate": 2.462227196418579e-06, "loss": 0.5055, "step": 1898 }, { "epoch": 2.8649056603773584, "grad_norm": 0.24099313691698937, "learning_rate": 2.4342473419138224e-06, "loss": 0.4998, "step": 1899 }, { "epoch": 2.8664150943396227, "grad_norm": 0.24527579020611284, "learning_rate": 2.4062674874090653e-06, "loss": 0.5264, "step": 1900 }, { "epoch": 2.867924528301887, "grad_norm": 0.23933465605234516, "learning_rate": 2.378287632904309e-06, "loss": 0.4959, "step": 1901 }, { "epoch": 2.869433962264151, "grad_norm": 0.24622168013990314, "learning_rate": 2.3503077783995524e-06, "loss": 0.5173, "step": 1902 }, { "epoch": 2.8709433962264153, "grad_norm": 0.23919977406233273, "learning_rate": 2.322327923894796e-06, "loss": 0.4966, "step": 1903 }, { "epoch": 2.872452830188679, "grad_norm": 0.261296807681531, "learning_rate": 2.294348069390039e-06, "loss": 0.5598, "step": 1904 }, { "epoch": 2.8739622641509435, "grad_norm": 0.2799512370951401, "learning_rate": 2.266368214885283e-06, "loss": 0.4819, "step": 1905 }, { "epoch": 2.8754716981132074, "grad_norm": 0.23214062234311064, "learning_rate": 2.238388360380526e-06, "loss": 0.5261, "step": 1906 }, { "epoch": 2.8769811320754717, "grad_norm": 0.24329861142019843, "learning_rate": 2.2104085058757697e-06, "loss": 0.5273, "step": 1907 }, { "epoch": 2.878490566037736, "grad_norm": 0.24170422215332774, "learning_rate": 2.182428651371013e-06, "loss": 0.4899, "step": 1908 }, { "epoch": 2.88, "grad_norm": 0.21557021451090852, "learning_rate": 2.1544487968662564e-06, "loss": 0.4499, "step": 1909 }, { "epoch": 2.881509433962264, "grad_norm": 0.24761336600665296, "learning_rate": 2.1264689423614998e-06, "loss": 0.5349, "step": 1910 }, { "epoch": 2.8830188679245285, "grad_norm": 0.28587028512046125, "learning_rate": 2.0984890878567436e-06, "loss": 0.4769, "step": 1911 }, { "epoch": 2.8845283018867924, "grad_norm": 0.23215345407826837, "learning_rate": 2.0705092333519865e-06, "loss": 0.5057, "step": 1912 }, { "epoch": 2.8860377358490568, "grad_norm": 0.23557540955317524, "learning_rate": 2.04252937884723e-06, "loss": 0.4983, "step": 1913 }, { "epoch": 2.8875471698113206, "grad_norm": 0.24826597774453552, "learning_rate": 2.0145495243424736e-06, "loss": 0.506, "step": 1914 }, { "epoch": 2.889056603773585, "grad_norm": 0.27097286067099186, "learning_rate": 1.986569669837717e-06, "loss": 0.4824, "step": 1915 }, { "epoch": 2.890566037735849, "grad_norm": 0.2323466023436389, "learning_rate": 1.9585898153329604e-06, "loss": 0.49, "step": 1916 }, { "epoch": 2.892075471698113, "grad_norm": 0.24530440733332493, "learning_rate": 1.9306099608282037e-06, "loss": 0.5128, "step": 1917 }, { "epoch": 2.8935849056603775, "grad_norm": 0.2217381642822851, "learning_rate": 1.9026301063234473e-06, "loss": 0.4941, "step": 1918 }, { "epoch": 2.8950943396226414, "grad_norm": 0.3033421523357643, "learning_rate": 1.8746502518186907e-06, "loss": 0.5488, "step": 1919 }, { "epoch": 2.8966037735849057, "grad_norm": 0.22952196943803127, "learning_rate": 1.8466703973139342e-06, "loss": 0.4873, "step": 1920 }, { "epoch": 2.89811320754717, "grad_norm": 0.24646380697980175, "learning_rate": 1.8186905428091774e-06, "loss": 0.5466, "step": 1921 }, { "epoch": 2.899622641509434, "grad_norm": 0.2931648546342374, "learning_rate": 1.790710688304421e-06, "loss": 0.494, "step": 1922 }, { "epoch": 2.9011320754716983, "grad_norm": 0.6019713876730823, "learning_rate": 1.7627308337996643e-06, "loss": 0.5361, "step": 1923 }, { "epoch": 2.902641509433962, "grad_norm": 0.2643446098419106, "learning_rate": 1.7347509792949075e-06, "loss": 0.51, "step": 1924 }, { "epoch": 2.9041509433962265, "grad_norm": 0.24476304649977285, "learning_rate": 1.7067711247901513e-06, "loss": 0.5391, "step": 1925 }, { "epoch": 2.9056603773584904, "grad_norm": 0.24501363596775003, "learning_rate": 1.6787912702853944e-06, "loss": 0.5126, "step": 1926 }, { "epoch": 2.9071698113207547, "grad_norm": 0.23738294037707477, "learning_rate": 1.650811415780638e-06, "loss": 0.4981, "step": 1927 }, { "epoch": 2.908679245283019, "grad_norm": 0.2353858028048875, "learning_rate": 1.6228315612758814e-06, "loss": 0.5393, "step": 1928 }, { "epoch": 2.910188679245283, "grad_norm": 0.24557576052226965, "learning_rate": 1.594851706771125e-06, "loss": 0.5146, "step": 1929 }, { "epoch": 2.9116981132075472, "grad_norm": 0.23598740859198736, "learning_rate": 1.5668718522663683e-06, "loss": 0.5069, "step": 1930 }, { "epoch": 2.9132075471698116, "grad_norm": 0.23965223087346654, "learning_rate": 1.5388919977616117e-06, "loss": 0.5293, "step": 1931 }, { "epoch": 2.9147169811320754, "grad_norm": 0.23578049631296016, "learning_rate": 1.510912143256855e-06, "loss": 0.5237, "step": 1932 }, { "epoch": 2.9162264150943398, "grad_norm": 0.2512094274756097, "learning_rate": 1.4829322887520986e-06, "loss": 0.5305, "step": 1933 }, { "epoch": 2.9177358490566037, "grad_norm": 0.2419640704406858, "learning_rate": 1.454952434247342e-06, "loss": 0.5019, "step": 1934 }, { "epoch": 2.919245283018868, "grad_norm": 0.2341231605907135, "learning_rate": 1.4269725797425853e-06, "loss": 0.493, "step": 1935 }, { "epoch": 2.920754716981132, "grad_norm": 0.22368464083267006, "learning_rate": 1.398992725237829e-06, "loss": 0.479, "step": 1936 }, { "epoch": 2.922264150943396, "grad_norm": 0.23934904699725973, "learning_rate": 1.3710128707330723e-06, "loss": 0.5182, "step": 1937 }, { "epoch": 2.9237735849056605, "grad_norm": 0.2423304768056013, "learning_rate": 1.3430330162283156e-06, "loss": 0.5287, "step": 1938 }, { "epoch": 2.9252830188679244, "grad_norm": 0.24220829389874426, "learning_rate": 1.3150531617235592e-06, "loss": 0.5131, "step": 1939 }, { "epoch": 2.9267924528301887, "grad_norm": 0.2408649756673509, "learning_rate": 1.2870733072188026e-06, "loss": 0.5256, "step": 1940 }, { "epoch": 2.928301886792453, "grad_norm": 0.26970041615884593, "learning_rate": 1.259093452714046e-06, "loss": 0.5247, "step": 1941 }, { "epoch": 2.929811320754717, "grad_norm": 0.2462934836700248, "learning_rate": 1.2311135982092895e-06, "loss": 0.5274, "step": 1942 }, { "epoch": 2.9313207547169813, "grad_norm": 0.24346784211748765, "learning_rate": 1.2031337437045327e-06, "loss": 0.5353, "step": 1943 }, { "epoch": 2.932830188679245, "grad_norm": 0.2308252685210892, "learning_rate": 1.1751538891997762e-06, "loss": 0.4403, "step": 1944 }, { "epoch": 2.9343396226415095, "grad_norm": 0.24611093220055966, "learning_rate": 1.1471740346950196e-06, "loss": 0.4911, "step": 1945 }, { "epoch": 2.9358490566037734, "grad_norm": 0.23677755966512776, "learning_rate": 1.119194180190263e-06, "loss": 0.4889, "step": 1946 }, { "epoch": 2.9373584905660377, "grad_norm": 0.2390396184181226, "learning_rate": 1.0912143256855065e-06, "loss": 0.4938, "step": 1947 }, { "epoch": 2.938867924528302, "grad_norm": 0.2298114470810896, "learning_rate": 1.0632344711807499e-06, "loss": 0.5164, "step": 1948 }, { "epoch": 2.940377358490566, "grad_norm": 0.23867767626825967, "learning_rate": 1.0352546166759932e-06, "loss": 0.5339, "step": 1949 }, { "epoch": 2.9418867924528302, "grad_norm": 0.2273239656737244, "learning_rate": 1.0072747621712368e-06, "loss": 0.4873, "step": 1950 }, { "epoch": 2.9433962264150946, "grad_norm": 0.2510529360776652, "learning_rate": 9.792949076664802e-07, "loss": 0.541, "step": 1951 }, { "epoch": 2.9449056603773585, "grad_norm": 0.2420207367421682, "learning_rate": 9.513150531617237e-07, "loss": 0.5357, "step": 1952 }, { "epoch": 2.946415094339623, "grad_norm": 0.23866840294873037, "learning_rate": 9.233351986569671e-07, "loss": 0.516, "step": 1953 }, { "epoch": 2.9479245283018867, "grad_norm": 0.23833048632270618, "learning_rate": 8.953553441522105e-07, "loss": 0.5001, "step": 1954 }, { "epoch": 2.949433962264151, "grad_norm": 0.23240358196803712, "learning_rate": 8.673754896474537e-07, "loss": 0.5263, "step": 1955 }, { "epoch": 2.950943396226415, "grad_norm": 0.27536994744227083, "learning_rate": 8.393956351426972e-07, "loss": 0.4919, "step": 1956 }, { "epoch": 2.952452830188679, "grad_norm": 0.23738342408866947, "learning_rate": 8.114157806379407e-07, "loss": 0.4924, "step": 1957 }, { "epoch": 2.9539622641509435, "grad_norm": 0.2227032657641613, "learning_rate": 7.834359261331841e-07, "loss": 0.4912, "step": 1958 }, { "epoch": 2.9554716981132074, "grad_norm": 0.23463528292989802, "learning_rate": 7.554560716284275e-07, "loss": 0.4838, "step": 1959 }, { "epoch": 2.9569811320754718, "grad_norm": 0.2351782144870339, "learning_rate": 7.27476217123671e-07, "loss": 0.5385, "step": 1960 }, { "epoch": 2.958490566037736, "grad_norm": 0.30869561893156994, "learning_rate": 6.994963626189144e-07, "loss": 0.5375, "step": 1961 }, { "epoch": 2.96, "grad_norm": 0.2507617576529779, "learning_rate": 6.715165081141578e-07, "loss": 0.5086, "step": 1962 }, { "epoch": 2.9615094339622643, "grad_norm": 0.26263822536508746, "learning_rate": 6.435366536094013e-07, "loss": 0.5411, "step": 1963 }, { "epoch": 2.963018867924528, "grad_norm": 0.22776692640780824, "learning_rate": 6.155567991046447e-07, "loss": 0.4882, "step": 1964 }, { "epoch": 2.9645283018867925, "grad_norm": 0.23187613900036108, "learning_rate": 5.875769445998881e-07, "loss": 0.5031, "step": 1965 }, { "epoch": 2.9660377358490564, "grad_norm": 0.23980976100159304, "learning_rate": 5.595970900951315e-07, "loss": 0.5111, "step": 1966 }, { "epoch": 2.9675471698113207, "grad_norm": 0.23454625576676985, "learning_rate": 5.316172355903749e-07, "loss": 0.4886, "step": 1967 }, { "epoch": 2.969056603773585, "grad_norm": 0.22466469775472325, "learning_rate": 5.036373810856184e-07, "loss": 0.4927, "step": 1968 }, { "epoch": 2.970566037735849, "grad_norm": 0.2424670877215881, "learning_rate": 4.7565752658086183e-07, "loss": 0.5188, "step": 1969 }, { "epoch": 2.9720754716981133, "grad_norm": 0.2634277432088806, "learning_rate": 4.4767767207610524e-07, "loss": 0.4995, "step": 1970 }, { "epoch": 2.9735849056603776, "grad_norm": 0.24871804164038816, "learning_rate": 4.196978175713486e-07, "loss": 0.4975, "step": 1971 }, { "epoch": 2.9750943396226415, "grad_norm": 0.22918011887355255, "learning_rate": 3.917179630665921e-07, "loss": 0.5146, "step": 1972 }, { "epoch": 2.976603773584906, "grad_norm": 0.24494562015577118, "learning_rate": 3.637381085618355e-07, "loss": 0.5191, "step": 1973 }, { "epoch": 2.9781132075471697, "grad_norm": 0.24554508049209534, "learning_rate": 3.357582540570789e-07, "loss": 0.5134, "step": 1974 }, { "epoch": 2.979622641509434, "grad_norm": 0.24686957526571518, "learning_rate": 3.077783995523224e-07, "loss": 0.4754, "step": 1975 }, { "epoch": 2.981132075471698, "grad_norm": 0.2309101984662458, "learning_rate": 2.7979854504756574e-07, "loss": 0.4805, "step": 1976 }, { "epoch": 2.9826415094339622, "grad_norm": 0.23759627074403436, "learning_rate": 2.518186905428092e-07, "loss": 0.5359, "step": 1977 }, { "epoch": 2.9841509433962266, "grad_norm": 0.25434227095914247, "learning_rate": 2.2383883603805262e-07, "loss": 0.5258, "step": 1978 }, { "epoch": 2.9856603773584904, "grad_norm": 0.24774479007599787, "learning_rate": 1.9585898153329604e-07, "loss": 0.5029, "step": 1979 }, { "epoch": 2.9871698113207548, "grad_norm": 0.2562930857219345, "learning_rate": 1.6787912702853945e-07, "loss": 0.5213, "step": 1980 }, { "epoch": 2.988679245283019, "grad_norm": 0.24488783175852774, "learning_rate": 1.3989927252378287e-07, "loss": 0.5062, "step": 1981 }, { "epoch": 2.990188679245283, "grad_norm": 0.24061088726491453, "learning_rate": 1.1191941801902631e-07, "loss": 0.5221, "step": 1982 }, { "epoch": 2.9916981132075473, "grad_norm": 0.23745813306754143, "learning_rate": 8.393956351426973e-08, "loss": 0.4621, "step": 1983 }, { "epoch": 2.993207547169811, "grad_norm": 0.24766707785226677, "learning_rate": 5.5959709009513155e-08, "loss": 0.5439, "step": 1984 }, { "epoch": 2.9947169811320755, "grad_norm": 0.22755699352891728, "learning_rate": 2.7979854504756578e-08, "loss": 0.4968, "step": 1985 }, { "epoch": 2.9962264150943394, "grad_norm": 0.2639171679918981, "learning_rate": 0.0, "loss": 0.5176, "step": 1986 }, { "epoch": 2.9962264150943394, "step": 1986, "total_flos": 1.6837767016679997e+18, "train_loss": 0.7350575219978619, "train_runtime": 115424.0134, "train_samples_per_second": 0.275, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 1986, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6837767016679997e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }