{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997262581596125, "eval_steps": 500, "global_step": 1480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033691303432301536, "grad_norm": 6.109744437799009, "learning_rate": 5.405405405405406e-07, "loss": 0.8395, "step": 1 }, { "epoch": 0.006738260686460307, "grad_norm": 6.213278091341755, "learning_rate": 1.0810810810810812e-06, "loss": 0.8539, "step": 2 }, { "epoch": 0.01010739102969046, "grad_norm": 6.2309201627607536, "learning_rate": 1.6216216216216219e-06, "loss": 0.862, "step": 3 }, { "epoch": 0.013476521372920615, "grad_norm": 5.960777622679009, "learning_rate": 2.1621621621621623e-06, "loss": 0.8418, "step": 4 }, { "epoch": 0.01684565171615077, "grad_norm": 5.5645531135403825, "learning_rate": 2.702702702702703e-06, "loss": 0.8303, "step": 5 }, { "epoch": 0.02021478205938092, "grad_norm": 4.368766112287261, "learning_rate": 3.2432432432432437e-06, "loss": 0.7949, "step": 6 }, { "epoch": 0.023583912402611075, "grad_norm": 2.4282762566778153, "learning_rate": 3.7837837837837844e-06, "loss": 0.7538, "step": 7 }, { "epoch": 0.02695304274584123, "grad_norm": 2.286826804105074, "learning_rate": 4.324324324324325e-06, "loss": 0.7537, "step": 8 }, { "epoch": 0.030322173089071383, "grad_norm": 1.7230122995757746, "learning_rate": 4.864864864864866e-06, "loss": 0.7465, "step": 9 }, { "epoch": 0.03369130343230154, "grad_norm": 4.178444915489739, "learning_rate": 5.405405405405406e-06, "loss": 0.7473, "step": 10 }, { "epoch": 0.03706043377553169, "grad_norm": 4.418045830813186, "learning_rate": 5.945945945945947e-06, "loss": 0.7374, "step": 11 }, { "epoch": 0.04042956411876184, "grad_norm": 4.465657225144453, "learning_rate": 6.486486486486487e-06, "loss": 0.7306, "step": 12 }, { "epoch": 0.043798694461992, "grad_norm": 4.089235560381299, "learning_rate": 7.027027027027028e-06, "loss": 0.6953, "step": 13 }, { "epoch": 0.04716782480522215, "grad_norm": 3.9135602100241798, "learning_rate": 7.567567567567569e-06, "loss": 0.6963, "step": 14 }, { "epoch": 0.05053695514845231, "grad_norm": 2.799692430785198, "learning_rate": 8.108108108108109e-06, "loss": 0.6717, "step": 15 }, { "epoch": 0.05390608549168246, "grad_norm": 1.7463496609273474, "learning_rate": 8.64864864864865e-06, "loss": 0.6544, "step": 16 }, { "epoch": 0.057275215834912616, "grad_norm": 1.788941312078948, "learning_rate": 9.189189189189191e-06, "loss": 0.6521, "step": 17 }, { "epoch": 0.060644346178142766, "grad_norm": 2.287622821921788, "learning_rate": 9.729729729729732e-06, "loss": 0.6429, "step": 18 }, { "epoch": 0.06401347652137292, "grad_norm": 1.960430434087058, "learning_rate": 1.027027027027027e-05, "loss": 0.6392, "step": 19 }, { "epoch": 0.06738260686460308, "grad_norm": 1.349903658959779, "learning_rate": 1.0810810810810812e-05, "loss": 0.6281, "step": 20 }, { "epoch": 0.07075173720783323, "grad_norm": 1.2019647976807795, "learning_rate": 1.1351351351351352e-05, "loss": 0.6151, "step": 21 }, { "epoch": 0.07412086755106338, "grad_norm": 0.9434380017683085, "learning_rate": 1.1891891891891894e-05, "loss": 0.604, "step": 22 }, { "epoch": 0.07748999789429353, "grad_norm": 1.1709993819720563, "learning_rate": 1.2432432432432433e-05, "loss": 0.6027, "step": 23 }, { "epoch": 0.08085912823752368, "grad_norm": 0.7729753637518647, "learning_rate": 1.2972972972972975e-05, "loss": 0.6009, "step": 24 }, { "epoch": 0.08422825858075385, "grad_norm": 0.9253134596999972, "learning_rate": 1.3513513513513515e-05, "loss": 0.5784, "step": 25 }, { "epoch": 0.087597388923984, "grad_norm": 0.7203325434817915, "learning_rate": 1.4054054054054055e-05, "loss": 0.5784, "step": 26 }, { "epoch": 0.09096651926721415, "grad_norm": 0.7738717415397595, "learning_rate": 1.4594594594594596e-05, "loss": 0.5724, "step": 27 }, { "epoch": 0.0943356496104443, "grad_norm": 0.756189527437592, "learning_rate": 1.5135135135135138e-05, "loss": 0.5751, "step": 28 }, { "epoch": 0.09770477995367446, "grad_norm": 0.5619223236064955, "learning_rate": 1.5675675675675676e-05, "loss": 0.5632, "step": 29 }, { "epoch": 0.10107391029690461, "grad_norm": 0.6416604397150266, "learning_rate": 1.6216216216216218e-05, "loss": 0.5624, "step": 30 }, { "epoch": 0.10444304064013477, "grad_norm": 0.5213594465327983, "learning_rate": 1.6756756756756757e-05, "loss": 0.5618, "step": 31 }, { "epoch": 0.10781217098336492, "grad_norm": 0.6096320957570693, "learning_rate": 1.72972972972973e-05, "loss": 0.5569, "step": 32 }, { "epoch": 0.11118130132659507, "grad_norm": 0.45054132971017113, "learning_rate": 1.783783783783784e-05, "loss": 0.547, "step": 33 }, { "epoch": 0.11455043166982523, "grad_norm": 0.4599454116974351, "learning_rate": 1.8378378378378383e-05, "loss": 0.5484, "step": 34 }, { "epoch": 0.11791956201305538, "grad_norm": 0.5174247359394964, "learning_rate": 1.891891891891892e-05, "loss": 0.5493, "step": 35 }, { "epoch": 0.12128869235628553, "grad_norm": 0.37366751205930016, "learning_rate": 1.9459459459459463e-05, "loss": 0.5391, "step": 36 }, { "epoch": 0.12465782269951568, "grad_norm": 0.48653413640763127, "learning_rate": 2e-05, "loss": 0.5403, "step": 37 }, { "epoch": 0.12802695304274583, "grad_norm": 0.3651703371460431, "learning_rate": 2.054054054054054e-05, "loss": 0.5432, "step": 38 }, { "epoch": 0.131396083385976, "grad_norm": 0.5149214857154895, "learning_rate": 2.1081081081081082e-05, "loss": 0.5351, "step": 39 }, { "epoch": 0.13476521372920616, "grad_norm": 0.3233162021565579, "learning_rate": 2.1621621621621624e-05, "loss": 0.5358, "step": 40 }, { "epoch": 0.1381343440724363, "grad_norm": 0.3994220446903501, "learning_rate": 2.2162162162162163e-05, "loss": 0.5215, "step": 41 }, { "epoch": 0.14150347441566646, "grad_norm": 0.3613182742062023, "learning_rate": 2.2702702702702705e-05, "loss": 0.5306, "step": 42 }, { "epoch": 0.1448726047588966, "grad_norm": 0.3444241276930956, "learning_rate": 2.3243243243243243e-05, "loss": 0.5255, "step": 43 }, { "epoch": 0.14824173510212676, "grad_norm": 0.4493831957153716, "learning_rate": 2.378378378378379e-05, "loss": 0.525, "step": 44 }, { "epoch": 0.15161086544535693, "grad_norm": 0.44813168016549754, "learning_rate": 2.4324324324324327e-05, "loss": 0.5182, "step": 45 }, { "epoch": 0.15497999578858707, "grad_norm": 0.6714598609090919, "learning_rate": 2.4864864864864866e-05, "loss": 0.5236, "step": 46 }, { "epoch": 0.15834912613181723, "grad_norm": 0.8782480079092542, "learning_rate": 2.5405405405405404e-05, "loss": 0.5189, "step": 47 }, { "epoch": 0.16171825647504737, "grad_norm": 0.933082396567328, "learning_rate": 2.594594594594595e-05, "loss": 0.5136, "step": 48 }, { "epoch": 0.16508738681827753, "grad_norm": 0.770716538259162, "learning_rate": 2.6486486486486488e-05, "loss": 0.515, "step": 49 }, { "epoch": 0.1684565171615077, "grad_norm": 0.5162725525339301, "learning_rate": 2.702702702702703e-05, "loss": 0.5102, "step": 50 }, { "epoch": 0.17182564750473783, "grad_norm": 1.015431854648304, "learning_rate": 2.756756756756757e-05, "loss": 0.5155, "step": 51 }, { "epoch": 0.175194777847968, "grad_norm": 0.9581799901627799, "learning_rate": 2.810810810810811e-05, "loss": 0.5144, "step": 52 }, { "epoch": 0.17856390819119813, "grad_norm": 0.7963519939531664, "learning_rate": 2.8648648648648653e-05, "loss": 0.5097, "step": 53 }, { "epoch": 0.1819330385344283, "grad_norm": 1.2154735765132731, "learning_rate": 2.918918918918919e-05, "loss": 0.5, "step": 54 }, { "epoch": 0.18530216887765846, "grad_norm": 0.5396425181695459, "learning_rate": 2.972972972972973e-05, "loss": 0.5028, "step": 55 }, { "epoch": 0.1886712992208886, "grad_norm": 1.02978967009914, "learning_rate": 3.0270270270270275e-05, "loss": 0.5081, "step": 56 }, { "epoch": 0.19204042956411876, "grad_norm": 0.8713408712796716, "learning_rate": 3.081081081081082e-05, "loss": 0.5117, "step": 57 }, { "epoch": 0.19540955990734893, "grad_norm": 0.5039355010657219, "learning_rate": 3.135135135135135e-05, "loss": 0.5062, "step": 58 }, { "epoch": 0.19877869025057907, "grad_norm": 0.8903957505195225, "learning_rate": 3.1891891891891894e-05, "loss": 0.5065, "step": 59 }, { "epoch": 0.20214782059380923, "grad_norm": 0.6438414779422286, "learning_rate": 3.2432432432432436e-05, "loss": 0.505, "step": 60 }, { "epoch": 0.20551695093703937, "grad_norm": 0.7196844068315907, "learning_rate": 3.297297297297298e-05, "loss": 0.5027, "step": 61 }, { "epoch": 0.20888608128026953, "grad_norm": 0.768470871542238, "learning_rate": 3.351351351351351e-05, "loss": 0.4952, "step": 62 }, { "epoch": 0.2122552116234997, "grad_norm": 1.2385285643157924, "learning_rate": 3.4054054054054055e-05, "loss": 0.4998, "step": 63 }, { "epoch": 0.21562434196672983, "grad_norm": 1.2926026603680456, "learning_rate": 3.45945945945946e-05, "loss": 0.5059, "step": 64 }, { "epoch": 0.21899347230996, "grad_norm": 0.5270463926051342, "learning_rate": 3.513513513513514e-05, "loss": 0.4882, "step": 65 }, { "epoch": 0.22236260265319013, "grad_norm": 0.9232242256936453, "learning_rate": 3.567567567567568e-05, "loss": 0.5001, "step": 66 }, { "epoch": 0.2257317329964203, "grad_norm": 1.1802567310747412, "learning_rate": 3.6216216216216216e-05, "loss": 0.4988, "step": 67 }, { "epoch": 0.22910086333965046, "grad_norm": 0.6824243910642153, "learning_rate": 3.6756756756756765e-05, "loss": 0.4968, "step": 68 }, { "epoch": 0.2324699936828806, "grad_norm": 1.1492977773270214, "learning_rate": 3.72972972972973e-05, "loss": 0.492, "step": 69 }, { "epoch": 0.23583912402611076, "grad_norm": 0.6936995427226362, "learning_rate": 3.783783783783784e-05, "loss": 0.484, "step": 70 }, { "epoch": 0.2392082543693409, "grad_norm": 0.8248787799708727, "learning_rate": 3.837837837837838e-05, "loss": 0.4875, "step": 71 }, { "epoch": 0.24257738471257106, "grad_norm": 0.9148972625023225, "learning_rate": 3.8918918918918926e-05, "loss": 0.4936, "step": 72 }, { "epoch": 0.24594651505580123, "grad_norm": 0.9090987008379405, "learning_rate": 3.945945945945946e-05, "loss": 0.4915, "step": 73 }, { "epoch": 0.24931564539903137, "grad_norm": 1.3515657782891444, "learning_rate": 4e-05, "loss": 0.4898, "step": 74 }, { "epoch": 0.2526847757422615, "grad_norm": 0.9152494686227338, "learning_rate": 4.0540540540540545e-05, "loss": 0.492, "step": 75 }, { "epoch": 0.25605390608549167, "grad_norm": 1.5681418879764368, "learning_rate": 4.108108108108108e-05, "loss": 0.4927, "step": 76 }, { "epoch": 0.25942303642872183, "grad_norm": 0.7357882817582275, "learning_rate": 4.162162162162163e-05, "loss": 0.4916, "step": 77 }, { "epoch": 0.262792166771952, "grad_norm": 1.8207664011692413, "learning_rate": 4.2162162162162164e-05, "loss": 0.496, "step": 78 }, { "epoch": 0.26616129711518216, "grad_norm": 1.1821288274944997, "learning_rate": 4.2702702702702706e-05, "loss": 0.4853, "step": 79 }, { "epoch": 0.2695304274584123, "grad_norm": 1.6328730676456176, "learning_rate": 4.324324324324325e-05, "loss": 0.4942, "step": 80 }, { "epoch": 0.27289955780164243, "grad_norm": 1.5462148913519038, "learning_rate": 4.3783783783783783e-05, "loss": 0.5016, "step": 81 }, { "epoch": 0.2762686881448726, "grad_norm": 1.0513902425500052, "learning_rate": 4.4324324324324325e-05, "loss": 0.4816, "step": 82 }, { "epoch": 0.27963781848810276, "grad_norm": 1.6621005750940228, "learning_rate": 4.4864864864864874e-05, "loss": 0.4999, "step": 83 }, { "epoch": 0.2830069488313329, "grad_norm": 1.0166569610477454, "learning_rate": 4.540540540540541e-05, "loss": 0.4909, "step": 84 }, { "epoch": 0.2863760791745631, "grad_norm": 1.3532816330257298, "learning_rate": 4.594594594594595e-05, "loss": 0.4853, "step": 85 }, { "epoch": 0.2897452095177932, "grad_norm": 1.1921249261435982, "learning_rate": 4.6486486486486486e-05, "loss": 0.4907, "step": 86 }, { "epoch": 0.29311433986102337, "grad_norm": 0.7446095855395451, "learning_rate": 4.702702702702703e-05, "loss": 0.4829, "step": 87 }, { "epoch": 0.29648347020425353, "grad_norm": 1.13704598401882, "learning_rate": 4.756756756756758e-05, "loss": 0.4923, "step": 88 }, { "epoch": 0.2998526005474837, "grad_norm": 0.9773983706335432, "learning_rate": 4.810810810810811e-05, "loss": 0.4925, "step": 89 }, { "epoch": 0.30322173089071386, "grad_norm": 1.405804106766561, "learning_rate": 4.8648648648648654e-05, "loss": 0.4988, "step": 90 }, { "epoch": 0.30659086123394397, "grad_norm": 1.2120207198526078, "learning_rate": 4.9189189189189196e-05, "loss": 0.493, "step": 91 }, { "epoch": 0.30995999157717413, "grad_norm": 1.209860911666279, "learning_rate": 4.972972972972973e-05, "loss": 0.4871, "step": 92 }, { "epoch": 0.3133291219204043, "grad_norm": 1.1742086251606596, "learning_rate": 5.027027027027027e-05, "loss": 0.4892, "step": 93 }, { "epoch": 0.31669825226363446, "grad_norm": 0.9715216714272247, "learning_rate": 5.081081081081081e-05, "loss": 0.4869, "step": 94 }, { "epoch": 0.3200673826068646, "grad_norm": 1.1287315453457836, "learning_rate": 5.135135135135136e-05, "loss": 0.483, "step": 95 }, { "epoch": 0.32343651295009473, "grad_norm": 1.3753216500561123, "learning_rate": 5.18918918918919e-05, "loss": 0.5011, "step": 96 }, { "epoch": 0.3268056432933249, "grad_norm": 1.0595415038900966, "learning_rate": 5.2432432432432434e-05, "loss": 0.4872, "step": 97 }, { "epoch": 0.33017477363655506, "grad_norm": 0.993959867595984, "learning_rate": 5.2972972972972976e-05, "loss": 0.4874, "step": 98 }, { "epoch": 0.33354390397978523, "grad_norm": 0.9824629794475075, "learning_rate": 5.3513513513513525e-05, "loss": 0.4813, "step": 99 }, { "epoch": 0.3369130343230154, "grad_norm": 1.1711893920755474, "learning_rate": 5.405405405405406e-05, "loss": 0.4799, "step": 100 }, { "epoch": 0.3402821646662455, "grad_norm": 0.6759536241348655, "learning_rate": 5.45945945945946e-05, "loss": 0.472, "step": 101 }, { "epoch": 0.34365129500947567, "grad_norm": 0.7703133612780192, "learning_rate": 5.513513513513514e-05, "loss": 0.484, "step": 102 }, { "epoch": 0.34702042535270583, "grad_norm": 0.7769913812178919, "learning_rate": 5.567567567567568e-05, "loss": 0.4758, "step": 103 }, { "epoch": 0.350389555695936, "grad_norm": 0.8455984342427874, "learning_rate": 5.621621621621622e-05, "loss": 0.4807, "step": 104 }, { "epoch": 0.35375868603916616, "grad_norm": 1.0743068767474058, "learning_rate": 5.6756756756756757e-05, "loss": 0.4845, "step": 105 }, { "epoch": 0.35712781638239627, "grad_norm": 1.098648574354644, "learning_rate": 5.7297297297297305e-05, "loss": 0.4912, "step": 106 }, { "epoch": 0.36049694672562643, "grad_norm": 1.6110582797161879, "learning_rate": 5.783783783783785e-05, "loss": 0.4975, "step": 107 }, { "epoch": 0.3638660770688566, "grad_norm": 0.8739341305512099, "learning_rate": 5.837837837837838e-05, "loss": 0.4785, "step": 108 }, { "epoch": 0.36723520741208676, "grad_norm": 1.267103393588983, "learning_rate": 5.8918918918918924e-05, "loss": 0.4922, "step": 109 }, { "epoch": 0.3706043377553169, "grad_norm": 1.2547562478396024, "learning_rate": 5.945945945945946e-05, "loss": 0.4865, "step": 110 }, { "epoch": 0.37397346809854703, "grad_norm": 1.1629837530431066, "learning_rate": 6.000000000000001e-05, "loss": 0.4815, "step": 111 }, { "epoch": 0.3773425984417772, "grad_norm": 0.8450218760805568, "learning_rate": 6.054054054054055e-05, "loss": 0.4795, "step": 112 }, { "epoch": 0.38071172878500736, "grad_norm": 1.116338785518496, "learning_rate": 6.108108108108108e-05, "loss": 0.4806, "step": 113 }, { "epoch": 0.38408085912823753, "grad_norm": 0.8964893535442878, "learning_rate": 6.162162162162163e-05, "loss": 0.4756, "step": 114 }, { "epoch": 0.3874499894714677, "grad_norm": 0.5881211331403472, "learning_rate": 6.216216216216216e-05, "loss": 0.4732, "step": 115 }, { "epoch": 0.39081911981469786, "grad_norm": 0.7134703598403237, "learning_rate": 6.27027027027027e-05, "loss": 0.4788, "step": 116 }, { "epoch": 0.39418825015792797, "grad_norm": 0.5897113644451194, "learning_rate": 6.324324324324325e-05, "loss": 0.4728, "step": 117 }, { "epoch": 0.39755738050115813, "grad_norm": 0.6261295369849983, "learning_rate": 6.378378378378379e-05, "loss": 0.4773, "step": 118 }, { "epoch": 0.4009265108443883, "grad_norm": 0.6923893019220413, "learning_rate": 6.432432432432433e-05, "loss": 0.4762, "step": 119 }, { "epoch": 0.40429564118761846, "grad_norm": 0.9773203912887567, "learning_rate": 6.486486486486487e-05, "loss": 0.4835, "step": 120 }, { "epoch": 0.4076647715308486, "grad_norm": 1.2845541454753984, "learning_rate": 6.540540540540541e-05, "loss": 0.4742, "step": 121 }, { "epoch": 0.41103390187407873, "grad_norm": 0.7789854415201711, "learning_rate": 6.594594594594596e-05, "loss": 0.4693, "step": 122 }, { "epoch": 0.4144030322173089, "grad_norm": 0.8431373845569413, "learning_rate": 6.648648648648648e-05, "loss": 0.4831, "step": 123 }, { "epoch": 0.41777216256053906, "grad_norm": 0.8809025032679428, "learning_rate": 6.702702702702703e-05, "loss": 0.48, "step": 124 }, { "epoch": 0.4211412929037692, "grad_norm": 0.7787166669908304, "learning_rate": 6.756756756756758e-05, "loss": 0.4861, "step": 125 }, { "epoch": 0.4245104232469994, "grad_norm": 0.7332477510232297, "learning_rate": 6.810810810810811e-05, "loss": 0.478, "step": 126 }, { "epoch": 0.4278795535902295, "grad_norm": 1.3066224708152618, "learning_rate": 6.864864864864865e-05, "loss": 0.4813, "step": 127 }, { "epoch": 0.43124868393345966, "grad_norm": 0.9836726211958252, "learning_rate": 6.91891891891892e-05, "loss": 0.4714, "step": 128 }, { "epoch": 0.43461781427668983, "grad_norm": 0.9119614187689974, "learning_rate": 6.972972972972974e-05, "loss": 0.4769, "step": 129 }, { "epoch": 0.43798694461992, "grad_norm": 0.6243806457837586, "learning_rate": 7.027027027027028e-05, "loss": 0.4794, "step": 130 }, { "epoch": 0.44135607496315016, "grad_norm": 0.7687415551391915, "learning_rate": 7.081081081081081e-05, "loss": 0.4732, "step": 131 }, { "epoch": 0.44472520530638027, "grad_norm": 1.0092750754274926, "learning_rate": 7.135135135135136e-05, "loss": 0.4776, "step": 132 }, { "epoch": 0.44809433564961043, "grad_norm": 0.8380704714410115, "learning_rate": 7.18918918918919e-05, "loss": 0.4648, "step": 133 }, { "epoch": 0.4514634659928406, "grad_norm": 0.6767690752757246, "learning_rate": 7.243243243243243e-05, "loss": 0.4609, "step": 134 }, { "epoch": 0.45483259633607076, "grad_norm": 0.9804619524504721, "learning_rate": 7.297297297297297e-05, "loss": 0.4713, "step": 135 }, { "epoch": 0.4582017266793009, "grad_norm": 1.3776587580151205, "learning_rate": 7.351351351351353e-05, "loss": 0.4794, "step": 136 }, { "epoch": 0.46157085702253103, "grad_norm": 0.5714903502719861, "learning_rate": 7.405405405405406e-05, "loss": 0.4693, "step": 137 }, { "epoch": 0.4649399873657612, "grad_norm": 1.1165381199232975, "learning_rate": 7.45945945945946e-05, "loss": 0.4789, "step": 138 }, { "epoch": 0.46830911770899136, "grad_norm": 1.1391177830520112, "learning_rate": 7.513513513513514e-05, "loss": 0.4763, "step": 139 }, { "epoch": 0.4716782480522215, "grad_norm": 0.8405758310660678, "learning_rate": 7.567567567567568e-05, "loss": 0.4708, "step": 140 }, { "epoch": 0.4750473783954517, "grad_norm": 0.673259823629261, "learning_rate": 7.621621621621623e-05, "loss": 0.4709, "step": 141 }, { "epoch": 0.4784165087386818, "grad_norm": 0.9085593163135716, "learning_rate": 7.675675675675675e-05, "loss": 0.4658, "step": 142 }, { "epoch": 0.48178563908191196, "grad_norm": 0.9065492978219933, "learning_rate": 7.729729729729731e-05, "loss": 0.4661, "step": 143 }, { "epoch": 0.48515476942514213, "grad_norm": 0.8751584723634406, "learning_rate": 7.783783783783785e-05, "loss": 0.4725, "step": 144 }, { "epoch": 0.4885238997683723, "grad_norm": 0.6907562652250656, "learning_rate": 7.837837837837838e-05, "loss": 0.4684, "step": 145 }, { "epoch": 0.49189303011160246, "grad_norm": 0.5990641326148477, "learning_rate": 7.891891891891892e-05, "loss": 0.4672, "step": 146 }, { "epoch": 0.4952621604548326, "grad_norm": 0.6994191437855128, "learning_rate": 7.945945945945946e-05, "loss": 0.4662, "step": 147 }, { "epoch": 0.49863129079806273, "grad_norm": 0.5573598940486624, "learning_rate": 8e-05, "loss": 0.4665, "step": 148 }, { "epoch": 0.502000421141293, "grad_norm": 0.6145912929008095, "learning_rate": 7.999988874460243e-05, "loss": 0.4669, "step": 149 }, { "epoch": 0.505369551484523, "grad_norm": 0.8011807879753905, "learning_rate": 7.999955497902857e-05, "loss": 0.4669, "step": 150 }, { "epoch": 0.5087386818277532, "grad_norm": 0.8616234404683479, "learning_rate": 7.99989987051351e-05, "loss": 0.4721, "step": 151 }, { "epoch": 0.5121078121709833, "grad_norm": 0.7813814403741567, "learning_rate": 7.999821992601645e-05, "loss": 0.4753, "step": 152 }, { "epoch": 0.5154769425142135, "grad_norm": 0.8208221005516424, "learning_rate": 7.999721864600476e-05, "loss": 0.4648, "step": 153 }, { "epoch": 0.5188460728574437, "grad_norm": 0.7471274236204338, "learning_rate": 7.999599487066996e-05, "loss": 0.4665, "step": 154 }, { "epoch": 0.5222152032006738, "grad_norm": 0.6025705010343646, "learning_rate": 7.999454860681961e-05, "loss": 0.4646, "step": 155 }, { "epoch": 0.525584333543904, "grad_norm": 0.6278670733672859, "learning_rate": 7.999287986249894e-05, "loss": 0.4582, "step": 156 }, { "epoch": 0.5289534638871342, "grad_norm": 0.7363381482182718, "learning_rate": 7.999098864699078e-05, "loss": 0.4644, "step": 157 }, { "epoch": 0.5323225942303643, "grad_norm": 0.5321478176964342, "learning_rate": 7.998887497081555e-05, "loss": 0.4558, "step": 158 }, { "epoch": 0.5356917245735945, "grad_norm": 0.5084748356369074, "learning_rate": 7.998653884573114e-05, "loss": 0.4576, "step": 159 }, { "epoch": 0.5390608549168246, "grad_norm": 0.44466968144745794, "learning_rate": 7.998398028473287e-05, "loss": 0.4628, "step": 160 }, { "epoch": 0.5424299852600547, "grad_norm": 0.5300560878644925, "learning_rate": 7.998119930205342e-05, "loss": 0.4587, "step": 161 }, { "epoch": 0.5457991156032849, "grad_norm": 0.4482671223105369, "learning_rate": 7.997819591316278e-05, "loss": 0.4595, "step": 162 }, { "epoch": 0.549168245946515, "grad_norm": 0.3831134887002804, "learning_rate": 7.997497013476808e-05, "loss": 0.4621, "step": 163 }, { "epoch": 0.5525373762897452, "grad_norm": 0.42236120459010645, "learning_rate": 7.99715219848136e-05, "loss": 0.4574, "step": 164 }, { "epoch": 0.5559065066329754, "grad_norm": 0.4457523958461928, "learning_rate": 7.996785148248062e-05, "loss": 0.4597, "step": 165 }, { "epoch": 0.5592756369762055, "grad_norm": 0.5006917210647484, "learning_rate": 7.996395864818727e-05, "loss": 0.4594, "step": 166 }, { "epoch": 0.5626447673194357, "grad_norm": 0.5999241658726214, "learning_rate": 7.995984350358851e-05, "loss": 0.4578, "step": 167 }, { "epoch": 0.5660138976626659, "grad_norm": 0.7291489485043735, "learning_rate": 7.995550607157592e-05, "loss": 0.4538, "step": 168 }, { "epoch": 0.569383028005896, "grad_norm": 0.7577464603442905, "learning_rate": 7.995094637627767e-05, "loss": 0.4507, "step": 169 }, { "epoch": 0.5727521583491262, "grad_norm": 0.5373115669466836, "learning_rate": 7.994616444305826e-05, "loss": 0.4602, "step": 170 }, { "epoch": 0.5761212886923562, "grad_norm": 0.5783937366804819, "learning_rate": 7.994116029851852e-05, "loss": 0.4621, "step": 171 }, { "epoch": 0.5794904190355864, "grad_norm": 0.7289647138839453, "learning_rate": 7.993593397049533e-05, "loss": 0.4569, "step": 172 }, { "epoch": 0.5828595493788166, "grad_norm": 0.7726864760162053, "learning_rate": 7.993048548806155e-05, "loss": 0.4609, "step": 173 }, { "epoch": 0.5862286797220467, "grad_norm": 0.7101749816908381, "learning_rate": 7.992481488152585e-05, "loss": 0.4628, "step": 174 }, { "epoch": 0.5895978100652769, "grad_norm": 0.7787526674806393, "learning_rate": 7.991892218243251e-05, "loss": 0.4664, "step": 175 }, { "epoch": 0.5929669404085071, "grad_norm": 0.9193285112654672, "learning_rate": 7.991280742356124e-05, "loss": 0.4583, "step": 176 }, { "epoch": 0.5963360707517372, "grad_norm": 0.863766394540256, "learning_rate": 7.990647063892704e-05, "loss": 0.4532, "step": 177 }, { "epoch": 0.5997052010949674, "grad_norm": 0.7969950754484971, "learning_rate": 7.989991186378e-05, "loss": 0.4649, "step": 178 }, { "epoch": 0.6030743314381976, "grad_norm": 0.9175228695778532, "learning_rate": 7.989313113460506e-05, "loss": 0.4598, "step": 179 }, { "epoch": 0.6064434617814277, "grad_norm": 1.189324294096932, "learning_rate": 7.988612848912186e-05, "loss": 0.4616, "step": 180 }, { "epoch": 0.6098125921246578, "grad_norm": 0.5502633850375939, "learning_rate": 7.987890396628451e-05, "loss": 0.4506, "step": 181 }, { "epoch": 0.6131817224678879, "grad_norm": 0.6418325300837303, "learning_rate": 7.987145760628138e-05, "loss": 0.4589, "step": 182 }, { "epoch": 0.6165508528111181, "grad_norm": 0.8487957991579048, "learning_rate": 7.986378945053483e-05, "loss": 0.4534, "step": 183 }, { "epoch": 0.6199199831543483, "grad_norm": 0.729090543693198, "learning_rate": 7.985589954170107e-05, "loss": 0.4502, "step": 184 }, { "epoch": 0.6232891134975784, "grad_norm": 0.564140229622775, "learning_rate": 7.984778792366983e-05, "loss": 0.4561, "step": 185 }, { "epoch": 0.6266582438408086, "grad_norm": 0.5489014465662102, "learning_rate": 7.983945464156419e-05, "loss": 0.4511, "step": 186 }, { "epoch": 0.6300273741840388, "grad_norm": 0.4439092473485429, "learning_rate": 7.983089974174026e-05, "loss": 0.4592, "step": 187 }, { "epoch": 0.6333965045272689, "grad_norm": 0.4899343556871492, "learning_rate": 7.982212327178699e-05, "loss": 0.4576, "step": 188 }, { "epoch": 0.6367656348704991, "grad_norm": 0.4429930723656228, "learning_rate": 7.981312528052587e-05, "loss": 0.4527, "step": 189 }, { "epoch": 0.6401347652137293, "grad_norm": 0.3517045134537643, "learning_rate": 7.980390581801064e-05, "loss": 0.4533, "step": 190 }, { "epoch": 0.6435038955569593, "grad_norm": 0.35596532078238685, "learning_rate": 7.979446493552708e-05, "loss": 0.4512, "step": 191 }, { "epoch": 0.6468730259001895, "grad_norm": 0.4117813215790183, "learning_rate": 7.97848026855926e-05, "loss": 0.4427, "step": 192 }, { "epoch": 0.6502421562434196, "grad_norm": 0.42570694906406503, "learning_rate": 7.977491912195611e-05, "loss": 0.4559, "step": 193 }, { "epoch": 0.6536112865866498, "grad_norm": 0.32926038316817535, "learning_rate": 7.976481429959758e-05, "loss": 0.4525, "step": 194 }, { "epoch": 0.65698041692988, "grad_norm": 0.3352588049162969, "learning_rate": 7.975448827472782e-05, "loss": 0.4465, "step": 195 }, { "epoch": 0.6603495472731101, "grad_norm": 0.3121745237951815, "learning_rate": 7.974394110478813e-05, "loss": 0.4504, "step": 196 }, { "epoch": 0.6637186776163403, "grad_norm": 0.3514443346936628, "learning_rate": 7.973317284844998e-05, "loss": 0.4543, "step": 197 }, { "epoch": 0.6670878079595705, "grad_norm": 0.36563500765518064, "learning_rate": 7.972218356561471e-05, "loss": 0.4466, "step": 198 }, { "epoch": 0.6704569383028006, "grad_norm": 0.36993328537084813, "learning_rate": 7.971097331741318e-05, "loss": 0.447, "step": 199 }, { "epoch": 0.6738260686460308, "grad_norm": 0.4218574088374599, "learning_rate": 7.96995421662054e-05, "loss": 0.4456, "step": 200 }, { "epoch": 0.677195198989261, "grad_norm": 0.5127127798248658, "learning_rate": 7.968789017558026e-05, "loss": 0.4367, "step": 201 }, { "epoch": 0.680564329332491, "grad_norm": 0.5533862982628416, "learning_rate": 7.967601741035507e-05, "loss": 0.4464, "step": 202 }, { "epoch": 0.6839334596757212, "grad_norm": 0.5128169904646379, "learning_rate": 7.966392393657533e-05, "loss": 0.4493, "step": 203 }, { "epoch": 0.6873025900189513, "grad_norm": 0.47256773564418525, "learning_rate": 7.965160982151422e-05, "loss": 0.4536, "step": 204 }, { "epoch": 0.6906717203621815, "grad_norm": 0.452879409095403, "learning_rate": 7.963907513367234e-05, "loss": 0.4589, "step": 205 }, { "epoch": 0.6940408507054117, "grad_norm": 0.455219584683228, "learning_rate": 7.962631994277728e-05, "loss": 0.4414, "step": 206 }, { "epoch": 0.6974099810486418, "grad_norm": 0.47863589957769587, "learning_rate": 7.961334431978321e-05, "loss": 0.4486, "step": 207 }, { "epoch": 0.700779111391872, "grad_norm": 0.5110385780704738, "learning_rate": 7.960014833687055e-05, "loss": 0.4495, "step": 208 }, { "epoch": 0.7041482417351022, "grad_norm": 0.4683257451933529, "learning_rate": 7.958673206744553e-05, "loss": 0.4522, "step": 209 }, { "epoch": 0.7075173720783323, "grad_norm": 0.4506553993940309, "learning_rate": 7.957309558613974e-05, "loss": 0.4452, "step": 210 }, { "epoch": 0.7108865024215625, "grad_norm": 0.4526028368594711, "learning_rate": 7.955923896880982e-05, "loss": 0.4456, "step": 211 }, { "epoch": 0.7142556327647925, "grad_norm": 0.5212859488073646, "learning_rate": 7.954516229253691e-05, "loss": 0.4482, "step": 212 }, { "epoch": 0.7176247631080227, "grad_norm": 0.4908480827080424, "learning_rate": 7.953086563562635e-05, "loss": 0.4404, "step": 213 }, { "epoch": 0.7209938934512529, "grad_norm": 0.43474906852801837, "learning_rate": 7.951634907760713e-05, "loss": 0.4415, "step": 214 }, { "epoch": 0.724363023794483, "grad_norm": 0.5465543422325746, "learning_rate": 7.950161269923153e-05, "loss": 0.453, "step": 215 }, { "epoch": 0.7277321541377132, "grad_norm": 0.5191090578880476, "learning_rate": 7.948665658247463e-05, "loss": 0.4511, "step": 216 }, { "epoch": 0.7311012844809434, "grad_norm": 0.41056922017028197, "learning_rate": 7.947148081053388e-05, "loss": 0.4428, "step": 217 }, { "epoch": 0.7344704148241735, "grad_norm": 0.4280367756173325, "learning_rate": 7.945608546782858e-05, "loss": 0.4552, "step": 218 }, { "epoch": 0.7378395451674037, "grad_norm": 0.44143498781875934, "learning_rate": 7.944047063999952e-05, "loss": 0.4461, "step": 219 }, { "epoch": 0.7412086755106339, "grad_norm": 0.4671020826488003, "learning_rate": 7.942463641390834e-05, "loss": 0.433, "step": 220 }, { "epoch": 0.744577805853864, "grad_norm": 0.4802991806753108, "learning_rate": 7.940858287763724e-05, "loss": 0.4487, "step": 221 }, { "epoch": 0.7479469361970941, "grad_norm": 0.4271821601132076, "learning_rate": 7.939231012048833e-05, "loss": 0.4509, "step": 222 }, { "epoch": 0.7513160665403242, "grad_norm": 0.38123610223687315, "learning_rate": 7.93758182329832e-05, "loss": 0.4372, "step": 223 }, { "epoch": 0.7546851968835544, "grad_norm": 0.465830487650423, "learning_rate": 7.935910730686246e-05, "loss": 0.4444, "step": 224 }, { "epoch": 0.7580543272267846, "grad_norm": 0.5651393352119582, "learning_rate": 7.934217743508513e-05, "loss": 0.4468, "step": 225 }, { "epoch": 0.7614234575700147, "grad_norm": 0.6526912705793722, "learning_rate": 7.932502871182818e-05, "loss": 0.4509, "step": 226 }, { "epoch": 0.7647925879132449, "grad_norm": 0.7684525411435036, "learning_rate": 7.930766123248602e-05, "loss": 0.4475, "step": 227 }, { "epoch": 0.7681617182564751, "grad_norm": 0.8868257582573387, "learning_rate": 7.929007509366994e-05, "loss": 0.4486, "step": 228 }, { "epoch": 0.7715308485997052, "grad_norm": 0.9592751619745519, "learning_rate": 7.927227039320758e-05, "loss": 0.442, "step": 229 }, { "epoch": 0.7748999789429354, "grad_norm": 0.8928159966805775, "learning_rate": 7.925424723014239e-05, "loss": 0.4541, "step": 230 }, { "epoch": 0.7782691092861656, "grad_norm": 0.7880900131568054, "learning_rate": 7.923600570473308e-05, "loss": 0.4514, "step": 231 }, { "epoch": 0.7816382396293957, "grad_norm": 0.4783123604515285, "learning_rate": 7.921754591845307e-05, "loss": 0.4442, "step": 232 }, { "epoch": 0.7850073699726258, "grad_norm": 0.4520386015737669, "learning_rate": 7.91988679739899e-05, "loss": 0.448, "step": 233 }, { "epoch": 0.7883765003158559, "grad_norm": 0.6605527609379506, "learning_rate": 7.917997197524467e-05, "loss": 0.4435, "step": 234 }, { "epoch": 0.7917456306590861, "grad_norm": 0.7089385732745206, "learning_rate": 7.916085802733147e-05, "loss": 0.4449, "step": 235 }, { "epoch": 0.7951147610023163, "grad_norm": 0.5904512970852802, "learning_rate": 7.914152623657678e-05, "loss": 0.448, "step": 236 }, { "epoch": 0.7984838913455464, "grad_norm": 0.5165195483185807, "learning_rate": 7.912197671051894e-05, "loss": 0.4475, "step": 237 }, { "epoch": 0.8018530216887766, "grad_norm": 0.47278629514591364, "learning_rate": 7.910220955790746e-05, "loss": 0.447, "step": 238 }, { "epoch": 0.8052221520320068, "grad_norm": 0.4466680465677497, "learning_rate": 7.908222488870243e-05, "loss": 0.4471, "step": 239 }, { "epoch": 0.8085912823752369, "grad_norm": 0.40052321749076436, "learning_rate": 7.906202281407398e-05, "loss": 0.4453, "step": 240 }, { "epoch": 0.8119604127184671, "grad_norm": 0.3808574042244712, "learning_rate": 7.90416034464016e-05, "loss": 0.4467, "step": 241 }, { "epoch": 0.8153295430616972, "grad_norm": 0.3009379630644614, "learning_rate": 7.902096689927355e-05, "loss": 0.4405, "step": 242 }, { "epoch": 0.8186986734049273, "grad_norm": 0.4006333439696202, "learning_rate": 7.900011328748619e-05, "loss": 0.441, "step": 243 }, { "epoch": 0.8220678037481575, "grad_norm": 0.36250537572683333, "learning_rate": 7.897904272704333e-05, "loss": 0.4382, "step": 244 }, { "epoch": 0.8254369340913876, "grad_norm": 0.37232144501481734, "learning_rate": 7.895775533515569e-05, "loss": 0.4455, "step": 245 }, { "epoch": 0.8288060644346178, "grad_norm": 0.4169869556836039, "learning_rate": 7.893625123024011e-05, "loss": 0.4356, "step": 246 }, { "epoch": 0.832175194777848, "grad_norm": 0.3864353557408192, "learning_rate": 7.891453053191898e-05, "loss": 0.4435, "step": 247 }, { "epoch": 0.8355443251210781, "grad_norm": 0.3608352846793135, "learning_rate": 7.889259336101957e-05, "loss": 0.4462, "step": 248 }, { "epoch": 0.8389134554643083, "grad_norm": 0.32373631118958723, "learning_rate": 7.887043983957327e-05, "loss": 0.4375, "step": 249 }, { "epoch": 0.8422825858075385, "grad_norm": 0.26424914090383317, "learning_rate": 7.884807009081506e-05, "loss": 0.4375, "step": 250 }, { "epoch": 0.8456517161507686, "grad_norm": 0.22444081020907958, "learning_rate": 7.882548423918268e-05, "loss": 0.4413, "step": 251 }, { "epoch": 0.8490208464939988, "grad_norm": 0.26045857383329957, "learning_rate": 7.880268241031604e-05, "loss": 0.4317, "step": 252 }, { "epoch": 0.8523899768372288, "grad_norm": 0.30550339254012787, "learning_rate": 7.877966473105645e-05, "loss": 0.4458, "step": 253 }, { "epoch": 0.855759107180459, "grad_norm": 0.34559528308231324, "learning_rate": 7.875643132944599e-05, "loss": 0.4403, "step": 254 }, { "epoch": 0.8591282375236892, "grad_norm": 0.35710994685108394, "learning_rate": 7.873298233472671e-05, "loss": 0.4394, "step": 255 }, { "epoch": 0.8624973678669193, "grad_norm": 0.42956681122910056, "learning_rate": 7.870931787733996e-05, "loss": 0.4403, "step": 256 }, { "epoch": 0.8658664982101495, "grad_norm": 0.5626197718228877, "learning_rate": 7.868543808892569e-05, "loss": 0.4387, "step": 257 }, { "epoch": 0.8692356285533797, "grad_norm": 0.6076789146858117, "learning_rate": 7.866134310232167e-05, "loss": 0.4439, "step": 258 }, { "epoch": 0.8726047588966098, "grad_norm": 0.5742280027785791, "learning_rate": 7.863703305156273e-05, "loss": 0.4455, "step": 259 }, { "epoch": 0.87597388923984, "grad_norm": 0.5069317059933754, "learning_rate": 7.861250807188014e-05, "loss": 0.4476, "step": 260 }, { "epoch": 0.8793430195830702, "grad_norm": 0.4288223928021788, "learning_rate": 7.858776829970069e-05, "loss": 0.4379, "step": 261 }, { "epoch": 0.8827121499263003, "grad_norm": 0.5442592728854474, "learning_rate": 7.856281387264603e-05, "loss": 0.4379, "step": 262 }, { "epoch": 0.8860812802695305, "grad_norm": 0.5638482346313414, "learning_rate": 7.853764492953192e-05, "loss": 0.4444, "step": 263 }, { "epoch": 0.8894504106127605, "grad_norm": 0.4523819114426828, "learning_rate": 7.851226161036739e-05, "loss": 0.4394, "step": 264 }, { "epoch": 0.8928195409559907, "grad_norm": 0.5349306408767115, "learning_rate": 7.848666405635398e-05, "loss": 0.441, "step": 265 }, { "epoch": 0.8961886712992209, "grad_norm": 0.5452142089194884, "learning_rate": 7.846085240988503e-05, "loss": 0.4483, "step": 266 }, { "epoch": 0.899557801642451, "grad_norm": 0.4222443920522887, "learning_rate": 7.843482681454476e-05, "loss": 0.4407, "step": 267 }, { "epoch": 0.9029269319856812, "grad_norm": 0.5310106072977896, "learning_rate": 7.840858741510758e-05, "loss": 0.4442, "step": 268 }, { "epoch": 0.9062960623289114, "grad_norm": 0.5876077411696179, "learning_rate": 7.838213435753724e-05, "loss": 0.4438, "step": 269 }, { "epoch": 0.9096651926721415, "grad_norm": 0.6100738415200538, "learning_rate": 7.835546778898599e-05, "loss": 0.4465, "step": 270 }, { "epoch": 0.9130343230153717, "grad_norm": 0.6760561504138676, "learning_rate": 7.832858785779383e-05, "loss": 0.4338, "step": 271 }, { "epoch": 0.9164034533586018, "grad_norm": 0.45392830007094576, "learning_rate": 7.830149471348763e-05, "loss": 0.431, "step": 272 }, { "epoch": 0.919772583701832, "grad_norm": 0.30596440551036547, "learning_rate": 7.827418850678034e-05, "loss": 0.4396, "step": 273 }, { "epoch": 0.9231417140450621, "grad_norm": 0.4969999175377505, "learning_rate": 7.824666938957004e-05, "loss": 0.4375, "step": 274 }, { "epoch": 0.9265108443882922, "grad_norm": 0.5437640388773309, "learning_rate": 7.82189375149393e-05, "loss": 0.444, "step": 275 }, { "epoch": 0.9298799747315224, "grad_norm": 0.4134501055661062, "learning_rate": 7.819099303715414e-05, "loss": 0.4385, "step": 276 }, { "epoch": 0.9332491050747526, "grad_norm": 0.3810051790575615, "learning_rate": 7.816283611166328e-05, "loss": 0.4339, "step": 277 }, { "epoch": 0.9366182354179827, "grad_norm": 0.4135193612689647, "learning_rate": 7.813446689509718e-05, "loss": 0.4413, "step": 278 }, { "epoch": 0.9399873657612129, "grad_norm": 0.5154216890519913, "learning_rate": 7.810588554526728e-05, "loss": 0.4409, "step": 279 }, { "epoch": 0.943356496104443, "grad_norm": 0.5335234306967277, "learning_rate": 7.807709222116506e-05, "loss": 0.4392, "step": 280 }, { "epoch": 0.9467256264476732, "grad_norm": 0.4582890089443176, "learning_rate": 7.804808708296116e-05, "loss": 0.44, "step": 281 }, { "epoch": 0.9500947567909034, "grad_norm": 0.41636142631229706, "learning_rate": 7.801887029200448e-05, "loss": 0.4359, "step": 282 }, { "epoch": 0.9534638871341335, "grad_norm": 0.3777680522962764, "learning_rate": 7.798944201082128e-05, "loss": 0.4305, "step": 283 }, { "epoch": 0.9568330174773636, "grad_norm": 0.31197040692277506, "learning_rate": 7.795980240311436e-05, "loss": 0.4378, "step": 284 }, { "epoch": 0.9602021478205938, "grad_norm": 0.2615719658181643, "learning_rate": 7.7929951633762e-05, "loss": 0.4349, "step": 285 }, { "epoch": 0.9635712781638239, "grad_norm": 0.27255928093352183, "learning_rate": 7.789988986881719e-05, "loss": 0.4324, "step": 286 }, { "epoch": 0.9669404085070541, "grad_norm": 0.3086259327892651, "learning_rate": 7.78696172755066e-05, "loss": 0.4338, "step": 287 }, { "epoch": 0.9703095388502843, "grad_norm": 0.3128738492807504, "learning_rate": 7.78391340222297e-05, "loss": 0.4327, "step": 288 }, { "epoch": 0.9736786691935144, "grad_norm": 0.28991557468061835, "learning_rate": 7.78084402785578e-05, "loss": 0.4368, "step": 289 }, { "epoch": 0.9770477995367446, "grad_norm": 0.3462635013389902, "learning_rate": 7.777753621523316e-05, "loss": 0.4376, "step": 290 }, { "epoch": 0.9804169298799748, "grad_norm": 0.41703460759212563, "learning_rate": 7.774642200416795e-05, "loss": 0.4364, "step": 291 }, { "epoch": 0.9837860602232049, "grad_norm": 0.5058437435233563, "learning_rate": 7.771509781844338e-05, "loss": 0.4392, "step": 292 }, { "epoch": 0.9871551905664351, "grad_norm": 0.49478795685868665, "learning_rate": 7.768356383230868e-05, "loss": 0.4387, "step": 293 }, { "epoch": 0.9905243209096652, "grad_norm": 0.4745986454402833, "learning_rate": 7.765182022118014e-05, "loss": 0.435, "step": 294 }, { "epoch": 0.9938934512528953, "grad_norm": 0.4611674206006931, "learning_rate": 7.761986716164019e-05, "loss": 0.4379, "step": 295 }, { "epoch": 0.9972625815961255, "grad_norm": 0.42674160555276347, "learning_rate": 7.758770483143634e-05, "loss": 0.4408, "step": 296 }, { "epoch": 1.0033691303432302, "grad_norm": 0.4680506210026581, "learning_rate": 7.755533340948024e-05, "loss": 0.4223, "step": 297 }, { "epoch": 1.0067382606864603, "grad_norm": 0.5238195474514908, "learning_rate": 7.752275307584664e-05, "loss": 0.4295, "step": 298 }, { "epoch": 1.0101073910296905, "grad_norm": 0.5889650401759404, "learning_rate": 7.748996401177244e-05, "loss": 0.4275, "step": 299 }, { "epoch": 1.0134765213729207, "grad_norm": 0.5507636965946558, "learning_rate": 7.745696639965569e-05, "loss": 0.4194, "step": 300 }, { "epoch": 1.0168456517161508, "grad_norm": 0.6157938140990948, "learning_rate": 7.742376042305449e-05, "loss": 0.433, "step": 301 }, { "epoch": 1.020214782059381, "grad_norm": 0.6252887191974348, "learning_rate": 7.739034626668605e-05, "loss": 0.4262, "step": 302 }, { "epoch": 1.0235839124026112, "grad_norm": 0.5880567265579987, "learning_rate": 7.735672411642562e-05, "loss": 0.4233, "step": 303 }, { "epoch": 1.0269530427458413, "grad_norm": 0.5726576390809455, "learning_rate": 7.732289415930549e-05, "loss": 0.424, "step": 304 }, { "epoch": 1.0303221730890715, "grad_norm": 0.47054865900441445, "learning_rate": 7.728885658351395e-05, "loss": 0.4176, "step": 305 }, { "epoch": 1.0336913034323016, "grad_norm": 0.4115743947585953, "learning_rate": 7.725461157839417e-05, "loss": 0.4292, "step": 306 }, { "epoch": 1.0370604337755316, "grad_norm": 0.4210186740776416, "learning_rate": 7.722015933444325e-05, "loss": 0.4247, "step": 307 }, { "epoch": 1.0404295641187618, "grad_norm": 0.36198066116445515, "learning_rate": 7.71855000433111e-05, "loss": 0.4193, "step": 308 }, { "epoch": 1.043798694461992, "grad_norm": 0.43778422486602975, "learning_rate": 7.715063389779936e-05, "loss": 0.4238, "step": 309 }, { "epoch": 1.047167824805222, "grad_norm": 0.45920696669429717, "learning_rate": 7.711556109186039e-05, "loss": 0.4237, "step": 310 }, { "epoch": 1.0505369551484522, "grad_norm": 0.3341781881526272, "learning_rate": 7.708028182059612e-05, "loss": 0.4239, "step": 311 }, { "epoch": 1.0539060854916824, "grad_norm": 0.3082296798332506, "learning_rate": 7.704479628025704e-05, "loss": 0.4167, "step": 312 }, { "epoch": 1.0572752158349126, "grad_norm": 0.33284929605340835, "learning_rate": 7.700910466824104e-05, "loss": 0.4233, "step": 313 }, { "epoch": 1.0606443461781427, "grad_norm": 0.333894193250551, "learning_rate": 7.697320718309235e-05, "loss": 0.4177, "step": 314 }, { "epoch": 1.064013476521373, "grad_norm": 0.39528163268670363, "learning_rate": 7.69371040245004e-05, "loss": 0.4188, "step": 315 }, { "epoch": 1.067382606864603, "grad_norm": 0.28394188370498197, "learning_rate": 7.690079539329875e-05, "loss": 0.4129, "step": 316 }, { "epoch": 1.0707517372078332, "grad_norm": 0.2953142618107928, "learning_rate": 7.686428149146398e-05, "loss": 0.4188, "step": 317 }, { "epoch": 1.0741208675510634, "grad_norm": 0.2804966323905774, "learning_rate": 7.682756252211453e-05, "loss": 0.4171, "step": 318 }, { "epoch": 1.0774899978942936, "grad_norm": 0.2510744302434169, "learning_rate": 7.679063868950955e-05, "loss": 0.4182, "step": 319 }, { "epoch": 1.0808591282375237, "grad_norm": 0.307553691452299, "learning_rate": 7.675351019904785e-05, "loss": 0.4177, "step": 320 }, { "epoch": 1.084228258580754, "grad_norm": 0.30605445723544605, "learning_rate": 7.671617725726666e-05, "loss": 0.4158, "step": 321 }, { "epoch": 1.087597388923984, "grad_norm": 0.3011711568174157, "learning_rate": 7.667864007184054e-05, "loss": 0.4141, "step": 322 }, { "epoch": 1.0909665192672142, "grad_norm": 0.286345264353555, "learning_rate": 7.664089885158023e-05, "loss": 0.4187, "step": 323 }, { "epoch": 1.0943356496104444, "grad_norm": 0.3592297464995333, "learning_rate": 7.660295380643144e-05, "loss": 0.4175, "step": 324 }, { "epoch": 1.0977047799536745, "grad_norm": 0.4650620300291997, "learning_rate": 7.656480514747374e-05, "loss": 0.4258, "step": 325 }, { "epoch": 1.1010739102969047, "grad_norm": 0.6004744804102117, "learning_rate": 7.652645308691933e-05, "loss": 0.419, "step": 326 }, { "epoch": 1.1044430406401347, "grad_norm": 0.6574099091252659, "learning_rate": 7.648789783811191e-05, "loss": 0.4217, "step": 327 }, { "epoch": 1.1078121709833648, "grad_norm": 0.6456572845078092, "learning_rate": 7.644913961552544e-05, "loss": 0.4207, "step": 328 }, { "epoch": 1.111181301326595, "grad_norm": 0.5305128144292774, "learning_rate": 7.641017863476298e-05, "loss": 0.4215, "step": 329 }, { "epoch": 1.1145504316698251, "grad_norm": 0.3067892240749629, "learning_rate": 7.637101511255554e-05, "loss": 0.4127, "step": 330 }, { "epoch": 1.1179195620130553, "grad_norm": 0.3619393126280428, "learning_rate": 7.633164926676076e-05, "loss": 0.4144, "step": 331 }, { "epoch": 1.1212886923562855, "grad_norm": 0.5099026443948101, "learning_rate": 7.629208131636179e-05, "loss": 0.4247, "step": 332 }, { "epoch": 1.1246578226995156, "grad_norm": 0.4600109550186491, "learning_rate": 7.625231148146601e-05, "loss": 0.4277, "step": 333 }, { "epoch": 1.1280269530427458, "grad_norm": 0.34171771897702874, "learning_rate": 7.621233998330387e-05, "loss": 0.4111, "step": 334 }, { "epoch": 1.131396083385976, "grad_norm": 0.3788094599826585, "learning_rate": 7.617216704422763e-05, "loss": 0.4238, "step": 335 }, { "epoch": 1.1347652137292061, "grad_norm": 0.38930545757784435, "learning_rate": 7.61317928877101e-05, "loss": 0.4266, "step": 336 }, { "epoch": 1.1381343440724363, "grad_norm": 0.3433858843038104, "learning_rate": 7.609121773834341e-05, "loss": 0.4113, "step": 337 }, { "epoch": 1.1415034744156665, "grad_norm": 0.3664698026597823, "learning_rate": 7.605044182183779e-05, "loss": 0.4215, "step": 338 }, { "epoch": 1.1448726047588966, "grad_norm": 0.3669871707797489, "learning_rate": 7.600946536502028e-05, "loss": 0.4187, "step": 339 }, { "epoch": 1.1482417351021268, "grad_norm": 0.36360389026471707, "learning_rate": 7.596828859583347e-05, "loss": 0.4179, "step": 340 }, { "epoch": 1.151610865445357, "grad_norm": 0.34303435626525425, "learning_rate": 7.592691174333426e-05, "loss": 0.4166, "step": 341 }, { "epoch": 1.1549799957885871, "grad_norm": 0.37256557828106257, "learning_rate": 7.588533503769257e-05, "loss": 0.4181, "step": 342 }, { "epoch": 1.1583491261318173, "grad_norm": 0.41467336010702505, "learning_rate": 7.584355871019002e-05, "loss": 0.4195, "step": 343 }, { "epoch": 1.1617182564750475, "grad_norm": 0.37682711741626357, "learning_rate": 7.580158299321872e-05, "loss": 0.4226, "step": 344 }, { "epoch": 1.1650873868182776, "grad_norm": 0.2646890963052802, "learning_rate": 7.575940812027993e-05, "loss": 0.4094, "step": 345 }, { "epoch": 1.1684565171615078, "grad_norm": 0.23766486308489482, "learning_rate": 7.571703432598275e-05, "loss": 0.42, "step": 346 }, { "epoch": 1.171825647504738, "grad_norm": 0.23844909696838593, "learning_rate": 7.567446184604285e-05, "loss": 0.4189, "step": 347 }, { "epoch": 1.175194777847968, "grad_norm": 0.23287909504956197, "learning_rate": 7.563169091728115e-05, "loss": 0.4123, "step": 348 }, { "epoch": 1.178563908191198, "grad_norm": 0.21692865660818864, "learning_rate": 7.558872177762246e-05, "loss": 0.4193, "step": 349 }, { "epoch": 1.1819330385344282, "grad_norm": 0.2191653011493883, "learning_rate": 7.554555466609425e-05, "loss": 0.4271, "step": 350 }, { "epoch": 1.1853021688776584, "grad_norm": 0.23748843252543808, "learning_rate": 7.550218982282518e-05, "loss": 0.4196, "step": 351 }, { "epoch": 1.1886712992208885, "grad_norm": 0.24616974908904704, "learning_rate": 7.545862748904394e-05, "loss": 0.4146, "step": 352 }, { "epoch": 1.1920404295641187, "grad_norm": 0.25196821392623664, "learning_rate": 7.541486790707776e-05, "loss": 0.4266, "step": 353 }, { "epoch": 1.1954095599073489, "grad_norm": 0.2470207075626988, "learning_rate": 7.537091132035111e-05, "loss": 0.4148, "step": 354 }, { "epoch": 1.198778690250579, "grad_norm": 0.2314470630644042, "learning_rate": 7.532675797338438e-05, "loss": 0.4033, "step": 355 }, { "epoch": 1.2021478205938092, "grad_norm": 0.23746828407475515, "learning_rate": 7.528240811179245e-05, "loss": 0.4203, "step": 356 }, { "epoch": 1.2055169509370394, "grad_norm": 0.28754749236703137, "learning_rate": 7.523786198228344e-05, "loss": 0.4182, "step": 357 }, { "epoch": 1.2088860812802695, "grad_norm": 0.3151739472415091, "learning_rate": 7.519311983265718e-05, "loss": 0.4222, "step": 358 }, { "epoch": 1.2122552116234997, "grad_norm": 0.34818120293706006, "learning_rate": 7.514818191180397e-05, "loss": 0.4162, "step": 359 }, { "epoch": 1.2156243419667299, "grad_norm": 0.39359185740609565, "learning_rate": 7.510304846970311e-05, "loss": 0.4179, "step": 360 }, { "epoch": 1.21899347230996, "grad_norm": 0.49169697889587877, "learning_rate": 7.505771975742157e-05, "loss": 0.42, "step": 361 }, { "epoch": 1.2223626026531902, "grad_norm": 0.6588501716182329, "learning_rate": 7.501219602711253e-05, "loss": 0.4207, "step": 362 }, { "epoch": 1.2257317329964204, "grad_norm": 0.6704211038154936, "learning_rate": 7.496647753201403e-05, "loss": 0.419, "step": 363 }, { "epoch": 1.2291008633396505, "grad_norm": 0.5759654898267196, "learning_rate": 7.492056452644753e-05, "loss": 0.418, "step": 364 }, { "epoch": 1.2324699936828807, "grad_norm": 0.46697984648682656, "learning_rate": 7.487445726581654e-05, "loss": 0.4202, "step": 365 }, { "epoch": 1.2358391240261108, "grad_norm": 0.4072897949316555, "learning_rate": 7.48281560066051e-05, "loss": 0.416, "step": 366 }, { "epoch": 1.2392082543693408, "grad_norm": 0.3677992679979327, "learning_rate": 7.47816610063765e-05, "loss": 0.4184, "step": 367 }, { "epoch": 1.242577384712571, "grad_norm": 0.39067043330522583, "learning_rate": 7.473497252377171e-05, "loss": 0.4246, "step": 368 }, { "epoch": 1.2459465150558011, "grad_norm": 0.4371263217453357, "learning_rate": 7.468809081850802e-05, "loss": 0.4154, "step": 369 }, { "epoch": 1.2493156453990313, "grad_norm": 0.466667275005644, "learning_rate": 7.464101615137756e-05, "loss": 0.4221, "step": 370 }, { "epoch": 1.2526847757422614, "grad_norm": 0.40517656554168363, "learning_rate": 7.459374878424585e-05, "loss": 0.4149, "step": 371 }, { "epoch": 1.2560539060854916, "grad_norm": 0.318301658777578, "learning_rate": 7.454628898005043e-05, "loss": 0.4117, "step": 372 }, { "epoch": 1.2594230364287218, "grad_norm": 0.27016373739597804, "learning_rate": 7.449863700279923e-05, "loss": 0.4151, "step": 373 }, { "epoch": 1.262792166771952, "grad_norm": 0.27155592369639425, "learning_rate": 7.445079311756924e-05, "loss": 0.4121, "step": 374 }, { "epoch": 1.266161297115182, "grad_norm": 0.3009069725735746, "learning_rate": 7.440275759050499e-05, "loss": 0.4209, "step": 375 }, { "epoch": 1.2695304274584123, "grad_norm": 0.2981596620618354, "learning_rate": 7.435453068881706e-05, "loss": 0.4127, "step": 376 }, { "epoch": 1.2728995578016424, "grad_norm": 0.323823588891271, "learning_rate": 7.430611268078059e-05, "loss": 0.4097, "step": 377 }, { "epoch": 1.2762686881448726, "grad_norm": 0.3872478492699807, "learning_rate": 7.425750383573384e-05, "loss": 0.4142, "step": 378 }, { "epoch": 1.2796378184881028, "grad_norm": 0.38340080550258643, "learning_rate": 7.420870442407662e-05, "loss": 0.4158, "step": 379 }, { "epoch": 1.283006948831333, "grad_norm": 0.33826799912854405, "learning_rate": 7.415971471726884e-05, "loss": 0.4181, "step": 380 }, { "epoch": 1.286376079174563, "grad_norm": 0.3412122527192401, "learning_rate": 7.411053498782893e-05, "loss": 0.4115, "step": 381 }, { "epoch": 1.2897452095177933, "grad_norm": 0.339753253979875, "learning_rate": 7.406116550933246e-05, "loss": 0.414, "step": 382 }, { "epoch": 1.2931143398610234, "grad_norm": 0.27036940059101494, "learning_rate": 7.401160655641044e-05, "loss": 0.4134, "step": 383 }, { "epoch": 1.2964834702042536, "grad_norm": 0.26771109575539487, "learning_rate": 7.396185840474792e-05, "loss": 0.4145, "step": 384 }, { "epoch": 1.2998526005474837, "grad_norm": 0.26853385874933655, "learning_rate": 7.391192133108243e-05, "loss": 0.4196, "step": 385 }, { "epoch": 1.303221730890714, "grad_norm": 0.25978626330709237, "learning_rate": 7.386179561320243e-05, "loss": 0.4179, "step": 386 }, { "epoch": 1.306590861233944, "grad_norm": 0.27982894045702544, "learning_rate": 7.381148152994573e-05, "loss": 0.4134, "step": 387 }, { "epoch": 1.3099599915771742, "grad_norm": 0.2323947422520883, "learning_rate": 7.376097936119803e-05, "loss": 0.4125, "step": 388 }, { "epoch": 1.3133291219204044, "grad_norm": 0.30635653504253, "learning_rate": 7.371028938789122e-05, "loss": 0.4169, "step": 389 }, { "epoch": 1.3166982522636346, "grad_norm": 0.30800676867326193, "learning_rate": 7.365941189200201e-05, "loss": 0.4124, "step": 390 }, { "epoch": 1.3200673826068647, "grad_norm": 0.26291762070095065, "learning_rate": 7.360834715655019e-05, "loss": 0.4163, "step": 391 }, { "epoch": 1.3234365129500947, "grad_norm": 0.28430697703638136, "learning_rate": 7.35570954655971e-05, "loss": 0.4126, "step": 392 }, { "epoch": 1.3268056432933248, "grad_norm": 0.26422175633500083, "learning_rate": 7.350565710424414e-05, "loss": 0.4089, "step": 393 }, { "epoch": 1.330174773636555, "grad_norm": 0.2426759795255368, "learning_rate": 7.345403235863105e-05, "loss": 0.4164, "step": 394 }, { "epoch": 1.3335439039797852, "grad_norm": 0.25948928094236345, "learning_rate": 7.340222151593443e-05, "loss": 0.4184, "step": 395 }, { "epoch": 1.3369130343230153, "grad_norm": 0.3377516247942669, "learning_rate": 7.335022486436608e-05, "loss": 0.4169, "step": 396 }, { "epoch": 1.3402821646662455, "grad_norm": 0.39062210665305114, "learning_rate": 7.329804269317137e-05, "loss": 0.4212, "step": 397 }, { "epoch": 1.3436512950094757, "grad_norm": 0.43725817266447464, "learning_rate": 7.324567529262775e-05, "loss": 0.4162, "step": 398 }, { "epoch": 1.3470204253527058, "grad_norm": 0.3940951458992484, "learning_rate": 7.319312295404301e-05, "loss": 0.4109, "step": 399 }, { "epoch": 1.350389555695936, "grad_norm": 0.30866362073984877, "learning_rate": 7.31403859697537e-05, "loss": 0.4138, "step": 400 }, { "epoch": 1.3537586860391662, "grad_norm": 0.26875958761791713, "learning_rate": 7.308746463312353e-05, "loss": 0.417, "step": 401 }, { "epoch": 1.3571278163823963, "grad_norm": 0.3115107888080639, "learning_rate": 7.303435923854172e-05, "loss": 0.4122, "step": 402 }, { "epoch": 1.3604969467256265, "grad_norm": 0.36714492695394935, "learning_rate": 7.298107008142139e-05, "loss": 0.4159, "step": 403 }, { "epoch": 1.3638660770688567, "grad_norm": 0.3981685397894353, "learning_rate": 7.292759745819781e-05, "loss": 0.4133, "step": 404 }, { "epoch": 1.3672352074120868, "grad_norm": 0.3069454557345131, "learning_rate": 7.287394166632691e-05, "loss": 0.4208, "step": 405 }, { "epoch": 1.370604337755317, "grad_norm": 0.24748441489038914, "learning_rate": 7.282010300428351e-05, "loss": 0.4104, "step": 406 }, { "epoch": 1.373973468098547, "grad_norm": 0.2118791200536055, "learning_rate": 7.276608177155968e-05, "loss": 0.4124, "step": 407 }, { "epoch": 1.377342598441777, "grad_norm": 0.24963361520147168, "learning_rate": 7.271187826866312e-05, "loss": 0.4149, "step": 408 }, { "epoch": 1.3807117287850073, "grad_norm": 0.31609061336459937, "learning_rate": 7.265749279711543e-05, "loss": 0.4266, "step": 409 }, { "epoch": 1.3840808591282374, "grad_norm": 0.35611885888992273, "learning_rate": 7.260292565945049e-05, "loss": 0.4144, "step": 410 }, { "epoch": 1.3874499894714676, "grad_norm": 0.36813941496856034, "learning_rate": 7.254817715921273e-05, "loss": 0.4148, "step": 411 }, { "epoch": 1.3908191198146977, "grad_norm": 0.3386310771485794, "learning_rate": 7.249324760095544e-05, "loss": 0.4157, "step": 412 }, { "epoch": 1.394188250157928, "grad_norm": 0.3286044748961816, "learning_rate": 7.243813729023913e-05, "loss": 0.418, "step": 413 }, { "epoch": 1.397557380501158, "grad_norm": 0.36106557337509826, "learning_rate": 7.238284653362977e-05, "loss": 0.4127, "step": 414 }, { "epoch": 1.4009265108443882, "grad_norm": 0.3713183412724868, "learning_rate": 7.232737563869711e-05, "loss": 0.4223, "step": 415 }, { "epoch": 1.4042956411876184, "grad_norm": 0.4108948812505769, "learning_rate": 7.227172491401299e-05, "loss": 0.4159, "step": 416 }, { "epoch": 1.4076647715308486, "grad_norm": 0.42746635756913826, "learning_rate": 7.221589466914955e-05, "loss": 0.4183, "step": 417 }, { "epoch": 1.4110339018740787, "grad_norm": 0.4281754377696307, "learning_rate": 7.215988521467763e-05, "loss": 0.4143, "step": 418 }, { "epoch": 1.414403032217309, "grad_norm": 0.34581190375042925, "learning_rate": 7.210369686216492e-05, "loss": 0.4232, "step": 419 }, { "epoch": 1.417772162560539, "grad_norm": 0.24817011068233216, "learning_rate": 7.204732992417431e-05, "loss": 0.4203, "step": 420 }, { "epoch": 1.4211412929037692, "grad_norm": 0.2703015486109723, "learning_rate": 7.199078471426208e-05, "loss": 0.4188, "step": 421 }, { "epoch": 1.4245104232469994, "grad_norm": 0.3376907597722382, "learning_rate": 7.193406154697625e-05, "loss": 0.4123, "step": 422 }, { "epoch": 1.4278795535902296, "grad_norm": 0.35688284736368614, "learning_rate": 7.187716073785471e-05, "loss": 0.4073, "step": 423 }, { "epoch": 1.4312486839334597, "grad_norm": 0.29210262958830335, "learning_rate": 7.18200826034236e-05, "loss": 0.4155, "step": 424 }, { "epoch": 1.4346178142766899, "grad_norm": 0.20624868853539452, "learning_rate": 7.176282746119544e-05, "loss": 0.4082, "step": 425 }, { "epoch": 1.43798694461992, "grad_norm": 0.21431087254932987, "learning_rate": 7.17053956296674e-05, "loss": 0.4072, "step": 426 }, { "epoch": 1.4413560749631502, "grad_norm": 0.25982900003092185, "learning_rate": 7.164778742831954e-05, "loss": 0.4113, "step": 427 }, { "epoch": 1.4447252053063804, "grad_norm": 0.3503298873194117, "learning_rate": 7.159000317761305e-05, "loss": 0.4128, "step": 428 }, { "epoch": 1.4480943356496105, "grad_norm": 0.4693051629184559, "learning_rate": 7.153204319898839e-05, "loss": 0.4138, "step": 429 }, { "epoch": 1.4514634659928407, "grad_norm": 0.502991287048126, "learning_rate": 7.14739078148636e-05, "loss": 0.4157, "step": 430 }, { "epoch": 1.4548325963360709, "grad_norm": 0.5001041791172387, "learning_rate": 7.141559734863245e-05, "loss": 0.4082, "step": 431 }, { "epoch": 1.458201726679301, "grad_norm": 0.4696810029288007, "learning_rate": 7.135711212466264e-05, "loss": 0.4198, "step": 432 }, { "epoch": 1.461570857022531, "grad_norm": 0.43034902073433023, "learning_rate": 7.1298452468294e-05, "loss": 0.4165, "step": 433 }, { "epoch": 1.4649399873657611, "grad_norm": 0.4022839654121198, "learning_rate": 7.123961870583671e-05, "loss": 0.4096, "step": 434 }, { "epoch": 1.4683091177089913, "grad_norm": 0.3107712308577315, "learning_rate": 7.118061116456944e-05, "loss": 0.4137, "step": 435 }, { "epoch": 1.4716782480522215, "grad_norm": 0.3140180702883453, "learning_rate": 7.112143017273759e-05, "loss": 0.4108, "step": 436 }, { "epoch": 1.4750473783954516, "grad_norm": 0.40495663409539695, "learning_rate": 7.106207605955136e-05, "loss": 0.4166, "step": 437 }, { "epoch": 1.4784165087386818, "grad_norm": 0.4652370041483942, "learning_rate": 7.100254915518408e-05, "loss": 0.414, "step": 438 }, { "epoch": 1.481785639081912, "grad_norm": 0.41391982007664, "learning_rate": 7.094284979077015e-05, "loss": 0.4131, "step": 439 }, { "epoch": 1.4851547694251421, "grad_norm": 0.34516805959620245, "learning_rate": 7.088297829840346e-05, "loss": 0.4129, "step": 440 }, { "epoch": 1.4885238997683723, "grad_norm": 0.32652038382328485, "learning_rate": 7.08229350111353e-05, "loss": 0.413, "step": 441 }, { "epoch": 1.4918930301116025, "grad_norm": 0.22506092814882847, "learning_rate": 7.076272026297268e-05, "loss": 0.4127, "step": 442 }, { "epoch": 1.4952621604548326, "grad_norm": 0.2282536847065667, "learning_rate": 7.070233438887639e-05, "loss": 0.4071, "step": 443 }, { "epoch": 1.4986312907980628, "grad_norm": 0.2446847320184482, "learning_rate": 7.064177772475912e-05, "loss": 0.4138, "step": 444 }, { "epoch": 1.502000421141293, "grad_norm": 0.25152698752852437, "learning_rate": 7.05810506074837e-05, "loss": 0.4141, "step": 445 }, { "epoch": 1.505369551484523, "grad_norm": 0.2548217617366647, "learning_rate": 7.052015337486109e-05, "loss": 0.4098, "step": 446 }, { "epoch": 1.508738681827753, "grad_norm": 0.2731777853595498, "learning_rate": 7.045908636564858e-05, "loss": 0.4118, "step": 447 }, { "epoch": 1.5121078121709832, "grad_norm": 0.3121024086238583, "learning_rate": 7.03978499195479e-05, "loss": 0.4111, "step": 448 }, { "epoch": 1.5154769425142134, "grad_norm": 0.28013154989340816, "learning_rate": 7.03364443772033e-05, "loss": 0.4123, "step": 449 }, { "epoch": 1.5188460728574436, "grad_norm": 0.20045789950968235, "learning_rate": 7.027487008019969e-05, "loss": 0.41, "step": 450 }, { "epoch": 1.5222152032006737, "grad_norm": 0.1935253416836786, "learning_rate": 7.021312737106068e-05, "loss": 0.4184, "step": 451 }, { "epoch": 1.5255843335439039, "grad_norm": 0.2182563856327327, "learning_rate": 7.015121659324678e-05, "loss": 0.4121, "step": 452 }, { "epoch": 1.528953463887134, "grad_norm": 0.20129933934815375, "learning_rate": 7.00891380911534e-05, "loss": 0.4136, "step": 453 }, { "epoch": 1.5323225942303642, "grad_norm": 0.19011310030838788, "learning_rate": 7.002689221010897e-05, "loss": 0.4113, "step": 454 }, { "epoch": 1.5356917245735944, "grad_norm": 0.19585723303180483, "learning_rate": 6.9964479296373e-05, "loss": 0.4139, "step": 455 }, { "epoch": 1.5390608549168245, "grad_norm": 0.1740680287737997, "learning_rate": 6.990189969713416e-05, "loss": 0.4141, "step": 456 }, { "epoch": 1.5424299852600547, "grad_norm": 0.2068670733390012, "learning_rate": 6.983915376050833e-05, "loss": 0.4093, "step": 457 }, { "epoch": 1.5457991156032849, "grad_norm": 0.2583283837253456, "learning_rate": 6.977624183553676e-05, "loss": 0.4192, "step": 458 }, { "epoch": 1.549168245946515, "grad_norm": 0.28194252885557924, "learning_rate": 6.971316427218399e-05, "loss": 0.412, "step": 459 }, { "epoch": 1.5525373762897452, "grad_norm": 0.27071463569696164, "learning_rate": 6.964992142133602e-05, "loss": 0.4207, "step": 460 }, { "epoch": 1.5559065066329754, "grad_norm": 0.27470579632282327, "learning_rate": 6.958651363479822e-05, "loss": 0.4165, "step": 461 }, { "epoch": 1.5592756369762055, "grad_norm": 0.2703402600040993, "learning_rate": 6.952294126529356e-05, "loss": 0.4134, "step": 462 }, { "epoch": 1.5626447673194357, "grad_norm": 0.26465479604538705, "learning_rate": 6.94592046664605e-05, "loss": 0.4136, "step": 463 }, { "epoch": 1.5660138976626659, "grad_norm": 0.31132857636043815, "learning_rate": 6.939530419285104e-05, "loss": 0.4163, "step": 464 }, { "epoch": 1.569383028005896, "grad_norm": 0.4012221142274, "learning_rate": 6.933124019992884e-05, "loss": 0.4138, "step": 465 }, { "epoch": 1.5727521583491262, "grad_norm": 0.5021621002447393, "learning_rate": 6.926701304406713e-05, "loss": 0.4105, "step": 466 }, { "epoch": 1.5761212886923563, "grad_norm": 0.5905418251776803, "learning_rate": 6.920262308254683e-05, "loss": 0.4147, "step": 467 }, { "epoch": 1.5794904190355865, "grad_norm": 0.6182317762023337, "learning_rate": 6.913807067355445e-05, "loss": 0.4128, "step": 468 }, { "epoch": 1.5828595493788167, "grad_norm": 0.4945917435433832, "learning_rate": 6.907335617618018e-05, "loss": 0.4167, "step": 469 }, { "epoch": 1.5862286797220468, "grad_norm": 0.3166116083838581, "learning_rate": 6.90084799504159e-05, "loss": 0.4136, "step": 470 }, { "epoch": 1.589597810065277, "grad_norm": 0.2848441164225104, "learning_rate": 6.894344235715311e-05, "loss": 0.4127, "step": 471 }, { "epoch": 1.5929669404085072, "grad_norm": 0.35210847111444277, "learning_rate": 6.887824375818099e-05, "loss": 0.4125, "step": 472 }, { "epoch": 1.5963360707517373, "grad_norm": 0.36122192833869504, "learning_rate": 6.881288451618431e-05, "loss": 0.4175, "step": 473 }, { "epoch": 1.5997052010949675, "grad_norm": 0.30874010342588315, "learning_rate": 6.874736499474154e-05, "loss": 0.4123, "step": 474 }, { "epoch": 1.6030743314381977, "grad_norm": 0.2415425383601781, "learning_rate": 6.868168555832266e-05, "loss": 0.409, "step": 475 }, { "epoch": 1.6064434617814278, "grad_norm": 0.2777593930598247, "learning_rate": 6.861584657228728e-05, "loss": 0.4109, "step": 476 }, { "epoch": 1.6098125921246578, "grad_norm": 0.2552160489277856, "learning_rate": 6.854984840288253e-05, "loss": 0.4063, "step": 477 }, { "epoch": 1.613181722467888, "grad_norm": 0.21292379117303817, "learning_rate": 6.848369141724104e-05, "loss": 0.4113, "step": 478 }, { "epoch": 1.616550852811118, "grad_norm": 0.25826725556041485, "learning_rate": 6.841737598337886e-05, "loss": 0.4162, "step": 479 }, { "epoch": 1.6199199831543483, "grad_norm": 0.24587379643844692, "learning_rate": 6.835090247019354e-05, "loss": 0.4098, "step": 480 }, { "epoch": 1.6232891134975784, "grad_norm": 0.22506059025604672, "learning_rate": 6.828427124746191e-05, "loss": 0.4177, "step": 481 }, { "epoch": 1.6266582438408086, "grad_norm": 0.2625291980432951, "learning_rate": 6.821748268583813e-05, "loss": 0.4138, "step": 482 }, { "epoch": 1.6300273741840388, "grad_norm": 0.2899682108073399, "learning_rate": 6.815053715685161e-05, "loss": 0.4112, "step": 483 }, { "epoch": 1.633396504527269, "grad_norm": 0.24684733944107418, "learning_rate": 6.808343503290491e-05, "loss": 0.4084, "step": 484 }, { "epoch": 1.636765634870499, "grad_norm": 0.22856568944562095, "learning_rate": 6.80161766872717e-05, "loss": 0.4099, "step": 485 }, { "epoch": 1.6401347652137293, "grad_norm": 0.2528553309235842, "learning_rate": 6.79487624940947e-05, "loss": 0.4074, "step": 486 }, { "epoch": 1.6435038955569592, "grad_norm": 0.24954291821287325, "learning_rate": 6.788119282838355e-05, "loss": 0.4156, "step": 487 }, { "epoch": 1.6468730259001894, "grad_norm": 0.2486958212588815, "learning_rate": 6.781346806601273e-05, "loss": 0.4148, "step": 488 }, { "epoch": 1.6502421562434195, "grad_norm": 0.20838834765340428, "learning_rate": 6.774558858371952e-05, "loss": 0.4107, "step": 489 }, { "epoch": 1.6536112865866497, "grad_norm": 0.157993940020379, "learning_rate": 6.767755475910185e-05, "loss": 0.4112, "step": 490 }, { "epoch": 1.6569804169298799, "grad_norm": 0.24383891745288697, "learning_rate": 6.760936697061626e-05, "loss": 0.4117, "step": 491 }, { "epoch": 1.66034954727311, "grad_norm": 0.28630859094765176, "learning_rate": 6.754102559757569e-05, "loss": 0.4108, "step": 492 }, { "epoch": 1.6637186776163402, "grad_norm": 0.2744705368738465, "learning_rate": 6.74725310201475e-05, "loss": 0.4068, "step": 493 }, { "epoch": 1.6670878079595703, "grad_norm": 0.2832510381791776, "learning_rate": 6.740388361935125e-05, "loss": 0.4072, "step": 494 }, { "epoch": 1.6704569383028005, "grad_norm": 0.2988249231230451, "learning_rate": 6.733508377705661e-05, "loss": 0.4077, "step": 495 }, { "epoch": 1.6738260686460307, "grad_norm": 0.24557523045791532, "learning_rate": 6.726613187598132e-05, "loss": 0.416, "step": 496 }, { "epoch": 1.6771951989892608, "grad_norm": 0.21450213834423756, "learning_rate": 6.71970282996889e-05, "loss": 0.4099, "step": 497 }, { "epoch": 1.680564329332491, "grad_norm": 0.2564463597465919, "learning_rate": 6.712777343258666e-05, "loss": 0.4113, "step": 498 }, { "epoch": 1.6839334596757212, "grad_norm": 0.28973958295073354, "learning_rate": 6.705836765992348e-05, "loss": 0.4173, "step": 499 }, { "epoch": 1.6873025900189513, "grad_norm": 0.3093418967185147, "learning_rate": 6.698881136778771e-05, "loss": 0.4173, "step": 500 }, { "epoch": 1.6906717203621815, "grad_norm": 0.30710292961925306, "learning_rate": 6.691910494310499e-05, "loss": 0.4202, "step": 501 }, { "epoch": 1.6940408507054117, "grad_norm": 0.298386372490933, "learning_rate": 6.684924877363613e-05, "loss": 0.4063, "step": 502 }, { "epoch": 1.6974099810486418, "grad_norm": 0.31358421654801716, "learning_rate": 6.67792432479749e-05, "loss": 0.4117, "step": 503 }, { "epoch": 1.700779111391872, "grad_norm": 0.34684913918298366, "learning_rate": 6.670908875554594e-05, "loss": 0.4103, "step": 504 }, { "epoch": 1.7041482417351022, "grad_norm": 0.3071849696400485, "learning_rate": 6.663878568660258e-05, "loss": 0.4064, "step": 505 }, { "epoch": 1.7075173720783323, "grad_norm": 0.25934260311596186, "learning_rate": 6.656833443222458e-05, "loss": 0.4026, "step": 506 }, { "epoch": 1.7108865024215625, "grad_norm": 0.254331135385578, "learning_rate": 6.649773538431605e-05, "loss": 0.4123, "step": 507 }, { "epoch": 1.7142556327647926, "grad_norm": 0.2696672284837906, "learning_rate": 6.642698893560327e-05, "loss": 0.4135, "step": 508 }, { "epoch": 1.7176247631080228, "grad_norm": 0.3170338993835499, "learning_rate": 6.635609547963243e-05, "loss": 0.4078, "step": 509 }, { "epoch": 1.720993893451253, "grad_norm": 0.34598694657993484, "learning_rate": 6.628505541076755e-05, "loss": 0.4143, "step": 510 }, { "epoch": 1.7243630237944831, "grad_norm": 0.3659302514618013, "learning_rate": 6.621386912418816e-05, "loss": 0.413, "step": 511 }, { "epoch": 1.7277321541377133, "grad_norm": 0.3036155922766547, "learning_rate": 6.614253701588718e-05, "loss": 0.413, "step": 512 }, { "epoch": 1.7311012844809435, "grad_norm": 0.26442302840915777, "learning_rate": 6.607105948266872e-05, "loss": 0.4141, "step": 513 }, { "epoch": 1.7344704148241736, "grad_norm": 0.2820703196464, "learning_rate": 6.599943692214587e-05, "loss": 0.4154, "step": 514 }, { "epoch": 1.7378395451674038, "grad_norm": 0.2716579783783052, "learning_rate": 6.592766973273843e-05, "loss": 0.418, "step": 515 }, { "epoch": 1.741208675510634, "grad_norm": 0.2320214556767005, "learning_rate": 6.585575831367078e-05, "loss": 0.4136, "step": 516 }, { "epoch": 1.7445778058538641, "grad_norm": 0.20790915888905742, "learning_rate": 6.578370306496957e-05, "loss": 0.4126, "step": 517 }, { "epoch": 1.747946936197094, "grad_norm": 0.2165582926633229, "learning_rate": 6.571150438746157e-05, "loss": 0.4112, "step": 518 }, { "epoch": 1.7513160665403242, "grad_norm": 0.24261057128754013, "learning_rate": 6.563916268277144e-05, "loss": 0.413, "step": 519 }, { "epoch": 1.7546851968835544, "grad_norm": 0.2755800264624728, "learning_rate": 6.55666783533194e-05, "loss": 0.4166, "step": 520 }, { "epoch": 1.7580543272267846, "grad_norm": 0.28813858434017786, "learning_rate": 6.549405180231911e-05, "loss": 0.404, "step": 521 }, { "epoch": 1.7614234575700147, "grad_norm": 0.24090919880210407, "learning_rate": 6.542128343377536e-05, "loss": 0.4075, "step": 522 }, { "epoch": 1.764792587913245, "grad_norm": 0.21389800108034238, "learning_rate": 6.534837365248185e-05, "loss": 0.4124, "step": 523 }, { "epoch": 1.768161718256475, "grad_norm": 0.2562042134322129, "learning_rate": 6.527532286401889e-05, "loss": 0.4174, "step": 524 }, { "epoch": 1.7715308485997052, "grad_norm": 0.2571401145743441, "learning_rate": 6.520213147475123e-05, "loss": 0.4144, "step": 525 }, { "epoch": 1.7748999789429354, "grad_norm": 0.2423820773625362, "learning_rate": 6.51287998918257e-05, "loss": 0.4046, "step": 526 }, { "epoch": 1.7782691092861656, "grad_norm": 0.2310131148631897, "learning_rate": 6.505532852316904e-05, "loss": 0.407, "step": 527 }, { "epoch": 1.7816382396293957, "grad_norm": 0.2467085051059651, "learning_rate": 6.498171777748557e-05, "loss": 0.4134, "step": 528 }, { "epoch": 1.7850073699726257, "grad_norm": 0.2429312927228722, "learning_rate": 6.49079680642549e-05, "loss": 0.4136, "step": 529 }, { "epoch": 1.7883765003158558, "grad_norm": 0.18962286619000535, "learning_rate": 6.483407979372975e-05, "loss": 0.4094, "step": 530 }, { "epoch": 1.791745630659086, "grad_norm": 0.17276030637120937, "learning_rate": 6.476005337693355e-05, "loss": 0.4127, "step": 531 }, { "epoch": 1.7951147610023162, "grad_norm": 0.1991873488324741, "learning_rate": 6.468588922565822e-05, "loss": 0.407, "step": 532 }, { "epoch": 1.7984838913455463, "grad_norm": 0.23230143768755912, "learning_rate": 6.461158775246186e-05, "loss": 0.4069, "step": 533 }, { "epoch": 1.8018530216887765, "grad_norm": 0.25362081452848795, "learning_rate": 6.453714937066648e-05, "loss": 0.4089, "step": 534 }, { "epoch": 1.8052221520320066, "grad_norm": 0.20024317986028692, "learning_rate": 6.446257449435566e-05, "loss": 0.4062, "step": 535 }, { "epoch": 1.8085912823752368, "grad_norm": 0.16636181558776822, "learning_rate": 6.438786353837228e-05, "loss": 0.4061, "step": 536 }, { "epoch": 1.811960412718467, "grad_norm": 0.20687002125002474, "learning_rate": 6.43130169183162e-05, "loss": 0.4131, "step": 537 }, { "epoch": 1.8153295430616971, "grad_norm": 0.2568138645034864, "learning_rate": 6.423803505054193e-05, "loss": 0.411, "step": 538 }, { "epoch": 1.8186986734049273, "grad_norm": 0.3369872578212292, "learning_rate": 6.416291835215636e-05, "loss": 0.4077, "step": 539 }, { "epoch": 1.8220678037481575, "grad_norm": 0.41379320932213953, "learning_rate": 6.408766724101638e-05, "loss": 0.4077, "step": 540 }, { "epoch": 1.8254369340913876, "grad_norm": 0.43767998472550695, "learning_rate": 6.401228213572663e-05, "loss": 0.4151, "step": 541 }, { "epoch": 1.8288060644346178, "grad_norm": 0.4536984763596022, "learning_rate": 6.393676345563708e-05, "loss": 0.42, "step": 542 }, { "epoch": 1.832175194777848, "grad_norm": 0.4692529959956868, "learning_rate": 6.386111162084078e-05, "loss": 0.4002, "step": 543 }, { "epoch": 1.8355443251210781, "grad_norm": 0.34237321055490366, "learning_rate": 6.378532705217148e-05, "loss": 0.406, "step": 544 }, { "epoch": 1.8389134554643083, "grad_norm": 0.2659729255014706, "learning_rate": 6.370941017120127e-05, "loss": 0.4135, "step": 545 }, { "epoch": 1.8422825858075385, "grad_norm": 0.32797296963486666, "learning_rate": 6.363336140023833e-05, "loss": 0.4088, "step": 546 }, { "epoch": 1.8456517161507686, "grad_norm": 0.35579650932418716, "learning_rate": 6.355718116232444e-05, "loss": 0.4093, "step": 547 }, { "epoch": 1.8490208464939988, "grad_norm": 0.2907411351475013, "learning_rate": 6.348086988123274e-05, "loss": 0.4116, "step": 548 }, { "epoch": 1.852389976837229, "grad_norm": 0.2732388318681213, "learning_rate": 6.340442798146535e-05, "loss": 0.4091, "step": 549 }, { "epoch": 1.855759107180459, "grad_norm": 0.35761144655913124, "learning_rate": 6.332785588825094e-05, "loss": 0.4037, "step": 550 }, { "epoch": 1.8591282375236893, "grad_norm": 0.3014328362434633, "learning_rate": 6.325115402754245e-05, "loss": 0.4072, "step": 551 }, { "epoch": 1.8624973678669194, "grad_norm": 0.2340334979203501, "learning_rate": 6.317432282601469e-05, "loss": 0.403, "step": 552 }, { "epoch": 1.8658664982101496, "grad_norm": 0.33855256005840595, "learning_rate": 6.309736271106193e-05, "loss": 0.4106, "step": 553 }, { "epoch": 1.8692356285533798, "grad_norm": 0.31482993852294594, "learning_rate": 6.302027411079562e-05, "loss": 0.4079, "step": 554 }, { "epoch": 1.87260475889661, "grad_norm": 0.21683415129270545, "learning_rate": 6.294305745404185e-05, "loss": 0.4032, "step": 555 }, { "epoch": 1.87597388923984, "grad_norm": 0.209469978649313, "learning_rate": 6.286571317033915e-05, "loss": 0.4088, "step": 556 }, { "epoch": 1.8793430195830703, "grad_norm": 0.2816343476274617, "learning_rate": 6.278824168993596e-05, "loss": 0.4126, "step": 557 }, { "epoch": 1.8827121499263004, "grad_norm": 0.32252631746288557, "learning_rate": 6.271064344378832e-05, "loss": 0.4086, "step": 558 }, { "epoch": 1.8860812802695306, "grad_norm": 0.2900131891387989, "learning_rate": 6.263291886355738e-05, "loss": 0.4086, "step": 559 }, { "epoch": 1.8894504106127605, "grad_norm": 0.26445922268042416, "learning_rate": 6.255506838160711e-05, "loss": 0.4093, "step": 560 }, { "epoch": 1.8928195409559907, "grad_norm": 0.2561028521945913, "learning_rate": 6.247709243100185e-05, "loss": 0.4136, "step": 561 }, { "epoch": 1.8961886712992209, "grad_norm": 0.23899571940882475, "learning_rate": 6.239899144550383e-05, "loss": 0.4058, "step": 562 }, { "epoch": 1.899557801642451, "grad_norm": 0.2338421290415243, "learning_rate": 6.232076585957087e-05, "loss": 0.4074, "step": 563 }, { "epoch": 1.9029269319856812, "grad_norm": 0.18752299712254275, "learning_rate": 6.224241610835391e-05, "loss": 0.4096, "step": 564 }, { "epoch": 1.9062960623289114, "grad_norm": 0.19324708447438393, "learning_rate": 6.216394262769459e-05, "loss": 0.4096, "step": 565 }, { "epoch": 1.9096651926721415, "grad_norm": 0.21276012461948887, "learning_rate": 6.208534585412282e-05, "loss": 0.4033, "step": 566 }, { "epoch": 1.9130343230153717, "grad_norm": 0.18970083289771164, "learning_rate": 6.200662622485435e-05, "loss": 0.4054, "step": 567 }, { "epoch": 1.9164034533586018, "grad_norm": 0.1696360552220803, "learning_rate": 6.19277841777884e-05, "loss": 0.4069, "step": 568 }, { "epoch": 1.919772583701832, "grad_norm": 0.19478504599245822, "learning_rate": 6.18488201515051e-05, "loss": 0.4054, "step": 569 }, { "epoch": 1.923141714045062, "grad_norm": 0.16721019486842992, "learning_rate": 6.176973458526317e-05, "loss": 0.4142, "step": 570 }, { "epoch": 1.9265108443882921, "grad_norm": 0.18059816629328238, "learning_rate": 6.169052791899742e-05, "loss": 0.4047, "step": 571 }, { "epoch": 1.9298799747315223, "grad_norm": 0.2125539453111369, "learning_rate": 6.161120059331628e-05, "loss": 0.4074, "step": 572 }, { "epoch": 1.9332491050747524, "grad_norm": 0.19087275687720429, "learning_rate": 6.153175304949946e-05, "loss": 0.411, "step": 573 }, { "epoch": 1.9366182354179826, "grad_norm": 0.18049162279809125, "learning_rate": 6.14521857294953e-05, "loss": 0.4055, "step": 574 }, { "epoch": 1.9399873657612128, "grad_norm": 0.17375875826436044, "learning_rate": 6.137249907591855e-05, "loss": 0.4065, "step": 575 }, { "epoch": 1.943356496104443, "grad_norm": 0.1739704448036202, "learning_rate": 6.129269353204769e-05, "loss": 0.4055, "step": 576 }, { "epoch": 1.946725626447673, "grad_norm": 0.18538527661707113, "learning_rate": 6.121276954182261e-05, "loss": 0.4097, "step": 577 }, { "epoch": 1.9500947567909033, "grad_norm": 0.15156397322647622, "learning_rate": 6.113272754984206e-05, "loss": 0.4061, "step": 578 }, { "epoch": 1.9534638871341334, "grad_norm": 0.18018187705246097, "learning_rate": 6.105256800136125e-05, "loss": 0.4086, "step": 579 }, { "epoch": 1.9568330174773636, "grad_norm": 0.1842284584819115, "learning_rate": 6.0972291342289274e-05, "loss": 0.413, "step": 580 }, { "epoch": 1.9602021478205938, "grad_norm": 0.20065268901018266, "learning_rate": 6.0891898019186726e-05, "loss": 0.4068, "step": 581 }, { "epoch": 1.963571278163824, "grad_norm": 0.20725303582942523, "learning_rate": 6.081138847926317e-05, "loss": 0.4102, "step": 582 }, { "epoch": 1.966940408507054, "grad_norm": 0.19644421357341532, "learning_rate": 6.0730763170374636e-05, "loss": 0.4053, "step": 583 }, { "epoch": 1.9703095388502843, "grad_norm": 0.20950085034614344, "learning_rate": 6.065002254102116e-05, "loss": 0.4043, "step": 584 }, { "epoch": 1.9736786691935144, "grad_norm": 0.22898989423400687, "learning_rate": 6.056916704034429e-05, "loss": 0.4038, "step": 585 }, { "epoch": 1.9770477995367446, "grad_norm": 0.2379556008347109, "learning_rate": 6.048819711812457e-05, "loss": 0.4075, "step": 586 }, { "epoch": 1.9804169298799748, "grad_norm": 0.23608922426333814, "learning_rate": 6.040711322477906e-05, "loss": 0.4074, "step": 587 }, { "epoch": 1.983786060223205, "grad_norm": 0.2036587578092891, "learning_rate": 6.032591581135878e-05, "loss": 0.4116, "step": 588 }, { "epoch": 1.987155190566435, "grad_norm": 0.1851902404809834, "learning_rate": 6.024460532954626e-05, "loss": 0.4015, "step": 589 }, { "epoch": 1.9905243209096652, "grad_norm": 0.18802588423448818, "learning_rate": 6.0163182231652985e-05, "loss": 0.4054, "step": 590 }, { "epoch": 1.9938934512528954, "grad_norm": 0.22345260630855865, "learning_rate": 6.008164697061695e-05, "loss": 0.4055, "step": 591 }, { "epoch": 1.9972625815961256, "grad_norm": 0.23969549917986255, "learning_rate": 6.000000000000001e-05, "loss": 0.4015, "step": 592 }, { "epoch": 2.00336913034323, "grad_norm": 0.2867299003150961, "learning_rate": 5.991824177398549e-05, "loss": 0.3913, "step": 593 }, { "epoch": 2.0067382606864603, "grad_norm": 0.34375862252314415, "learning_rate": 5.983637274737558e-05, "loss": 0.391, "step": 594 }, { "epoch": 2.0101073910296905, "grad_norm": 0.3635152444198319, "learning_rate": 5.975439337558886e-05, "loss": 0.3799, "step": 595 }, { "epoch": 2.0134765213729207, "grad_norm": 0.3422619581016819, "learning_rate": 5.967230411465768e-05, "loss": 0.388, "step": 596 }, { "epoch": 2.016845651716151, "grad_norm": 0.32857568135445225, "learning_rate": 5.9590105421225715e-05, "loss": 0.3873, "step": 597 }, { "epoch": 2.020214782059381, "grad_norm": 0.34465546224144156, "learning_rate": 5.950779775254539e-05, "loss": 0.3864, "step": 598 }, { "epoch": 2.023583912402611, "grad_norm": 0.3318091541966093, "learning_rate": 5.9425381566475316e-05, "loss": 0.3901, "step": 599 }, { "epoch": 2.0269530427458413, "grad_norm": 0.3211852458337534, "learning_rate": 5.934285732147778e-05, "loss": 0.3865, "step": 600 }, { "epoch": 2.0303221730890715, "grad_norm": 0.28372803606540153, "learning_rate": 5.9260225476616157e-05, "loss": 0.3809, "step": 601 }, { "epoch": 2.0336913034323016, "grad_norm": 0.26378333051858827, "learning_rate": 5.91774864915524e-05, "loss": 0.3825, "step": 602 }, { "epoch": 2.037060433775532, "grad_norm": 0.2699942011391507, "learning_rate": 5.909464082654442e-05, "loss": 0.3814, "step": 603 }, { "epoch": 2.040429564118762, "grad_norm": 0.32423565538212784, "learning_rate": 5.90116889424436e-05, "loss": 0.3949, "step": 604 }, { "epoch": 2.043798694461992, "grad_norm": 0.3504800062724603, "learning_rate": 5.8928631300692185e-05, "loss": 0.3919, "step": 605 }, { "epoch": 2.0471678248052223, "grad_norm": 0.28670213447600656, "learning_rate": 5.884546836332072e-05, "loss": 0.3848, "step": 606 }, { "epoch": 2.0505369551484525, "grad_norm": 0.24765267252916567, "learning_rate": 5.8762200592945484e-05, "loss": 0.3862, "step": 607 }, { "epoch": 2.0539060854916826, "grad_norm": 0.25397158563496697, "learning_rate": 5.867882845276593e-05, "loss": 0.384, "step": 608 }, { "epoch": 2.057275215834913, "grad_norm": 0.19777815923412465, "learning_rate": 5.859535240656208e-05, "loss": 0.385, "step": 609 }, { "epoch": 2.060644346178143, "grad_norm": 0.25257499668230105, "learning_rate": 5.851177291869197e-05, "loss": 0.3902, "step": 610 }, { "epoch": 2.064013476521373, "grad_norm": 0.23438152088089984, "learning_rate": 5.842809045408905e-05, "loss": 0.3828, "step": 611 }, { "epoch": 2.0673826068646033, "grad_norm": 0.24579596547862945, "learning_rate": 5.834430547825964e-05, "loss": 0.3895, "step": 612 }, { "epoch": 2.070751737207833, "grad_norm": 0.254567202187919, "learning_rate": 5.826041845728026e-05, "loss": 0.3884, "step": 613 }, { "epoch": 2.074120867551063, "grad_norm": 0.26694805867978466, "learning_rate": 5.8176429857795104e-05, "loss": 0.3884, "step": 614 }, { "epoch": 2.0774899978942933, "grad_norm": 0.292686078529123, "learning_rate": 5.809234014701342e-05, "loss": 0.3869, "step": 615 }, { "epoch": 2.0808591282375235, "grad_norm": 0.2543773210365024, "learning_rate": 5.8008149792706936e-05, "loss": 0.3841, "step": 616 }, { "epoch": 2.0842282585807537, "grad_norm": 0.23117543050120432, "learning_rate": 5.7923859263207205e-05, "loss": 0.3839, "step": 617 }, { "epoch": 2.087597388923984, "grad_norm": 0.32949270894440474, "learning_rate": 5.783946902740304e-05, "loss": 0.3848, "step": 618 }, { "epoch": 2.090966519267214, "grad_norm": 0.3487344164810163, "learning_rate": 5.7754979554737924e-05, "loss": 0.3841, "step": 619 }, { "epoch": 2.094335649610444, "grad_norm": 0.23249972606551436, "learning_rate": 5.767039131520733e-05, "loss": 0.3808, "step": 620 }, { "epoch": 2.0977047799536743, "grad_norm": 0.1642526127565639, "learning_rate": 5.758570477935618e-05, "loss": 0.3852, "step": 621 }, { "epoch": 2.1010739102969045, "grad_norm": 0.22737138050339126, "learning_rate": 5.750092041827618e-05, "loss": 0.3862, "step": 622 }, { "epoch": 2.1044430406401347, "grad_norm": 0.22187422496371617, "learning_rate": 5.7416038703603216e-05, "loss": 0.39, "step": 623 }, { "epoch": 2.107812170983365, "grad_norm": 0.1976542359852637, "learning_rate": 5.7331060107514754e-05, "loss": 0.3828, "step": 624 }, { "epoch": 2.111181301326595, "grad_norm": 0.22929255732564582, "learning_rate": 5.724598510272714e-05, "loss": 0.3865, "step": 625 }, { "epoch": 2.114550431669825, "grad_norm": 0.2281829564525587, "learning_rate": 5.716081416249307e-05, "loss": 0.3834, "step": 626 }, { "epoch": 2.1179195620130553, "grad_norm": 0.1711530750792344, "learning_rate": 5.707554776059886e-05, "loss": 0.3864, "step": 627 }, { "epoch": 2.1212886923562855, "grad_norm": 0.1952598465412235, "learning_rate": 5.699018637136192e-05, "loss": 0.3853, "step": 628 }, { "epoch": 2.1246578226995156, "grad_norm": 0.21178404694012465, "learning_rate": 5.6904730469627985e-05, "loss": 0.394, "step": 629 }, { "epoch": 2.128026953042746, "grad_norm": 0.2291084803798316, "learning_rate": 5.681918053076858e-05, "loss": 0.3851, "step": 630 }, { "epoch": 2.131396083385976, "grad_norm": 0.2550272051240587, "learning_rate": 5.673353703067832e-05, "loss": 0.3872, "step": 631 }, { "epoch": 2.134765213729206, "grad_norm": 0.2497998419444254, "learning_rate": 5.664780044577231e-05, "loss": 0.3881, "step": 632 }, { "epoch": 2.1381343440724363, "grad_norm": 0.2222082480877385, "learning_rate": 5.6561971252983424e-05, "loss": 0.388, "step": 633 }, { "epoch": 2.1415034744156665, "grad_norm": 0.18680744639544267, "learning_rate": 5.6476049929759714e-05, "loss": 0.3891, "step": 634 }, { "epoch": 2.1448726047588966, "grad_norm": 0.21245971460544757, "learning_rate": 5.6390036954061726e-05, "loss": 0.3863, "step": 635 }, { "epoch": 2.148241735102127, "grad_norm": 0.2162219122370638, "learning_rate": 5.6303932804359857e-05, "loss": 0.3909, "step": 636 }, { "epoch": 2.151610865445357, "grad_norm": 0.15581628741660436, "learning_rate": 5.621773795963166e-05, "loss": 0.3879, "step": 637 }, { "epoch": 2.154979995788587, "grad_norm": 0.22990888646168536, "learning_rate": 5.613145289935926e-05, "loss": 0.3882, "step": 638 }, { "epoch": 2.1583491261318173, "grad_norm": 0.24959544004712048, "learning_rate": 5.6045078103526545e-05, "loss": 0.3799, "step": 639 }, { "epoch": 2.1617182564750475, "grad_norm": 0.2308113655952683, "learning_rate": 5.595861405261666e-05, "loss": 0.3879, "step": 640 }, { "epoch": 2.1650873868182776, "grad_norm": 0.2092244335914582, "learning_rate": 5.58720612276092e-05, "loss": 0.3871, "step": 641 }, { "epoch": 2.168456517161508, "grad_norm": 0.2134067897632055, "learning_rate": 5.578542010997764e-05, "loss": 0.3822, "step": 642 }, { "epoch": 2.171825647504738, "grad_norm": 0.20839647987055449, "learning_rate": 5.569869118168655e-05, "loss": 0.3848, "step": 643 }, { "epoch": 2.175194777847968, "grad_norm": 0.16985344503865618, "learning_rate": 5.561187492518903e-05, "loss": 0.3858, "step": 644 }, { "epoch": 2.1785639081911983, "grad_norm": 0.20941799721128232, "learning_rate": 5.5524971823423905e-05, "loss": 0.392, "step": 645 }, { "epoch": 2.1819330385344284, "grad_norm": 0.21048667694813664, "learning_rate": 5.5437982359813156e-05, "loss": 0.3837, "step": 646 }, { "epoch": 2.1853021688776586, "grad_norm": 0.17246060013503955, "learning_rate": 5.5350907018259135e-05, "loss": 0.3863, "step": 647 }, { "epoch": 2.1886712992208888, "grad_norm": 0.1808917523018754, "learning_rate": 5.526374628314195e-05, "loss": 0.3873, "step": 648 }, { "epoch": 2.192040429564119, "grad_norm": 0.16962189075007583, "learning_rate": 5.5176500639316693e-05, "loss": 0.3806, "step": 649 }, { "epoch": 2.195409559907349, "grad_norm": 0.15829489129124838, "learning_rate": 5.50891705721108e-05, "loss": 0.3912, "step": 650 }, { "epoch": 2.1987786902505793, "grad_norm": 0.20128590320313494, "learning_rate": 5.5001756567321355e-05, "loss": 0.3792, "step": 651 }, { "epoch": 2.2021478205938094, "grad_norm": 0.218877863583923, "learning_rate": 5.4914259111212355e-05, "loss": 0.3865, "step": 652 }, { "epoch": 2.2055169509370396, "grad_norm": 0.17606235529471279, "learning_rate": 5.482667869051199e-05, "loss": 0.3917, "step": 653 }, { "epoch": 2.2088860812802693, "grad_norm": 0.14890556371643418, "learning_rate": 5.473901579241e-05, "loss": 0.38, "step": 654 }, { "epoch": 2.2122552116235, "grad_norm": 0.1654643380961197, "learning_rate": 5.4651270904554915e-05, "loss": 0.394, "step": 655 }, { "epoch": 2.2156243419667296, "grad_norm": 0.1570214426630876, "learning_rate": 5.4563444515051354e-05, "loss": 0.3854, "step": 656 }, { "epoch": 2.21899347230996, "grad_norm": 0.1691883131216727, "learning_rate": 5.44755371124573e-05, "loss": 0.3851, "step": 657 }, { "epoch": 2.22236260265319, "grad_norm": 0.17557198906026328, "learning_rate": 5.438754918578144e-05, "loss": 0.3913, "step": 658 }, { "epoch": 2.22573173299642, "grad_norm": 0.16768631591392807, "learning_rate": 5.429948122448031e-05, "loss": 0.386, "step": 659 }, { "epoch": 2.2291008633396503, "grad_norm": 0.14731731125382688, "learning_rate": 5.4211333718455756e-05, "loss": 0.3922, "step": 660 }, { "epoch": 2.2324699936828805, "grad_norm": 0.17746489461476853, "learning_rate": 5.4123107158052034e-05, "loss": 0.387, "step": 661 }, { "epoch": 2.2358391240261106, "grad_norm": 0.1903089984499793, "learning_rate": 5.4034802034053223e-05, "loss": 0.3833, "step": 662 }, { "epoch": 2.239208254369341, "grad_norm": 0.17184011460057994, "learning_rate": 5.394641883768041e-05, "loss": 0.39, "step": 663 }, { "epoch": 2.242577384712571, "grad_norm": 0.20233097347593668, "learning_rate": 5.3857958060588955e-05, "loss": 0.3891, "step": 664 }, { "epoch": 2.245946515055801, "grad_norm": 0.21958650033217517, "learning_rate": 5.3769420194865806e-05, "loss": 0.3856, "step": 665 }, { "epoch": 2.2493156453990313, "grad_norm": 0.18358377095064263, "learning_rate": 5.368080573302676e-05, "loss": 0.3828, "step": 666 }, { "epoch": 2.2526847757422614, "grad_norm": 0.17979672984272335, "learning_rate": 5.359211516801365e-05, "loss": 0.3804, "step": 667 }, { "epoch": 2.2560539060854916, "grad_norm": 0.16294334924828324, "learning_rate": 5.3503348993191706e-05, "loss": 0.3825, "step": 668 }, { "epoch": 2.2594230364287218, "grad_norm": 0.1508454226549176, "learning_rate": 5.34145077023467e-05, "loss": 0.385, "step": 669 }, { "epoch": 2.262792166771952, "grad_norm": 0.15470462637665758, "learning_rate": 5.332559178968231e-05, "loss": 0.3778, "step": 670 }, { "epoch": 2.266161297115182, "grad_norm": 0.1359656397629021, "learning_rate": 5.3236601749817296e-05, "loss": 0.3896, "step": 671 }, { "epoch": 2.2695304274584123, "grad_norm": 0.15226695399087686, "learning_rate": 5.314753807778276e-05, "loss": 0.3874, "step": 672 }, { "epoch": 2.2728995578016424, "grad_norm": 0.14503332183422835, "learning_rate": 5.3058401269019415e-05, "loss": 0.3878, "step": 673 }, { "epoch": 2.2762686881448726, "grad_norm": 0.15318787409886342, "learning_rate": 5.296919181937485e-05, "loss": 0.3857, "step": 674 }, { "epoch": 2.2796378184881028, "grad_norm": 0.16971373493795616, "learning_rate": 5.2879910225100655e-05, "loss": 0.3855, "step": 675 }, { "epoch": 2.283006948831333, "grad_norm": 0.1654804092839339, "learning_rate": 5.279055698284982e-05, "loss": 0.3877, "step": 676 }, { "epoch": 2.286376079174563, "grad_norm": 0.1505186583674958, "learning_rate": 5.270113258967386e-05, "loss": 0.3832, "step": 677 }, { "epoch": 2.2897452095177933, "grad_norm": 0.16676666984467559, "learning_rate": 5.261163754302011e-05, "loss": 0.386, "step": 678 }, { "epoch": 2.2931143398610234, "grad_norm": 0.18567032268425918, "learning_rate": 5.2522072340728896e-05, "loss": 0.3907, "step": 679 }, { "epoch": 2.2964834702042536, "grad_norm": 0.1765483695468527, "learning_rate": 5.2432437481030855e-05, "loss": 0.3882, "step": 680 }, { "epoch": 2.2998526005474837, "grad_norm": 0.165430115440251, "learning_rate": 5.234273346254406e-05, "loss": 0.3946, "step": 681 }, { "epoch": 2.303221730890714, "grad_norm": 0.1690494896953244, "learning_rate": 5.225296078427135e-05, "loss": 0.3857, "step": 682 }, { "epoch": 2.306590861233944, "grad_norm": 0.201198083663681, "learning_rate": 5.216311994559744e-05, "loss": 0.389, "step": 683 }, { "epoch": 2.3099599915771742, "grad_norm": 0.20812621009650192, "learning_rate": 5.207321144628628e-05, "loss": 0.3865, "step": 684 }, { "epoch": 2.3133291219204044, "grad_norm": 0.21426999240641148, "learning_rate": 5.198323578647813e-05, "loss": 0.3867, "step": 685 }, { "epoch": 2.3166982522636346, "grad_norm": 0.213657425755296, "learning_rate": 5.18931934666869e-05, "loss": 0.3922, "step": 686 }, { "epoch": 2.3200673826068647, "grad_norm": 0.17137164943244815, "learning_rate": 5.180308498779728e-05, "loss": 0.3789, "step": 687 }, { "epoch": 2.323436512950095, "grad_norm": 0.18022826820320403, "learning_rate": 5.171291085106202e-05, "loss": 0.3815, "step": 688 }, { "epoch": 2.326805643293325, "grad_norm": 0.1755115364994259, "learning_rate": 5.162267155809908e-05, "loss": 0.389, "step": 689 }, { "epoch": 2.3301747736365552, "grad_norm": 0.2011673377143987, "learning_rate": 5.153236761088888e-05, "loss": 0.3894, "step": 690 }, { "epoch": 2.3335439039797854, "grad_norm": 0.2305809255417625, "learning_rate": 5.14419995117715e-05, "loss": 0.3811, "step": 691 }, { "epoch": 2.3369130343230156, "grad_norm": 0.2115835801437973, "learning_rate": 5.135156776344389e-05, "loss": 0.3892, "step": 692 }, { "epoch": 2.3402821646662453, "grad_norm": 0.19470845993737926, "learning_rate": 5.126107286895702e-05, "loss": 0.3832, "step": 693 }, { "epoch": 2.343651295009476, "grad_norm": 0.16438102517886552, "learning_rate": 5.117051533171321e-05, "loss": 0.3863, "step": 694 }, { "epoch": 2.3470204253527056, "grad_norm": 0.17475480058915455, "learning_rate": 5.1079895655463177e-05, "loss": 0.3859, "step": 695 }, { "epoch": 2.350389555695936, "grad_norm": 0.18741810484417695, "learning_rate": 5.098921434430333e-05, "loss": 0.3825, "step": 696 }, { "epoch": 2.353758686039166, "grad_norm": 0.1687881382681767, "learning_rate": 5.0898471902672917e-05, "loss": 0.3758, "step": 697 }, { "epoch": 2.357127816382396, "grad_norm": 0.18436298872908952, "learning_rate": 5.080766883535129e-05, "loss": 0.3852, "step": 698 }, { "epoch": 2.3604969467256263, "grad_norm": 0.19845837669577285, "learning_rate": 5.0716805647455006e-05, "loss": 0.3854, "step": 699 }, { "epoch": 2.3638660770688564, "grad_norm": 0.18343761135804904, "learning_rate": 5.062588284443505e-05, "loss": 0.3825, "step": 700 }, { "epoch": 2.3672352074120866, "grad_norm": 0.13923107512819735, "learning_rate": 5.053490093207408e-05, "loss": 0.3797, "step": 701 }, { "epoch": 2.3706043377553168, "grad_norm": 0.1783129344294203, "learning_rate": 5.0443860416483536e-05, "loss": 0.3813, "step": 702 }, { "epoch": 2.373973468098547, "grad_norm": 0.2047126526455967, "learning_rate": 5.0352761804100835e-05, "loss": 0.3869, "step": 703 }, { "epoch": 2.377342598441777, "grad_norm": 0.18677317936073162, "learning_rate": 5.026160560168661e-05, "loss": 0.3829, "step": 704 }, { "epoch": 2.3807117287850073, "grad_norm": 0.15858411985283818, "learning_rate": 5.0170392316321826e-05, "loss": 0.3906, "step": 705 }, { "epoch": 2.3840808591282374, "grad_norm": 0.1542922309469812, "learning_rate": 5.0079122455405014e-05, "loss": 0.3898, "step": 706 }, { "epoch": 2.3874499894714676, "grad_norm": 0.16034757146153225, "learning_rate": 4.9987796526649394e-05, "loss": 0.3856, "step": 707 }, { "epoch": 2.3908191198146977, "grad_norm": 0.17396513204876746, "learning_rate": 4.989641503808011e-05, "loss": 0.3845, "step": 708 }, { "epoch": 2.394188250157928, "grad_norm": 0.14385199298465493, "learning_rate": 4.9804978498031326e-05, "loss": 0.383, "step": 709 }, { "epoch": 2.397557380501158, "grad_norm": 0.1424278412585639, "learning_rate": 4.971348741514349e-05, "loss": 0.3923, "step": 710 }, { "epoch": 2.4009265108443882, "grad_norm": 0.18492577887926495, "learning_rate": 4.962194229836045e-05, "loss": 0.3841, "step": 711 }, { "epoch": 2.4042956411876184, "grad_norm": 0.1732020596072231, "learning_rate": 4.95303436569266e-05, "loss": 0.3915, "step": 712 }, { "epoch": 2.4076647715308486, "grad_norm": 0.12301305622548196, "learning_rate": 4.943869200038413e-05, "loss": 0.384, "step": 713 }, { "epoch": 2.4110339018740787, "grad_norm": 0.18053993824097098, "learning_rate": 4.934698783857011e-05, "loss": 0.3817, "step": 714 }, { "epoch": 2.414403032217309, "grad_norm": 0.21725687137817615, "learning_rate": 4.9255231681613674e-05, "loss": 0.3887, "step": 715 }, { "epoch": 2.417772162560539, "grad_norm": 0.17070860183839026, "learning_rate": 4.91634240399332e-05, "loss": 0.3842, "step": 716 }, { "epoch": 2.4211412929037692, "grad_norm": 0.16062080472612222, "learning_rate": 4.907156542423351e-05, "loss": 0.3753, "step": 717 }, { "epoch": 2.4245104232469994, "grad_norm": 0.16452143222682503, "learning_rate": 4.8979656345502904e-05, "loss": 0.3819, "step": 718 }, { "epoch": 2.4278795535902296, "grad_norm": 0.17121464354448115, "learning_rate": 4.888769731501047e-05, "loss": 0.3829, "step": 719 }, { "epoch": 2.4312486839334597, "grad_norm": 0.1588530781256576, "learning_rate": 4.8795688844303114e-05, "loss": 0.3872, "step": 720 }, { "epoch": 2.43461781427669, "grad_norm": 0.15259487087295576, "learning_rate": 4.870363144520279e-05, "loss": 0.3878, "step": 721 }, { "epoch": 2.43798694461992, "grad_norm": 0.15808052014003177, "learning_rate": 4.861152562980362e-05, "loss": 0.3827, "step": 722 }, { "epoch": 2.44135607496315, "grad_norm": 0.18095527833139824, "learning_rate": 4.851937191046906e-05, "loss": 0.3828, "step": 723 }, { "epoch": 2.4447252053063804, "grad_norm": 0.17700515235134065, "learning_rate": 4.8427170799829055e-05, "loss": 0.3849, "step": 724 }, { "epoch": 2.4480943356496105, "grad_norm": 0.15108262997817984, "learning_rate": 4.833492281077717e-05, "loss": 0.3827, "step": 725 }, { "epoch": 2.4514634659928407, "grad_norm": 0.14610122044801815, "learning_rate": 4.824262845646771e-05, "loss": 0.3891, "step": 726 }, { "epoch": 2.454832596336071, "grad_norm": 0.17949690552168968, "learning_rate": 4.815028825031295e-05, "loss": 0.3824, "step": 727 }, { "epoch": 2.458201726679301, "grad_norm": 0.17860414349949053, "learning_rate": 4.805790270598021e-05, "loss": 0.3859, "step": 728 }, { "epoch": 2.461570857022531, "grad_norm": 0.15714664302158635, "learning_rate": 4.796547233738901e-05, "loss": 0.3805, "step": 729 }, { "epoch": 2.4649399873657614, "grad_norm": 0.13409742518350323, "learning_rate": 4.787299765870822e-05, "loss": 0.3894, "step": 730 }, { "epoch": 2.4683091177089915, "grad_norm": 0.1375698590454868, "learning_rate": 4.77804791843532e-05, "loss": 0.3885, "step": 731 }, { "epoch": 2.4716782480522217, "grad_norm": 0.1382618240475382, "learning_rate": 4.768791742898292e-05, "loss": 0.3875, "step": 732 }, { "epoch": 2.475047378395452, "grad_norm": 0.1398622806337096, "learning_rate": 4.7595312907497135e-05, "loss": 0.3853, "step": 733 }, { "epoch": 2.4784165087386816, "grad_norm": 0.14539506330457003, "learning_rate": 4.7502666135033486e-05, "loss": 0.3935, "step": 734 }, { "epoch": 2.481785639081912, "grad_norm": 0.13109075183048932, "learning_rate": 4.7409977626964666e-05, "loss": 0.3848, "step": 735 }, { "epoch": 2.485154769425142, "grad_norm": 0.12988278807806955, "learning_rate": 4.731724789889547e-05, "loss": 0.3839, "step": 736 }, { "epoch": 2.4885238997683725, "grad_norm": 0.1578289932884262, "learning_rate": 4.722447746666008e-05, "loss": 0.3836, "step": 737 }, { "epoch": 2.4918930301116022, "grad_norm": 0.1696600549846316, "learning_rate": 4.7131666846319036e-05, "loss": 0.3825, "step": 738 }, { "epoch": 2.495262160454833, "grad_norm": 0.13151686953984587, "learning_rate": 4.7038816554156484e-05, "loss": 0.3879, "step": 739 }, { "epoch": 2.4986312907980626, "grad_norm": 0.19638702203051203, "learning_rate": 4.694592710667723e-05, "loss": 0.3873, "step": 740 }, { "epoch": 2.502000421141293, "grad_norm": 0.18899466534966777, "learning_rate": 4.6852999020603864e-05, "loss": 0.3808, "step": 741 }, { "epoch": 2.505369551484523, "grad_norm": 0.12219071702355794, "learning_rate": 4.676003281287397e-05, "loss": 0.3876, "step": 742 }, { "epoch": 2.508738681827753, "grad_norm": 0.18236706911247189, "learning_rate": 4.6667029000637164e-05, "loss": 0.3846, "step": 743 }, { "epoch": 2.5121078121709832, "grad_norm": 0.1684130303158305, "learning_rate": 4.657398810125225e-05, "loss": 0.3888, "step": 744 }, { "epoch": 2.5154769425142134, "grad_norm": 0.16891778570455948, "learning_rate": 4.648091063228435e-05, "loss": 0.3878, "step": 745 }, { "epoch": 2.5188460728574436, "grad_norm": 0.16123369621023537, "learning_rate": 4.638779711150198e-05, "loss": 0.3888, "step": 746 }, { "epoch": 2.5222152032006737, "grad_norm": 0.13513366343949626, "learning_rate": 4.629464805687426e-05, "loss": 0.3826, "step": 747 }, { "epoch": 2.525584333543904, "grad_norm": 0.1460461212872677, "learning_rate": 4.620146398656792e-05, "loss": 0.3841, "step": 748 }, { "epoch": 2.528953463887134, "grad_norm": 0.16497117181141158, "learning_rate": 4.610824541894452e-05, "loss": 0.3842, "step": 749 }, { "epoch": 2.532322594230364, "grad_norm": 0.16290788207612428, "learning_rate": 4.601499287255748e-05, "loss": 0.3885, "step": 750 }, { "epoch": 2.5356917245735944, "grad_norm": 0.14489151093892186, "learning_rate": 4.592170686614926e-05, "loss": 0.3909, "step": 751 }, { "epoch": 2.5390608549168245, "grad_norm": 0.1464122207528577, "learning_rate": 4.582838791864846e-05, "loss": 0.3864, "step": 752 }, { "epoch": 2.5424299852600547, "grad_norm": 0.1543922436683134, "learning_rate": 4.5735036549166907e-05, "loss": 0.3781, "step": 753 }, { "epoch": 2.545799115603285, "grad_norm": 0.1511363443793848, "learning_rate": 4.5641653276996774e-05, "loss": 0.388, "step": 754 }, { "epoch": 2.549168245946515, "grad_norm": 0.14775900613642287, "learning_rate": 4.5548238621607735e-05, "loss": 0.3829, "step": 755 }, { "epoch": 2.552537376289745, "grad_norm": 0.1609040357156897, "learning_rate": 4.5454793102644006e-05, "loss": 0.3913, "step": 756 }, { "epoch": 2.5559065066329754, "grad_norm": 0.17452716126040962, "learning_rate": 4.5361317239921515e-05, "loss": 0.387, "step": 757 }, { "epoch": 2.5592756369762055, "grad_norm": 0.15479208730353294, "learning_rate": 4.5267811553424945e-05, "loss": 0.3794, "step": 758 }, { "epoch": 2.5626447673194357, "grad_norm": 0.16977092756406884, "learning_rate": 4.517427656330496e-05, "loss": 0.3813, "step": 759 }, { "epoch": 2.566013897662666, "grad_norm": 0.15943557512689435, "learning_rate": 4.5080712789875154e-05, "loss": 0.3886, "step": 760 }, { "epoch": 2.569383028005896, "grad_norm": 0.15146661036703893, "learning_rate": 4.498712075360929e-05, "loss": 0.3779, "step": 761 }, { "epoch": 2.572752158349126, "grad_norm": 0.1583016214192411, "learning_rate": 4.489350097513829e-05, "loss": 0.3861, "step": 762 }, { "epoch": 2.5761212886923563, "grad_norm": 0.18203713661130738, "learning_rate": 4.479985397524748e-05, "loss": 0.3872, "step": 763 }, { "epoch": 2.5794904190355865, "grad_norm": 0.1411770309939346, "learning_rate": 4.470618027487354e-05, "loss": 0.3833, "step": 764 }, { "epoch": 2.5828595493788167, "grad_norm": 0.15778048291503943, "learning_rate": 4.4612480395101736e-05, "loss": 0.3835, "step": 765 }, { "epoch": 2.586228679722047, "grad_norm": 0.20283325612723238, "learning_rate": 4.451875485716292e-05, "loss": 0.3804, "step": 766 }, { "epoch": 2.589597810065277, "grad_norm": 0.15957667387644875, "learning_rate": 4.44250041824307e-05, "loss": 0.3759, "step": 767 }, { "epoch": 2.592966940408507, "grad_norm": 0.15580437360078891, "learning_rate": 4.4331228892418473e-05, "loss": 0.3869, "step": 768 }, { "epoch": 2.5963360707517373, "grad_norm": 0.16733864762153852, "learning_rate": 4.4237429508776645e-05, "loss": 0.3901, "step": 769 }, { "epoch": 2.5997052010949675, "grad_norm": 0.16840382892762462, "learning_rate": 4.414360655328957e-05, "loss": 0.3887, "step": 770 }, { "epoch": 2.6030743314381977, "grad_norm": 0.16500477542253614, "learning_rate": 4.4049760547872786e-05, "loss": 0.3821, "step": 771 }, { "epoch": 2.606443461781428, "grad_norm": 0.17637661287184536, "learning_rate": 4.395589201457e-05, "loss": 0.3901, "step": 772 }, { "epoch": 2.6098125921246575, "grad_norm": 0.1426864712324038, "learning_rate": 4.386200147555027e-05, "loss": 0.3822, "step": 773 }, { "epoch": 2.613181722467888, "grad_norm": 0.1359883054124575, "learning_rate": 4.376808945310505e-05, "loss": 0.3907, "step": 774 }, { "epoch": 2.616550852811118, "grad_norm": 0.15390613245324686, "learning_rate": 4.3674156469645335e-05, "loss": 0.3844, "step": 775 }, { "epoch": 2.6199199831543485, "grad_norm": 0.12544051069791048, "learning_rate": 4.358020304769867e-05, "loss": 0.3848, "step": 776 }, { "epoch": 2.623289113497578, "grad_norm": 0.12982821849005882, "learning_rate": 4.348622970990634e-05, "loss": 0.386, "step": 777 }, { "epoch": 2.626658243840809, "grad_norm": 0.15120996993879657, "learning_rate": 4.339223697902037e-05, "loss": 0.3809, "step": 778 }, { "epoch": 2.6300273741840385, "grad_norm": 0.13233029817309008, "learning_rate": 4.329822537790073e-05, "loss": 0.3841, "step": 779 }, { "epoch": 2.633396504527269, "grad_norm": 0.14136223246926025, "learning_rate": 4.320419542951228e-05, "loss": 0.3838, "step": 780 }, { "epoch": 2.636765634870499, "grad_norm": 0.1228901057783663, "learning_rate": 4.3110147656922034e-05, "loss": 0.3802, "step": 781 }, { "epoch": 2.6401347652137295, "grad_norm": 0.13251524939594994, "learning_rate": 4.3016082583296067e-05, "loss": 0.378, "step": 782 }, { "epoch": 2.643503895556959, "grad_norm": 0.13001677701359055, "learning_rate": 4.292200073189676e-05, "loss": 0.3841, "step": 783 }, { "epoch": 2.6468730259001894, "grad_norm": 0.15991064871524435, "learning_rate": 4.2827902626079784e-05, "loss": 0.3875, "step": 784 }, { "epoch": 2.6502421562434195, "grad_norm": 0.12111670308432425, "learning_rate": 4.2733788789291275e-05, "loss": 0.3873, "step": 785 }, { "epoch": 2.6536112865866497, "grad_norm": 0.1593860904845142, "learning_rate": 4.263965974506483e-05, "loss": 0.3864, "step": 786 }, { "epoch": 2.65698041692988, "grad_norm": 0.16167383614529757, "learning_rate": 4.254551601701866e-05, "loss": 0.3845, "step": 787 }, { "epoch": 2.66034954727311, "grad_norm": 0.13801503703615994, "learning_rate": 4.2451358128852654e-05, "loss": 0.3876, "step": 788 }, { "epoch": 2.66371867761634, "grad_norm": 0.13674433021590243, "learning_rate": 4.23571866043455e-05, "loss": 0.3836, "step": 789 }, { "epoch": 2.6670878079595703, "grad_norm": 0.1567228984572654, "learning_rate": 4.22630019673517e-05, "loss": 0.3819, "step": 790 }, { "epoch": 2.6704569383028005, "grad_norm": 0.13292233430502193, "learning_rate": 4.216880474179871e-05, "loss": 0.3772, "step": 791 }, { "epoch": 2.6738260686460307, "grad_norm": 0.14610126476091434, "learning_rate": 4.207459545168405e-05, "loss": 0.391, "step": 792 }, { "epoch": 2.677195198989261, "grad_norm": 0.1295036399986597, "learning_rate": 4.198037462107228e-05, "loss": 0.39, "step": 793 }, { "epoch": 2.680564329332491, "grad_norm": 0.14286486693120076, "learning_rate": 4.188614277409224e-05, "loss": 0.3824, "step": 794 }, { "epoch": 2.683933459675721, "grad_norm": 0.1395089402065071, "learning_rate": 4.179190043493397e-05, "loss": 0.3893, "step": 795 }, { "epoch": 2.6873025900189513, "grad_norm": 0.1312675673324047, "learning_rate": 4.169764812784594e-05, "loss": 0.3839, "step": 796 }, { "epoch": 2.6906717203621815, "grad_norm": 0.15056150493927153, "learning_rate": 4.1603386377132045e-05, "loss": 0.3766, "step": 797 }, { "epoch": 2.6940408507054117, "grad_norm": 0.15234002339266034, "learning_rate": 4.1509115707148695e-05, "loss": 0.3875, "step": 798 }, { "epoch": 2.697409981048642, "grad_norm": 0.14172473902716337, "learning_rate": 4.1414836642301954e-05, "loss": 0.3835, "step": 799 }, { "epoch": 2.700779111391872, "grad_norm": 0.1244063349961557, "learning_rate": 4.132054970704454e-05, "loss": 0.384, "step": 800 }, { "epoch": 2.704148241735102, "grad_norm": 0.13151454461470574, "learning_rate": 4.122625542587301e-05, "loss": 0.3814, "step": 801 }, { "epoch": 2.7075173720783323, "grad_norm": 0.13472018853386267, "learning_rate": 4.1131954323324734e-05, "loss": 0.3832, "step": 802 }, { "epoch": 2.7108865024215625, "grad_norm": 0.14391402812007115, "learning_rate": 4.103764692397504e-05, "loss": 0.3907, "step": 803 }, { "epoch": 2.7142556327647926, "grad_norm": 0.1204377593661656, "learning_rate": 4.094333375243428e-05, "loss": 0.3779, "step": 804 }, { "epoch": 2.717624763108023, "grad_norm": 0.1345036853381592, "learning_rate": 4.084901533334495e-05, "loss": 0.3837, "step": 805 }, { "epoch": 2.720993893451253, "grad_norm": 0.151432229349483, "learning_rate": 4.075469219137868e-05, "loss": 0.3867, "step": 806 }, { "epoch": 2.724363023794483, "grad_norm": 0.13412559508113278, "learning_rate": 4.066036485123344e-05, "loss": 0.3809, "step": 807 }, { "epoch": 2.7277321541377133, "grad_norm": 0.1407083620047968, "learning_rate": 4.056603383763049e-05, "loss": 0.3893, "step": 808 }, { "epoch": 2.7311012844809435, "grad_norm": 0.1304023157361848, "learning_rate": 4.0471699675311564e-05, "loss": 0.3873, "step": 809 }, { "epoch": 2.7344704148241736, "grad_norm": 0.13069329962842927, "learning_rate": 4.0377362889035875e-05, "loss": 0.3845, "step": 810 }, { "epoch": 2.737839545167404, "grad_norm": 0.134836479542485, "learning_rate": 4.0283024003577284e-05, "loss": 0.3806, "step": 811 }, { "epoch": 2.741208675510634, "grad_norm": 0.12753418534583713, "learning_rate": 4.0188683543721295e-05, "loss": 0.3797, "step": 812 }, { "epoch": 2.744577805853864, "grad_norm": 0.13228859664320883, "learning_rate": 4.009434203426215e-05, "loss": 0.3856, "step": 813 }, { "epoch": 2.747946936197094, "grad_norm": 0.14892311316819778, "learning_rate": 4e-05, "loss": 0.3838, "step": 814 }, { "epoch": 2.7513160665403245, "grad_norm": 0.13386473278676905, "learning_rate": 3.9905657965737854e-05, "loss": 0.3829, "step": 815 }, { "epoch": 2.754685196883554, "grad_norm": 0.14219980607382138, "learning_rate": 3.981131645627872e-05, "loss": 0.3819, "step": 816 }, { "epoch": 2.758054327226785, "grad_norm": 0.1388449346696737, "learning_rate": 3.971697599642273e-05, "loss": 0.3834, "step": 817 }, { "epoch": 2.7614234575700145, "grad_norm": 0.12977851410941868, "learning_rate": 3.9622637110964125e-05, "loss": 0.3831, "step": 818 }, { "epoch": 2.764792587913245, "grad_norm": 0.13978459681010671, "learning_rate": 3.9528300324688456e-05, "loss": 0.383, "step": 819 }, { "epoch": 2.768161718256475, "grad_norm": 0.13570459222433323, "learning_rate": 3.943396616236953e-05, "loss": 0.3851, "step": 820 }, { "epoch": 2.7715308485997054, "grad_norm": 0.1347307304770039, "learning_rate": 3.933963514876657e-05, "loss": 0.3872, "step": 821 }, { "epoch": 2.774899978942935, "grad_norm": 0.14708015270111557, "learning_rate": 3.9245307808621325e-05, "loss": 0.385, "step": 822 }, { "epoch": 2.7782691092861658, "grad_norm": 0.12260128131766068, "learning_rate": 3.915098466665506e-05, "loss": 0.3855, "step": 823 }, { "epoch": 2.7816382396293955, "grad_norm": 0.12292579106408079, "learning_rate": 3.905666624756573e-05, "loss": 0.3869, "step": 824 }, { "epoch": 2.7850073699726257, "grad_norm": 0.14492807851132256, "learning_rate": 3.8962353076024984e-05, "loss": 0.3821, "step": 825 }, { "epoch": 2.788376500315856, "grad_norm": 0.14449210295060477, "learning_rate": 3.886804567667528e-05, "loss": 0.3808, "step": 826 }, { "epoch": 2.791745630659086, "grad_norm": 0.12971902156372891, "learning_rate": 3.8773744574127e-05, "loss": 0.3878, "step": 827 }, { "epoch": 2.795114761002316, "grad_norm": 0.14230416274316593, "learning_rate": 3.867945029295546e-05, "loss": 0.3814, "step": 828 }, { "epoch": 2.7984838913455463, "grad_norm": 0.1224339186137515, "learning_rate": 3.858516335769806e-05, "loss": 0.3819, "step": 829 }, { "epoch": 2.8018530216887765, "grad_norm": 0.16733669157218356, "learning_rate": 3.8490884292851325e-05, "loss": 0.3825, "step": 830 }, { "epoch": 2.8052221520320066, "grad_norm": 0.13398557625334945, "learning_rate": 3.839661362286797e-05, "loss": 0.3785, "step": 831 }, { "epoch": 2.808591282375237, "grad_norm": 0.14930405489150408, "learning_rate": 3.830235187215408e-05, "loss": 0.3806, "step": 832 }, { "epoch": 2.811960412718467, "grad_norm": 0.14534442897149916, "learning_rate": 3.820809956506604e-05, "loss": 0.3869, "step": 833 }, { "epoch": 2.815329543061697, "grad_norm": 0.14294161233646072, "learning_rate": 3.8113857225907783e-05, "loss": 0.3834, "step": 834 }, { "epoch": 2.8186986734049273, "grad_norm": 0.1304818403113972, "learning_rate": 3.801962537892773e-05, "loss": 0.3917, "step": 835 }, { "epoch": 2.8220678037481575, "grad_norm": 0.16153213081562928, "learning_rate": 3.792540454831596e-05, "loss": 0.3877, "step": 836 }, { "epoch": 2.8254369340913876, "grad_norm": 0.12199316427929723, "learning_rate": 3.7831195258201295e-05, "loss": 0.3836, "step": 837 }, { "epoch": 2.828806064434618, "grad_norm": 0.14527010576989632, "learning_rate": 3.7736998032648305e-05, "loss": 0.3827, "step": 838 }, { "epoch": 2.832175194777848, "grad_norm": 0.15971096124557288, "learning_rate": 3.7642813395654504e-05, "loss": 0.3801, "step": 839 }, { "epoch": 2.835544325121078, "grad_norm": 0.12345484787366505, "learning_rate": 3.754864187114736e-05, "loss": 0.3855, "step": 840 }, { "epoch": 2.8389134554643083, "grad_norm": 0.13837193216510435, "learning_rate": 3.745448398298135e-05, "loss": 0.3828, "step": 841 }, { "epoch": 2.8422825858075385, "grad_norm": 0.1545419687841436, "learning_rate": 3.736034025493519e-05, "loss": 0.3821, "step": 842 }, { "epoch": 2.8456517161507686, "grad_norm": 0.12965815907805744, "learning_rate": 3.726621121070873e-05, "loss": 0.3885, "step": 843 }, { "epoch": 2.849020846493999, "grad_norm": 0.14437205080738458, "learning_rate": 3.717209737392022e-05, "loss": 0.3757, "step": 844 }, { "epoch": 2.852389976837229, "grad_norm": 0.13760242198629977, "learning_rate": 3.707799926810326e-05, "loss": 0.3841, "step": 845 }, { "epoch": 2.855759107180459, "grad_norm": 0.16923959033588096, "learning_rate": 3.698391741670394e-05, "loss": 0.3837, "step": 846 }, { "epoch": 2.8591282375236893, "grad_norm": 0.1483758913428858, "learning_rate": 3.688985234307798e-05, "loss": 0.3854, "step": 847 }, { "epoch": 2.8624973678669194, "grad_norm": 0.1409446277936609, "learning_rate": 3.679580457048772e-05, "loss": 0.3865, "step": 848 }, { "epoch": 2.8658664982101496, "grad_norm": 0.13848959127311186, "learning_rate": 3.6701774622099286e-05, "loss": 0.3847, "step": 849 }, { "epoch": 2.8692356285533798, "grad_norm": 0.13440901679008035, "learning_rate": 3.660776302097965e-05, "loss": 0.3809, "step": 850 }, { "epoch": 2.87260475889661, "grad_norm": 0.13528288220600784, "learning_rate": 3.6513770290093674e-05, "loss": 0.3844, "step": 851 }, { "epoch": 2.87597388923984, "grad_norm": 0.11930769920642463, "learning_rate": 3.641979695230135e-05, "loss": 0.3853, "step": 852 }, { "epoch": 2.8793430195830703, "grad_norm": 0.1302640412084013, "learning_rate": 3.632584353035467e-05, "loss": 0.3834, "step": 853 }, { "epoch": 2.8827121499263004, "grad_norm": 0.12093299855424389, "learning_rate": 3.6231910546894956e-05, "loss": 0.3851, "step": 854 }, { "epoch": 2.8860812802695306, "grad_norm": 0.1342477899550942, "learning_rate": 3.613799852444975e-05, "loss": 0.3883, "step": 855 }, { "epoch": 2.8894504106127608, "grad_norm": 0.11778883888529185, "learning_rate": 3.6044107985430015e-05, "loss": 0.3823, "step": 856 }, { "epoch": 2.8928195409559905, "grad_norm": 0.12271043639462616, "learning_rate": 3.595023945212723e-05, "loss": 0.3816, "step": 857 }, { "epoch": 2.896188671299221, "grad_norm": 0.12188701757865371, "learning_rate": 3.585639344671043e-05, "loss": 0.3863, "step": 858 }, { "epoch": 2.899557801642451, "grad_norm": 0.12511895990769892, "learning_rate": 3.576257049122336e-05, "loss": 0.3829, "step": 859 }, { "epoch": 2.9029269319856814, "grad_norm": 0.12002503720509249, "learning_rate": 3.5668771107581526e-05, "loss": 0.377, "step": 860 }, { "epoch": 2.906296062328911, "grad_norm": 0.12993074211163566, "learning_rate": 3.5574995817569317e-05, "loss": 0.3755, "step": 861 }, { "epoch": 2.9096651926721417, "grad_norm": 0.10532634808065627, "learning_rate": 3.5481245142837095e-05, "loss": 0.3869, "step": 862 }, { "epoch": 2.9130343230153715, "grad_norm": 0.1296191433786778, "learning_rate": 3.5387519604898264e-05, "loss": 0.382, "step": 863 }, { "epoch": 2.916403453358602, "grad_norm": 0.10734185230078218, "learning_rate": 3.5293819725126464e-05, "loss": 0.3849, "step": 864 }, { "epoch": 2.919772583701832, "grad_norm": 0.1077939586524133, "learning_rate": 3.520014602475252e-05, "loss": 0.3828, "step": 865 }, { "epoch": 2.923141714045062, "grad_norm": 0.12191898052299041, "learning_rate": 3.5106499024861715e-05, "loss": 0.3809, "step": 866 }, { "epoch": 2.926510844388292, "grad_norm": 0.12081068176606237, "learning_rate": 3.501287924639074e-05, "loss": 0.3892, "step": 867 }, { "epoch": 2.9298799747315223, "grad_norm": 0.13361270574401832, "learning_rate": 3.491928721012485e-05, "loss": 0.3818, "step": 868 }, { "epoch": 2.9332491050747524, "grad_norm": 0.12126810590661805, "learning_rate": 3.482572343669506e-05, "loss": 0.3834, "step": 869 }, { "epoch": 2.9366182354179826, "grad_norm": 0.1258581729968798, "learning_rate": 3.4732188446575055e-05, "loss": 0.3822, "step": 870 }, { "epoch": 2.939987365761213, "grad_norm": 0.11858345315742196, "learning_rate": 3.4638682760078505e-05, "loss": 0.3922, "step": 871 }, { "epoch": 2.943356496104443, "grad_norm": 0.11372309799338015, "learning_rate": 3.454520689735602e-05, "loss": 0.3824, "step": 872 }, { "epoch": 2.946725626447673, "grad_norm": 0.14113850726940133, "learning_rate": 3.445176137839227e-05, "loss": 0.3796, "step": 873 }, { "epoch": 2.9500947567909033, "grad_norm": 0.11612037625898579, "learning_rate": 3.435834672300324e-05, "loss": 0.3873, "step": 874 }, { "epoch": 2.9534638871341334, "grad_norm": 0.12263857158882245, "learning_rate": 3.426496345083309e-05, "loss": 0.3807, "step": 875 }, { "epoch": 2.9568330174773636, "grad_norm": 0.13787793243918434, "learning_rate": 3.417161208135155e-05, "loss": 0.3865, "step": 876 }, { "epoch": 2.9602021478205938, "grad_norm": 0.12537808395950803, "learning_rate": 3.407829313385075e-05, "loss": 0.3887, "step": 877 }, { "epoch": 2.963571278163824, "grad_norm": 0.1233586121783003, "learning_rate": 3.398500712744254e-05, "loss": 0.3831, "step": 878 }, { "epoch": 2.966940408507054, "grad_norm": 0.127510517027595, "learning_rate": 3.38917545810555e-05, "loss": 0.3855, "step": 879 }, { "epoch": 2.9703095388502843, "grad_norm": 0.12958054002462321, "learning_rate": 3.379853601343209e-05, "loss": 0.3867, "step": 880 }, { "epoch": 2.9736786691935144, "grad_norm": 0.11339310625974686, "learning_rate": 3.3705351943125755e-05, "loss": 0.381, "step": 881 }, { "epoch": 2.9770477995367446, "grad_norm": 0.1441132631100554, "learning_rate": 3.361220288849804e-05, "loss": 0.3853, "step": 882 }, { "epoch": 2.9804169298799748, "grad_norm": 0.12590761879480403, "learning_rate": 3.351908936771566e-05, "loss": 0.3821, "step": 883 }, { "epoch": 2.983786060223205, "grad_norm": 0.12580062137496578, "learning_rate": 3.342601189874777e-05, "loss": 0.3912, "step": 884 }, { "epoch": 2.987155190566435, "grad_norm": 0.1375861040816144, "learning_rate": 3.3332970999362836e-05, "loss": 0.3843, "step": 885 }, { "epoch": 2.9905243209096652, "grad_norm": 0.11745115999108842, "learning_rate": 3.323996718712605e-05, "loss": 0.3793, "step": 886 }, { "epoch": 2.9938934512528954, "grad_norm": 0.1154957553487754, "learning_rate": 3.3147000979396156e-05, "loss": 0.386, "step": 887 }, { "epoch": 2.9972625815961256, "grad_norm": 0.14419491852541183, "learning_rate": 3.305407289332279e-05, "loss": 0.387, "step": 888 }, { "epoch": 3.00336913034323, "grad_norm": 0.17453356323499444, "learning_rate": 3.296118344584352e-05, "loss": 0.3658, "step": 889 }, { "epoch": 3.0067382606864603, "grad_norm": 0.19958889229278365, "learning_rate": 3.2868333153680964e-05, "loss": 0.3563, "step": 890 }, { "epoch": 3.0101073910296905, "grad_norm": 0.14823110731719627, "learning_rate": 3.277552253333993e-05, "loss": 0.3592, "step": 891 }, { "epoch": 3.0134765213729207, "grad_norm": 0.15078557386759514, "learning_rate": 3.2682752101104536e-05, "loss": 0.3648, "step": 892 }, { "epoch": 3.016845651716151, "grad_norm": 0.15261085897213972, "learning_rate": 3.259002237303535e-05, "loss": 0.365, "step": 893 }, { "epoch": 3.020214782059381, "grad_norm": 0.12773087702299238, "learning_rate": 3.249733386496653e-05, "loss": 0.359, "step": 894 }, { "epoch": 3.023583912402611, "grad_norm": 0.13787164527794113, "learning_rate": 3.2404687092502865e-05, "loss": 0.361, "step": 895 }, { "epoch": 3.0269530427458413, "grad_norm": 0.15396809320630023, "learning_rate": 3.231208257101709e-05, "loss": 0.3639, "step": 896 }, { "epoch": 3.0303221730890715, "grad_norm": 0.13565056548247828, "learning_rate": 3.221952081564682e-05, "loss": 0.3632, "step": 897 }, { "epoch": 3.0336913034323016, "grad_norm": 0.16070873287428322, "learning_rate": 3.212700234129179e-05, "loss": 0.3594, "step": 898 }, { "epoch": 3.037060433775532, "grad_norm": 0.14022297658804933, "learning_rate": 3.2034527662611e-05, "loss": 0.363, "step": 899 }, { "epoch": 3.040429564118762, "grad_norm": 0.14407779140042834, "learning_rate": 3.194209729401979e-05, "loss": 0.3612, "step": 900 }, { "epoch": 3.043798694461992, "grad_norm": 0.13752049086764745, "learning_rate": 3.184971174968705e-05, "loss": 0.3645, "step": 901 }, { "epoch": 3.0471678248052223, "grad_norm": 0.13823706755645496, "learning_rate": 3.175737154353231e-05, "loss": 0.3626, "step": 902 }, { "epoch": 3.0505369551484525, "grad_norm": 0.16264110826907188, "learning_rate": 3.166507718922285e-05, "loss": 0.3566, "step": 903 }, { "epoch": 3.0539060854916826, "grad_norm": 0.15511577954565434, "learning_rate": 3.157282920017096e-05, "loss": 0.361, "step": 904 }, { "epoch": 3.057275215834913, "grad_norm": 0.15232517037403773, "learning_rate": 3.1480628089530943e-05, "loss": 0.3662, "step": 905 }, { "epoch": 3.060644346178143, "grad_norm": 0.17112367414740937, "learning_rate": 3.1388474370196395e-05, "loss": 0.3638, "step": 906 }, { "epoch": 3.064013476521373, "grad_norm": 0.12748688705449465, "learning_rate": 3.129636855479723e-05, "loss": 0.3579, "step": 907 }, { "epoch": 3.0673826068646033, "grad_norm": 0.14714355107055627, "learning_rate": 3.12043111556969e-05, "loss": 0.3582, "step": 908 }, { "epoch": 3.070751737207833, "grad_norm": 0.13462631797401237, "learning_rate": 3.111230268498954e-05, "loss": 0.367, "step": 909 }, { "epoch": 3.074120867551063, "grad_norm": 0.1372418048121636, "learning_rate": 3.1020343654497096e-05, "loss": 0.3588, "step": 910 }, { "epoch": 3.0774899978942933, "grad_norm": 0.13072048530956415, "learning_rate": 3.0928434575766505e-05, "loss": 0.361, "step": 911 }, { "epoch": 3.0808591282375235, "grad_norm": 0.12852995212281998, "learning_rate": 3.083657596006681e-05, "loss": 0.3543, "step": 912 }, { "epoch": 3.0842282585807537, "grad_norm": 0.12589969103284174, "learning_rate": 3.0744768318386346e-05, "loss": 0.3573, "step": 913 }, { "epoch": 3.087597388923984, "grad_norm": 0.1042227599830766, "learning_rate": 3.065301216142991e-05, "loss": 0.3571, "step": 914 }, { "epoch": 3.090966519267214, "grad_norm": 0.12641784465437736, "learning_rate": 3.056130799961587e-05, "loss": 0.361, "step": 915 }, { "epoch": 3.094335649610444, "grad_norm": 0.1189011090318916, "learning_rate": 3.046965634307341e-05, "loss": 0.3653, "step": 916 }, { "epoch": 3.0977047799536743, "grad_norm": 0.11559017960748716, "learning_rate": 3.0378057701639575e-05, "loss": 0.371, "step": 917 }, { "epoch": 3.1010739102969045, "grad_norm": 0.1198695027252497, "learning_rate": 3.028651258485652e-05, "loss": 0.3667, "step": 918 }, { "epoch": 3.1044430406401347, "grad_norm": 0.11196979369755074, "learning_rate": 3.019502150196869e-05, "loss": 0.3575, "step": 919 }, { "epoch": 3.107812170983365, "grad_norm": 0.12042692289106809, "learning_rate": 3.010358496191991e-05, "loss": 0.3618, "step": 920 }, { "epoch": 3.111181301326595, "grad_norm": 0.1238521643735063, "learning_rate": 3.0012203473350616e-05, "loss": 0.3672, "step": 921 }, { "epoch": 3.114550431669825, "grad_norm": 0.11597672612469004, "learning_rate": 2.9920877544595002e-05, "loss": 0.3577, "step": 922 }, { "epoch": 3.1179195620130553, "grad_norm": 0.11363631100554263, "learning_rate": 2.982960768367818e-05, "loss": 0.3637, "step": 923 }, { "epoch": 3.1212886923562855, "grad_norm": 0.12223781700368476, "learning_rate": 2.9738394398313405e-05, "loss": 0.3575, "step": 924 }, { "epoch": 3.1246578226995156, "grad_norm": 0.11310391813366659, "learning_rate": 2.9647238195899168e-05, "loss": 0.3666, "step": 925 }, { "epoch": 3.128026953042746, "grad_norm": 0.12749001851980382, "learning_rate": 2.955613958351647e-05, "loss": 0.3577, "step": 926 }, { "epoch": 3.131396083385976, "grad_norm": 0.11106465012495607, "learning_rate": 2.946509906792593e-05, "loss": 0.3661, "step": 927 }, { "epoch": 3.134765213729206, "grad_norm": 0.13265615613597764, "learning_rate": 2.9374117155564957e-05, "loss": 0.3613, "step": 928 }, { "epoch": 3.1381343440724363, "grad_norm": 0.1062334645184232, "learning_rate": 2.928319435254501e-05, "loss": 0.3601, "step": 929 }, { "epoch": 3.1415034744156665, "grad_norm": 0.13654759521524176, "learning_rate": 2.919233116464872e-05, "loss": 0.357, "step": 930 }, { "epoch": 3.1448726047588966, "grad_norm": 0.12274484555896063, "learning_rate": 2.9101528097327093e-05, "loss": 0.3659, "step": 931 }, { "epoch": 3.148241735102127, "grad_norm": 0.11432950773248603, "learning_rate": 2.9010785655696698e-05, "loss": 0.3638, "step": 932 }, { "epoch": 3.151610865445357, "grad_norm": 0.11354842248203202, "learning_rate": 2.892010434453684e-05, "loss": 0.36, "step": 933 }, { "epoch": 3.154979995788587, "grad_norm": 0.12098639250864718, "learning_rate": 2.88294846682868e-05, "loss": 0.3591, "step": 934 }, { "epoch": 3.1583491261318173, "grad_norm": 0.11027079481756498, "learning_rate": 2.873892713104298e-05, "loss": 0.3595, "step": 935 }, { "epoch": 3.1617182564750475, "grad_norm": 0.12568594872253705, "learning_rate": 2.864843223655613e-05, "loss": 0.3678, "step": 936 }, { "epoch": 3.1650873868182776, "grad_norm": 0.11667961571614835, "learning_rate": 2.855800048822852e-05, "loss": 0.3608, "step": 937 }, { "epoch": 3.168456517161508, "grad_norm": 0.11058294572640527, "learning_rate": 2.8467632389111126e-05, "loss": 0.3683, "step": 938 }, { "epoch": 3.171825647504738, "grad_norm": 0.1187950796415824, "learning_rate": 2.837732844190094e-05, "loss": 0.3644, "step": 939 }, { "epoch": 3.175194777847968, "grad_norm": 0.10656085663558766, "learning_rate": 2.828708914893799e-05, "loss": 0.3671, "step": 940 }, { "epoch": 3.1785639081911983, "grad_norm": 0.10817099139196962, "learning_rate": 2.8196915012202728e-05, "loss": 0.3672, "step": 941 }, { "epoch": 3.1819330385344284, "grad_norm": 0.10075876050195509, "learning_rate": 2.8106806533313106e-05, "loss": 0.3631, "step": 942 }, { "epoch": 3.1853021688776586, "grad_norm": 0.11551691063136907, "learning_rate": 2.8016764213521875e-05, "loss": 0.3608, "step": 943 }, { "epoch": 3.1886712992208888, "grad_norm": 0.10092150997385874, "learning_rate": 2.7926788553713734e-05, "loss": 0.3652, "step": 944 }, { "epoch": 3.192040429564119, "grad_norm": 0.11020311291162539, "learning_rate": 2.783688005440256e-05, "loss": 0.3656, "step": 945 }, { "epoch": 3.195409559907349, "grad_norm": 0.10850184905841719, "learning_rate": 2.7747039215728667e-05, "loss": 0.3648, "step": 946 }, { "epoch": 3.1987786902505793, "grad_norm": 0.10954311066114457, "learning_rate": 2.7657266537455938e-05, "loss": 0.3651, "step": 947 }, { "epoch": 3.2021478205938094, "grad_norm": 0.10365234676829252, "learning_rate": 2.7567562518969155e-05, "loss": 0.3533, "step": 948 }, { "epoch": 3.2055169509370396, "grad_norm": 0.10204242463146666, "learning_rate": 2.7477927659271117e-05, "loss": 0.3622, "step": 949 }, { "epoch": 3.2088860812802693, "grad_norm": 0.10799341793471445, "learning_rate": 2.7388362456979906e-05, "loss": 0.3625, "step": 950 }, { "epoch": 3.2122552116235, "grad_norm": 0.11115544373524708, "learning_rate": 2.7298867410326155e-05, "loss": 0.3629, "step": 951 }, { "epoch": 3.2156243419667296, "grad_norm": 0.10949003369065348, "learning_rate": 2.7209443017150193e-05, "loss": 0.3635, "step": 952 }, { "epoch": 3.21899347230996, "grad_norm": 0.10963161775177817, "learning_rate": 2.712008977489936e-05, "loss": 0.3594, "step": 953 }, { "epoch": 3.22236260265319, "grad_norm": 0.11805544027584379, "learning_rate": 2.703080818062517e-05, "loss": 0.3635, "step": 954 }, { "epoch": 3.22573173299642, "grad_norm": 0.10196046146217858, "learning_rate": 2.694159873098058e-05, "loss": 0.3626, "step": 955 }, { "epoch": 3.2291008633396503, "grad_norm": 0.1120026689331707, "learning_rate": 2.6852461922217253e-05, "loss": 0.3649, "step": 956 }, { "epoch": 3.2324699936828805, "grad_norm": 0.10926346301227147, "learning_rate": 2.6763398250182714e-05, "loss": 0.3579, "step": 957 }, { "epoch": 3.2358391240261106, "grad_norm": 0.10913175373351278, "learning_rate": 2.66744082103177e-05, "loss": 0.3639, "step": 958 }, { "epoch": 3.239208254369341, "grad_norm": 0.10485736112258066, "learning_rate": 2.658549229765332e-05, "loss": 0.3592, "step": 959 }, { "epoch": 3.242577384712571, "grad_norm": 0.12101416878728995, "learning_rate": 2.6496651006808308e-05, "loss": 0.3574, "step": 960 }, { "epoch": 3.245946515055801, "grad_norm": 0.1071236277697119, "learning_rate": 2.6407884831986367e-05, "loss": 0.3627, "step": 961 }, { "epoch": 3.2493156453990313, "grad_norm": 0.11778875174805165, "learning_rate": 2.6319194266973256e-05, "loss": 0.365, "step": 962 }, { "epoch": 3.2526847757422614, "grad_norm": 0.12437906053481307, "learning_rate": 2.6230579805134203e-05, "loss": 0.3582, "step": 963 }, { "epoch": 3.2560539060854916, "grad_norm": 0.11016391828701566, "learning_rate": 2.614204193941107e-05, "loss": 0.3628, "step": 964 }, { "epoch": 3.2594230364287218, "grad_norm": 0.131288542140626, "learning_rate": 2.6053581162319606e-05, "loss": 0.3634, "step": 965 }, { "epoch": 3.262792166771952, "grad_norm": 0.10515921544500577, "learning_rate": 2.5965197965946783e-05, "loss": 0.3649, "step": 966 }, { "epoch": 3.266161297115182, "grad_norm": 0.12739731098762894, "learning_rate": 2.587689284194797e-05, "loss": 0.3703, "step": 967 }, { "epoch": 3.2695304274584123, "grad_norm": 0.10406377203116793, "learning_rate": 2.5788666281544258e-05, "loss": 0.3657, "step": 968 }, { "epoch": 3.2728995578016424, "grad_norm": 0.11191689402983139, "learning_rate": 2.5700518775519702e-05, "loss": 0.359, "step": 969 }, { "epoch": 3.2762686881448726, "grad_norm": 0.10680144927044027, "learning_rate": 2.561245081421857e-05, "loss": 0.3604, "step": 970 }, { "epoch": 3.2796378184881028, "grad_norm": 0.11505057898142523, "learning_rate": 2.5524462887542703e-05, "loss": 0.3599, "step": 971 }, { "epoch": 3.283006948831333, "grad_norm": 0.10674300454641518, "learning_rate": 2.5436555484948643e-05, "loss": 0.3625, "step": 972 }, { "epoch": 3.286376079174563, "grad_norm": 0.10772282874724956, "learning_rate": 2.534872909544509e-05, "loss": 0.3586, "step": 973 }, { "epoch": 3.2897452095177933, "grad_norm": 0.11061913724144044, "learning_rate": 2.5260984207590015e-05, "loss": 0.3695, "step": 974 }, { "epoch": 3.2931143398610234, "grad_norm": 0.11314868048581533, "learning_rate": 2.517332130948802e-05, "loss": 0.3597, "step": 975 }, { "epoch": 3.2964834702042536, "grad_norm": 0.10483488263899578, "learning_rate": 2.5085740888787662e-05, "loss": 0.3583, "step": 976 }, { "epoch": 3.2998526005474837, "grad_norm": 0.10912778564330813, "learning_rate": 2.4998243432678644e-05, "loss": 0.3601, "step": 977 }, { "epoch": 3.303221730890714, "grad_norm": 0.11466754101476578, "learning_rate": 2.4910829427889205e-05, "loss": 0.3643, "step": 978 }, { "epoch": 3.306590861233944, "grad_norm": 0.10733537636590312, "learning_rate": 2.4823499360683333e-05, "loss": 0.3651, "step": 979 }, { "epoch": 3.3099599915771742, "grad_norm": 0.1161393261879057, "learning_rate": 2.473625371685806e-05, "loss": 0.3599, "step": 980 }, { "epoch": 3.3133291219204044, "grad_norm": 0.0982571093572832, "learning_rate": 2.464909298174088e-05, "loss": 0.3526, "step": 981 }, { "epoch": 3.3166982522636346, "grad_norm": 0.1100159657444912, "learning_rate": 2.4562017640186847e-05, "loss": 0.3626, "step": 982 }, { "epoch": 3.3200673826068647, "grad_norm": 0.09926349760672294, "learning_rate": 2.4475028176576102e-05, "loss": 0.3677, "step": 983 }, { "epoch": 3.323436512950095, "grad_norm": 0.12050759797842048, "learning_rate": 2.4388125074810986e-05, "loss": 0.359, "step": 984 }, { "epoch": 3.326805643293325, "grad_norm": 0.09987805749588798, "learning_rate": 2.430130881831345e-05, "loss": 0.3618, "step": 985 }, { "epoch": 3.3301747736365552, "grad_norm": 0.1091783241310202, "learning_rate": 2.4214579890022373e-05, "loss": 0.3696, "step": 986 }, { "epoch": 3.3335439039797854, "grad_norm": 0.10898707191962656, "learning_rate": 2.41279387723908e-05, "loss": 0.3638, "step": 987 }, { "epoch": 3.3369130343230156, "grad_norm": 0.10558034784682291, "learning_rate": 2.404138594738335e-05, "loss": 0.357, "step": 988 }, { "epoch": 3.3402821646662453, "grad_norm": 0.10689449489731055, "learning_rate": 2.395492189647347e-05, "loss": 0.3594, "step": 989 }, { "epoch": 3.343651295009476, "grad_norm": 0.11118497131539316, "learning_rate": 2.386854710064075e-05, "loss": 0.3542, "step": 990 }, { "epoch": 3.3470204253527056, "grad_norm": 0.10782085280238568, "learning_rate": 2.3782262040368344e-05, "loss": 0.3608, "step": 991 }, { "epoch": 3.350389555695936, "grad_norm": 0.10697566924440428, "learning_rate": 2.369606719564015e-05, "loss": 0.3551, "step": 992 }, { "epoch": 3.353758686039166, "grad_norm": 0.09605638199170409, "learning_rate": 2.3609963045938288e-05, "loss": 0.3618, "step": 993 }, { "epoch": 3.357127816382396, "grad_norm": 0.10827169360976367, "learning_rate": 2.35239500702403e-05, "loss": 0.3565, "step": 994 }, { "epoch": 3.3604969467256263, "grad_norm": 0.10198375263244171, "learning_rate": 2.3438028747016586e-05, "loss": 0.3626, "step": 995 }, { "epoch": 3.3638660770688564, "grad_norm": 0.1159958447674676, "learning_rate": 2.3352199554227698e-05, "loss": 0.3629, "step": 996 }, { "epoch": 3.3672352074120866, "grad_norm": 0.10457139377595129, "learning_rate": 2.326646296932168e-05, "loss": 0.3638, "step": 997 }, { "epoch": 3.3706043377553168, "grad_norm": 0.10333006497152411, "learning_rate": 2.318081946923144e-05, "loss": 0.3612, "step": 998 }, { "epoch": 3.373973468098547, "grad_norm": 0.10461115888151253, "learning_rate": 2.3095269530372032e-05, "loss": 0.362, "step": 999 }, { "epoch": 3.377342598441777, "grad_norm": 0.10087292499347122, "learning_rate": 2.3009813628638085e-05, "loss": 0.3603, "step": 1000 }, { "epoch": 3.3807117287850073, "grad_norm": 0.09894098741998586, "learning_rate": 2.2924452239401153e-05, "loss": 0.3635, "step": 1001 }, { "epoch": 3.3840808591282374, "grad_norm": 0.10636129988239897, "learning_rate": 2.283918583750695e-05, "loss": 0.3589, "step": 1002 }, { "epoch": 3.3874499894714676, "grad_norm": 0.1087735124770059, "learning_rate": 2.2754014897272868e-05, "loss": 0.3603, "step": 1003 }, { "epoch": 3.3908191198146977, "grad_norm": 0.1045786633320159, "learning_rate": 2.266893989248527e-05, "loss": 0.3634, "step": 1004 }, { "epoch": 3.394188250157928, "grad_norm": 0.10630134016191294, "learning_rate": 2.258396129639679e-05, "loss": 0.3626, "step": 1005 }, { "epoch": 3.397557380501158, "grad_norm": 0.10814614823364664, "learning_rate": 2.2499079581723846e-05, "loss": 0.3682, "step": 1006 }, { "epoch": 3.4009265108443882, "grad_norm": 0.10249665362012134, "learning_rate": 2.2414295220643822e-05, "loss": 0.361, "step": 1007 }, { "epoch": 3.4042956411876184, "grad_norm": 0.10378027402659071, "learning_rate": 2.2329608684792676e-05, "loss": 0.3606, "step": 1008 }, { "epoch": 3.4076647715308486, "grad_norm": 0.10027376191210695, "learning_rate": 2.22450204452621e-05, "loss": 0.3608, "step": 1009 }, { "epoch": 3.4110339018740787, "grad_norm": 0.10689722485945972, "learning_rate": 2.216053097259697e-05, "loss": 0.3706, "step": 1010 }, { "epoch": 3.414403032217309, "grad_norm": 0.10357939152860053, "learning_rate": 2.2076140736792805e-05, "loss": 0.3623, "step": 1011 }, { "epoch": 3.417772162560539, "grad_norm": 0.0902315706129379, "learning_rate": 2.1991850207293064e-05, "loss": 0.3596, "step": 1012 }, { "epoch": 3.4211412929037692, "grad_norm": 0.10842563552035595, "learning_rate": 2.1907659852986588e-05, "loss": 0.3637, "step": 1013 }, { "epoch": 3.4245104232469994, "grad_norm": 0.09666903812158173, "learning_rate": 2.1823570142204902e-05, "loss": 0.3624, "step": 1014 }, { "epoch": 3.4278795535902296, "grad_norm": 0.100083090000888, "learning_rate": 2.1739581542719748e-05, "loss": 0.3624, "step": 1015 }, { "epoch": 3.4312486839334597, "grad_norm": 0.10755809720758686, "learning_rate": 2.1655694521740376e-05, "loss": 0.3624, "step": 1016 }, { "epoch": 3.43461781427669, "grad_norm": 0.1024231010803628, "learning_rate": 2.1571909545910953e-05, "loss": 0.3621, "step": 1017 }, { "epoch": 3.43798694461992, "grad_norm": 0.10562299735859218, "learning_rate": 2.1488227081308054e-05, "loss": 0.3626, "step": 1018 }, { "epoch": 3.44135607496315, "grad_norm": 0.0993759886031881, "learning_rate": 2.140464759343794e-05, "loss": 0.3654, "step": 1019 }, { "epoch": 3.4447252053063804, "grad_norm": 0.09933521966725083, "learning_rate": 2.132117154723408e-05, "loss": 0.356, "step": 1020 }, { "epoch": 3.4480943356496105, "grad_norm": 0.09953034686873165, "learning_rate": 2.123779940705453e-05, "loss": 0.366, "step": 1021 }, { "epoch": 3.4514634659928407, "grad_norm": 0.10175075266526791, "learning_rate": 2.115453163667929e-05, "loss": 0.3583, "step": 1022 }, { "epoch": 3.454832596336071, "grad_norm": 0.09594990983302608, "learning_rate": 2.1071368699307818e-05, "loss": 0.3584, "step": 1023 }, { "epoch": 3.458201726679301, "grad_norm": 0.10219150476255269, "learning_rate": 2.0988311057556397e-05, "loss": 0.3597, "step": 1024 }, { "epoch": 3.461570857022531, "grad_norm": 0.09691112693809913, "learning_rate": 2.0905359173455593e-05, "loss": 0.3621, "step": 1025 }, { "epoch": 3.4649399873657614, "grad_norm": 0.09661009238935536, "learning_rate": 2.0822513508447608e-05, "loss": 0.3567, "step": 1026 }, { "epoch": 3.4683091177089915, "grad_norm": 0.09590582546596066, "learning_rate": 2.073977452338384e-05, "loss": 0.3646, "step": 1027 }, { "epoch": 3.4716782480522217, "grad_norm": 0.09606588648905236, "learning_rate": 2.065714267852223e-05, "loss": 0.3641, "step": 1028 }, { "epoch": 3.475047378395452, "grad_norm": 0.10295819559523817, "learning_rate": 2.057461843352469e-05, "loss": 0.3557, "step": 1029 }, { "epoch": 3.4784165087386816, "grad_norm": 0.09150758299366415, "learning_rate": 2.049220224745463e-05, "loss": 0.3636, "step": 1030 }, { "epoch": 3.481785639081912, "grad_norm": 0.10198222794968945, "learning_rate": 2.0409894578774302e-05, "loss": 0.3642, "step": 1031 }, { "epoch": 3.485154769425142, "grad_norm": 0.09986839616807734, "learning_rate": 2.032769588534233e-05, "loss": 0.3673, "step": 1032 }, { "epoch": 3.4885238997683725, "grad_norm": 0.09939024454656914, "learning_rate": 2.0245606624411165e-05, "loss": 0.3591, "step": 1033 }, { "epoch": 3.4918930301116022, "grad_norm": 0.09144469769761462, "learning_rate": 2.0163627252624427e-05, "loss": 0.3683, "step": 1034 }, { "epoch": 3.495262160454833, "grad_norm": 0.09038126728850328, "learning_rate": 2.0081758226014516e-05, "loss": 0.3585, "step": 1035 }, { "epoch": 3.4986312907980626, "grad_norm": 0.09791848188862595, "learning_rate": 2.0000000000000012e-05, "loss": 0.3633, "step": 1036 }, { "epoch": 3.502000421141293, "grad_norm": 0.09451335114654308, "learning_rate": 1.9918353029383065e-05, "loss": 0.3563, "step": 1037 }, { "epoch": 3.505369551484523, "grad_norm": 0.09766694825632033, "learning_rate": 1.9836817768347015e-05, "loss": 0.3634, "step": 1038 }, { "epoch": 3.508738681827753, "grad_norm": 0.09523718569743331, "learning_rate": 1.9755394670453745e-05, "loss": 0.364, "step": 1039 }, { "epoch": 3.5121078121709832, "grad_norm": 0.09716008155147098, "learning_rate": 1.9674084188641235e-05, "loss": 0.3614, "step": 1040 }, { "epoch": 3.5154769425142134, "grad_norm": 0.09307186314834033, "learning_rate": 1.9592886775220957e-05, "loss": 0.3663, "step": 1041 }, { "epoch": 3.5188460728574436, "grad_norm": 0.0966569916505279, "learning_rate": 1.9511802881875438e-05, "loss": 0.3628, "step": 1042 }, { "epoch": 3.5222152032006737, "grad_norm": 0.09953663178152124, "learning_rate": 1.943083295965572e-05, "loss": 0.3653, "step": 1043 }, { "epoch": 3.525584333543904, "grad_norm": 0.09000177069317349, "learning_rate": 1.9349977458978846e-05, "loss": 0.357, "step": 1044 }, { "epoch": 3.528953463887134, "grad_norm": 0.09693735378111683, "learning_rate": 1.9269236829625387e-05, "loss": 0.3623, "step": 1045 }, { "epoch": 3.532322594230364, "grad_norm": 0.1010678964013295, "learning_rate": 1.9188611520736846e-05, "loss": 0.3631, "step": 1046 }, { "epoch": 3.5356917245735944, "grad_norm": 0.08709082546898574, "learning_rate": 1.9108101980813277e-05, "loss": 0.3559, "step": 1047 }, { "epoch": 3.5390608549168245, "grad_norm": 0.09973595422763583, "learning_rate": 1.902770865771074e-05, "loss": 0.3572, "step": 1048 }, { "epoch": 3.5424299852600547, "grad_norm": 0.0932062947908472, "learning_rate": 1.8947431998638762e-05, "loss": 0.3703, "step": 1049 }, { "epoch": 3.545799115603285, "grad_norm": 0.0927283626151012, "learning_rate": 1.886727245015794e-05, "loss": 0.3604, "step": 1050 }, { "epoch": 3.549168245946515, "grad_norm": 0.0899928028286008, "learning_rate": 1.8787230458177408e-05, "loss": 0.3596, "step": 1051 }, { "epoch": 3.552537376289745, "grad_norm": 0.09291563797483152, "learning_rate": 1.8707306467952323e-05, "loss": 0.3602, "step": 1052 }, { "epoch": 3.5559065066329754, "grad_norm": 0.0873067226862915, "learning_rate": 1.862750092408147e-05, "loss": 0.3632, "step": 1053 }, { "epoch": 3.5592756369762055, "grad_norm": 0.09201291502685034, "learning_rate": 1.8547814270504705e-05, "loss": 0.3665, "step": 1054 }, { "epoch": 3.5626447673194357, "grad_norm": 0.08672862490756593, "learning_rate": 1.8468246950500556e-05, "loss": 0.3595, "step": 1055 }, { "epoch": 3.566013897662666, "grad_norm": 0.08745897822977576, "learning_rate": 1.838879940668373e-05, "loss": 0.3605, "step": 1056 }, { "epoch": 3.569383028005896, "grad_norm": 0.09233748547600358, "learning_rate": 1.83094720810026e-05, "loss": 0.36, "step": 1057 }, { "epoch": 3.572752158349126, "grad_norm": 0.09364157413857832, "learning_rate": 1.823026541473684e-05, "loss": 0.3642, "step": 1058 }, { "epoch": 3.5761212886923563, "grad_norm": 0.0919710962105762, "learning_rate": 1.8151179848494905e-05, "loss": 0.3629, "step": 1059 }, { "epoch": 3.5794904190355865, "grad_norm": 0.09562060747735483, "learning_rate": 1.8072215822211613e-05, "loss": 0.3623, "step": 1060 }, { "epoch": 3.5828595493788167, "grad_norm": 0.08937202924753714, "learning_rate": 1.7993373775145663e-05, "loss": 0.3608, "step": 1061 }, { "epoch": 3.586228679722047, "grad_norm": 0.09294261879238143, "learning_rate": 1.7914654145877187e-05, "loss": 0.3605, "step": 1062 }, { "epoch": 3.589597810065277, "grad_norm": 0.08234694139654683, "learning_rate": 1.7836057372305423e-05, "loss": 0.3628, "step": 1063 }, { "epoch": 3.592966940408507, "grad_norm": 0.09244388782945556, "learning_rate": 1.77575838916461e-05, "loss": 0.3584, "step": 1064 }, { "epoch": 3.5963360707517373, "grad_norm": 0.0993638664940156, "learning_rate": 1.767923414042915e-05, "loss": 0.3614, "step": 1065 }, { "epoch": 3.5997052010949675, "grad_norm": 0.08789579064875053, "learning_rate": 1.760100855449619e-05, "loss": 0.3603, "step": 1066 }, { "epoch": 3.6030743314381977, "grad_norm": 0.10411154322718334, "learning_rate": 1.752290756899816e-05, "loss": 0.3624, "step": 1067 }, { "epoch": 3.606443461781428, "grad_norm": 0.08047024019249209, "learning_rate": 1.7444931618392894e-05, "loss": 0.3585, "step": 1068 }, { "epoch": 3.6098125921246575, "grad_norm": 0.10614805370086325, "learning_rate": 1.736708113644262e-05, "loss": 0.363, "step": 1069 }, { "epoch": 3.613181722467888, "grad_norm": 0.08611033060017922, "learning_rate": 1.7289356556211687e-05, "loss": 0.3637, "step": 1070 }, { "epoch": 3.616550852811118, "grad_norm": 0.0873884543774505, "learning_rate": 1.7211758310064042e-05, "loss": 0.3578, "step": 1071 }, { "epoch": 3.6199199831543485, "grad_norm": 0.09185990558523179, "learning_rate": 1.7134286829660855e-05, "loss": 0.3677, "step": 1072 }, { "epoch": 3.623289113497578, "grad_norm": 0.08742862884585491, "learning_rate": 1.7056942545958167e-05, "loss": 0.3657, "step": 1073 }, { "epoch": 3.626658243840809, "grad_norm": 0.08793012887671753, "learning_rate": 1.697972588920439e-05, "loss": 0.3655, "step": 1074 }, { "epoch": 3.6300273741840385, "grad_norm": 0.09014218865360733, "learning_rate": 1.6902637288938074e-05, "loss": 0.364, "step": 1075 }, { "epoch": 3.633396504527269, "grad_norm": 0.08892601725042051, "learning_rate": 1.6825677173985332e-05, "loss": 0.3665, "step": 1076 }, { "epoch": 3.636765634870499, "grad_norm": 0.0878924041737089, "learning_rate": 1.6748845972457562e-05, "loss": 0.3563, "step": 1077 }, { "epoch": 3.6401347652137295, "grad_norm": 0.09417513459021953, "learning_rate": 1.6672144111749066e-05, "loss": 0.3657, "step": 1078 }, { "epoch": 3.643503895556959, "grad_norm": 0.09041822306873473, "learning_rate": 1.659557201853465e-05, "loss": 0.3687, "step": 1079 }, { "epoch": 3.6468730259001894, "grad_norm": 0.08690354592106783, "learning_rate": 1.6519130118767258e-05, "loss": 0.3601, "step": 1080 }, { "epoch": 3.6502421562434195, "grad_norm": 0.08875815871506505, "learning_rate": 1.6442818837675578e-05, "loss": 0.3602, "step": 1081 }, { "epoch": 3.6536112865866497, "grad_norm": 0.08649489906143072, "learning_rate": 1.6366638599761676e-05, "loss": 0.362, "step": 1082 }, { "epoch": 3.65698041692988, "grad_norm": 0.08572937873824316, "learning_rate": 1.6290589828798736e-05, "loss": 0.3614, "step": 1083 }, { "epoch": 3.66034954727311, "grad_norm": 0.09798247509524252, "learning_rate": 1.621467294782854e-05, "loss": 0.3608, "step": 1084 }, { "epoch": 3.66371867761634, "grad_norm": 0.08586977784228111, "learning_rate": 1.6138888379159238e-05, "loss": 0.3602, "step": 1085 }, { "epoch": 3.6670878079595703, "grad_norm": 0.09119393564711122, "learning_rate": 1.606323654436293e-05, "loss": 0.3641, "step": 1086 }, { "epoch": 3.6704569383028005, "grad_norm": 0.09035766592284558, "learning_rate": 1.5987717864273377e-05, "loss": 0.366, "step": 1087 }, { "epoch": 3.6738260686460307, "grad_norm": 0.08837128914166983, "learning_rate": 1.591233275898363e-05, "loss": 0.3621, "step": 1088 }, { "epoch": 3.677195198989261, "grad_norm": 0.09501671591225473, "learning_rate": 1.5837081647843652e-05, "loss": 0.3655, "step": 1089 }, { "epoch": 3.680564329332491, "grad_norm": 0.08827549191913663, "learning_rate": 1.5761964949458076e-05, "loss": 0.3664, "step": 1090 }, { "epoch": 3.683933459675721, "grad_norm": 0.08653211178416792, "learning_rate": 1.5686983081683816e-05, "loss": 0.3613, "step": 1091 }, { "epoch": 3.6873025900189513, "grad_norm": 0.09007377723059057, "learning_rate": 1.5612136461627726e-05, "loss": 0.3605, "step": 1092 }, { "epoch": 3.6906717203621815, "grad_norm": 0.08875691955803114, "learning_rate": 1.5537425505644358e-05, "loss": 0.3692, "step": 1093 }, { "epoch": 3.6940408507054117, "grad_norm": 0.0865487137501953, "learning_rate": 1.546285062933352e-05, "loss": 0.3637, "step": 1094 }, { "epoch": 3.697409981048642, "grad_norm": 0.08441316995604657, "learning_rate": 1.5388412247538148e-05, "loss": 0.3566, "step": 1095 }, { "epoch": 3.700779111391872, "grad_norm": 0.08795109300895403, "learning_rate": 1.5314110774341803e-05, "loss": 0.3649, "step": 1096 }, { "epoch": 3.704148241735102, "grad_norm": 0.08660466649366423, "learning_rate": 1.5239946623066466e-05, "loss": 0.3656, "step": 1097 }, { "epoch": 3.7075173720783323, "grad_norm": 0.08962594968540999, "learning_rate": 1.5165920206270257e-05, "loss": 0.3578, "step": 1098 }, { "epoch": 3.7108865024215625, "grad_norm": 0.08885368968596062, "learning_rate": 1.5092031935745102e-05, "loss": 0.362, "step": 1099 }, { "epoch": 3.7142556327647926, "grad_norm": 0.0928847110975983, "learning_rate": 1.5018282222514451e-05, "loss": 0.3673, "step": 1100 }, { "epoch": 3.717624763108023, "grad_norm": 0.08764355173587007, "learning_rate": 1.4944671476830967e-05, "loss": 0.3559, "step": 1101 }, { "epoch": 3.720993893451253, "grad_norm": 0.08858832328866184, "learning_rate": 1.4871200108174306e-05, "loss": 0.3621, "step": 1102 }, { "epoch": 3.724363023794483, "grad_norm": 0.08763675374855749, "learning_rate": 1.479786852524879e-05, "loss": 0.3588, "step": 1103 }, { "epoch": 3.7277321541377133, "grad_norm": 0.08909210540103846, "learning_rate": 1.4724677135981118e-05, "loss": 0.3625, "step": 1104 }, { "epoch": 3.7311012844809435, "grad_norm": 0.08178768871920535, "learning_rate": 1.4651626347518169e-05, "loss": 0.3621, "step": 1105 }, { "epoch": 3.7344704148241736, "grad_norm": 0.09156080753526148, "learning_rate": 1.457871656622463e-05, "loss": 0.359, "step": 1106 }, { "epoch": 3.737839545167404, "grad_norm": 0.08325843071720376, "learning_rate": 1.4505948197680892e-05, "loss": 0.3607, "step": 1107 }, { "epoch": 3.741208675510634, "grad_norm": 0.08448426244553969, "learning_rate": 1.4433321646680614e-05, "loss": 0.3648, "step": 1108 }, { "epoch": 3.744577805853864, "grad_norm": 0.08086940191673836, "learning_rate": 1.4360837317228571e-05, "loss": 0.3588, "step": 1109 }, { "epoch": 3.747946936197094, "grad_norm": 0.08083156500148386, "learning_rate": 1.4288495612538427e-05, "loss": 0.3571, "step": 1110 }, { "epoch": 3.7513160665403245, "grad_norm": 0.08098967397738577, "learning_rate": 1.4216296935030433e-05, "loss": 0.3661, "step": 1111 }, { "epoch": 3.754685196883554, "grad_norm": 0.08420676698532144, "learning_rate": 1.4144241686329236e-05, "loss": 0.3667, "step": 1112 }, { "epoch": 3.758054327226785, "grad_norm": 0.08205757559067017, "learning_rate": 1.4072330267261585e-05, "loss": 0.3538, "step": 1113 }, { "epoch": 3.7614234575700145, "grad_norm": 0.08130986735808181, "learning_rate": 1.400056307785413e-05, "loss": 0.358, "step": 1114 }, { "epoch": 3.764792587913245, "grad_norm": 0.08513069994957134, "learning_rate": 1.3928940517331282e-05, "loss": 0.363, "step": 1115 }, { "epoch": 3.768161718256475, "grad_norm": 0.08636038178242018, "learning_rate": 1.3857462984112831e-05, "loss": 0.3625, "step": 1116 }, { "epoch": 3.7715308485997054, "grad_norm": 0.08228222146136721, "learning_rate": 1.3786130875811864e-05, "loss": 0.3643, "step": 1117 }, { "epoch": 3.774899978942935, "grad_norm": 0.08245367651432615, "learning_rate": 1.371494458923246e-05, "loss": 0.3611, "step": 1118 }, { "epoch": 3.7782691092861658, "grad_norm": 0.08984002063694464, "learning_rate": 1.3643904520367568e-05, "loss": 0.3665, "step": 1119 }, { "epoch": 3.7816382396293955, "grad_norm": 0.08004513922265995, "learning_rate": 1.3573011064396751e-05, "loss": 0.3626, "step": 1120 }, { "epoch": 3.7850073699726257, "grad_norm": 0.08501294133045856, "learning_rate": 1.3502264615683966e-05, "loss": 0.3584, "step": 1121 }, { "epoch": 3.788376500315856, "grad_norm": 0.08838080335200914, "learning_rate": 1.3431665567775439e-05, "loss": 0.3584, "step": 1122 }, { "epoch": 3.791745630659086, "grad_norm": 0.08330553315133392, "learning_rate": 1.3361214313397444e-05, "loss": 0.36, "step": 1123 }, { "epoch": 3.795114761002316, "grad_norm": 0.08670146765162016, "learning_rate": 1.3290911244454066e-05, "loss": 0.3661, "step": 1124 }, { "epoch": 3.7984838913455463, "grad_norm": 0.0841408453670069, "learning_rate": 1.3220756752025126e-05, "loss": 0.363, "step": 1125 }, { "epoch": 3.8018530216887765, "grad_norm": 0.08384047682221397, "learning_rate": 1.3150751226363886e-05, "loss": 0.3622, "step": 1126 }, { "epoch": 3.8052221520320066, "grad_norm": 0.08347244270329462, "learning_rate": 1.3080895056895022e-05, "loss": 0.3618, "step": 1127 }, { "epoch": 3.808591282375237, "grad_norm": 0.0851964538331852, "learning_rate": 1.3011188632212307e-05, "loss": 0.3639, "step": 1128 }, { "epoch": 3.811960412718467, "grad_norm": 0.08389988414632749, "learning_rate": 1.2941632340076531e-05, "loss": 0.3656, "step": 1129 }, { "epoch": 3.815329543061697, "grad_norm": 0.0818943745196087, "learning_rate": 1.2872226567413346e-05, "loss": 0.3595, "step": 1130 }, { "epoch": 3.8186986734049273, "grad_norm": 0.07744154226291297, "learning_rate": 1.2802971700311103e-05, "loss": 0.3595, "step": 1131 }, { "epoch": 3.8220678037481575, "grad_norm": 0.08550107649728135, "learning_rate": 1.2733868124018694e-05, "loss": 0.3614, "step": 1132 }, { "epoch": 3.8254369340913876, "grad_norm": 0.07860069500089853, "learning_rate": 1.2664916222943392e-05, "loss": 0.3552, "step": 1133 }, { "epoch": 3.828806064434618, "grad_norm": 0.08126878361185912, "learning_rate": 1.2596116380648761e-05, "loss": 0.3622, "step": 1134 }, { "epoch": 3.832175194777848, "grad_norm": 0.08610190267035886, "learning_rate": 1.2527468979852513e-05, "loss": 0.3645, "step": 1135 }, { "epoch": 3.835544325121078, "grad_norm": 0.0815351952289208, "learning_rate": 1.2458974402424312e-05, "loss": 0.36, "step": 1136 }, { "epoch": 3.8389134554643083, "grad_norm": 0.08556542224799225, "learning_rate": 1.239063302938376e-05, "loss": 0.3581, "step": 1137 }, { "epoch": 3.8422825858075385, "grad_norm": 0.0863667479775735, "learning_rate": 1.2322445240898158e-05, "loss": 0.3592, "step": 1138 }, { "epoch": 3.8456517161507686, "grad_norm": 0.09012131069922455, "learning_rate": 1.2254411416280494e-05, "loss": 0.3608, "step": 1139 }, { "epoch": 3.849020846493999, "grad_norm": 0.0813110318115451, "learning_rate": 1.2186531933987294e-05, "loss": 0.3617, "step": 1140 }, { "epoch": 3.852389976837229, "grad_norm": 0.08621918893656133, "learning_rate": 1.2118807171616469e-05, "loss": 0.3632, "step": 1141 }, { "epoch": 3.855759107180459, "grad_norm": 0.08876046697132542, "learning_rate": 1.2051237505905302e-05, "loss": 0.363, "step": 1142 }, { "epoch": 3.8591282375236893, "grad_norm": 0.08486782812443205, "learning_rate": 1.1983823312728306e-05, "loss": 0.3681, "step": 1143 }, { "epoch": 3.8624973678669194, "grad_norm": 0.08182983032740812, "learning_rate": 1.19165649670951e-05, "loss": 0.3635, "step": 1144 }, { "epoch": 3.8658664982101496, "grad_norm": 0.07583529894267067, "learning_rate": 1.1849462843148398e-05, "loss": 0.3633, "step": 1145 }, { "epoch": 3.8692356285533798, "grad_norm": 0.09126795810440728, "learning_rate": 1.1782517314161872e-05, "loss": 0.3584, "step": 1146 }, { "epoch": 3.87260475889661, "grad_norm": 0.0811651957931282, "learning_rate": 1.1715728752538103e-05, "loss": 0.3617, "step": 1147 }, { "epoch": 3.87597388923984, "grad_norm": 0.0843774615335534, "learning_rate": 1.164909752980648e-05, "loss": 0.3644, "step": 1148 }, { "epoch": 3.8793430195830703, "grad_norm": 0.08099134098178276, "learning_rate": 1.1582624016621154e-05, "loss": 0.3595, "step": 1149 }, { "epoch": 3.8827121499263004, "grad_norm": 0.07907946240714192, "learning_rate": 1.1516308582758983e-05, "loss": 0.3614, "step": 1150 }, { "epoch": 3.8860812802695306, "grad_norm": 0.08423868561227663, "learning_rate": 1.1450151597117479e-05, "loss": 0.3613, "step": 1151 }, { "epoch": 3.8894504106127608, "grad_norm": 0.08033686574245383, "learning_rate": 1.1384153427712729e-05, "loss": 0.3642, "step": 1152 }, { "epoch": 3.8928195409559905, "grad_norm": 0.07677407670189697, "learning_rate": 1.1318314441677348e-05, "loss": 0.3569, "step": 1153 }, { "epoch": 3.896188671299221, "grad_norm": 0.07906769729135289, "learning_rate": 1.1252635005258466e-05, "loss": 0.3595, "step": 1154 }, { "epoch": 3.899557801642451, "grad_norm": 0.08225694582677316, "learning_rate": 1.1187115483815693e-05, "loss": 0.3644, "step": 1155 }, { "epoch": 3.9029269319856814, "grad_norm": 0.08435086211540141, "learning_rate": 1.1121756241819023e-05, "loss": 0.3629, "step": 1156 }, { "epoch": 3.906296062328911, "grad_norm": 0.0779208137414844, "learning_rate": 1.105655764284689e-05, "loss": 0.3594, "step": 1157 }, { "epoch": 3.9096651926721417, "grad_norm": 0.07917404294021134, "learning_rate": 1.0991520049584112e-05, "loss": 0.3649, "step": 1158 }, { "epoch": 3.9130343230153715, "grad_norm": 0.07819778959894405, "learning_rate": 1.0926643823819827e-05, "loss": 0.3643, "step": 1159 }, { "epoch": 3.916403453358602, "grad_norm": 0.0822711836526933, "learning_rate": 1.0861929326445572e-05, "loss": 0.3627, "step": 1160 }, { "epoch": 3.919772583701832, "grad_norm": 0.07971405947853387, "learning_rate": 1.0797376917453187e-05, "loss": 0.3599, "step": 1161 }, { "epoch": 3.923141714045062, "grad_norm": 0.08341374870417605, "learning_rate": 1.0732986955932869e-05, "loss": 0.3555, "step": 1162 }, { "epoch": 3.926510844388292, "grad_norm": 0.07752760209485876, "learning_rate": 1.0668759800071174e-05, "loss": 0.3591, "step": 1163 }, { "epoch": 3.9298799747315223, "grad_norm": 0.0795643390039002, "learning_rate": 1.0604695807148971e-05, "loss": 0.3568, "step": 1164 }, { "epoch": 3.9332491050747524, "grad_norm": 0.07806803915038091, "learning_rate": 1.0540795333539515e-05, "loss": 0.3629, "step": 1165 }, { "epoch": 3.9366182354179826, "grad_norm": 0.07851161347206628, "learning_rate": 1.0477058734706436e-05, "loss": 0.3611, "step": 1166 }, { "epoch": 3.939987365761213, "grad_norm": 0.07762641296833782, "learning_rate": 1.0413486365201785e-05, "loss": 0.3613, "step": 1167 }, { "epoch": 3.943356496104443, "grad_norm": 0.08535005147447429, "learning_rate": 1.0350078578664005e-05, "loss": 0.3591, "step": 1168 }, { "epoch": 3.946725626447673, "grad_norm": 0.07824237520005016, "learning_rate": 1.0286835727816001e-05, "loss": 0.363, "step": 1169 }, { "epoch": 3.9500947567909033, "grad_norm": 0.0725027774844816, "learning_rate": 1.0223758164463246e-05, "loss": 0.361, "step": 1170 }, { "epoch": 3.9534638871341334, "grad_norm": 0.08250211916215387, "learning_rate": 1.0160846239491673e-05, "loss": 0.3706, "step": 1171 }, { "epoch": 3.9568330174773636, "grad_norm": 0.07768057857668437, "learning_rate": 1.0098100302865865e-05, "loss": 0.358, "step": 1172 }, { "epoch": 3.9602021478205938, "grad_norm": 0.0743357334386284, "learning_rate": 1.003552070362701e-05, "loss": 0.3588, "step": 1173 }, { "epoch": 3.963571278163824, "grad_norm": 0.08538154828312804, "learning_rate": 9.973107789891024e-06, "loss": 0.3687, "step": 1174 }, { "epoch": 3.966940408507054, "grad_norm": 0.08474253190258095, "learning_rate": 9.910861908846598e-06, "loss": 0.36, "step": 1175 }, { "epoch": 3.9703095388502843, "grad_norm": 0.07698260800417392, "learning_rate": 9.848783406753224e-06, "loss": 0.3655, "step": 1176 }, { "epoch": 3.9736786691935144, "grad_norm": 0.07875068992732076, "learning_rate": 9.786872628939329e-06, "loss": 0.3605, "step": 1177 }, { "epoch": 3.9770477995367446, "grad_norm": 0.08337836249305365, "learning_rate": 9.725129919800339e-06, "loss": 0.3653, "step": 1178 }, { "epoch": 3.9804169298799748, "grad_norm": 0.0799444611097984, "learning_rate": 9.66355562279671e-06, "loss": 0.3604, "step": 1179 }, { "epoch": 3.983786060223205, "grad_norm": 0.08618283586928363, "learning_rate": 9.60215008045211e-06, "loss": 0.3637, "step": 1180 }, { "epoch": 3.987155190566435, "grad_norm": 0.08302579845358256, "learning_rate": 9.540913634351408e-06, "loss": 0.3602, "step": 1181 }, { "epoch": 3.9905243209096652, "grad_norm": 0.07735294324245658, "learning_rate": 9.479846625138909e-06, "loss": 0.3596, "step": 1182 }, { "epoch": 3.9938934512528954, "grad_norm": 0.07471734423709958, "learning_rate": 9.418949392516307e-06, "loss": 0.3611, "step": 1183 }, { "epoch": 3.9972625815961256, "grad_norm": 0.08214012704171592, "learning_rate": 9.358222275240884e-06, "loss": 0.3648, "step": 1184 }, { "epoch": 4.00336913034323, "grad_norm": 0.11292758122904588, "learning_rate": 9.297665611123628e-06, "loss": 0.3527, "step": 1185 }, { "epoch": 4.00673826068646, "grad_norm": 0.0941098295127884, "learning_rate": 9.237279737027326e-06, "loss": 0.3472, "step": 1186 }, { "epoch": 4.01010739102969, "grad_norm": 0.09639154458998347, "learning_rate": 9.177064988864712e-06, "loss": 0.3425, "step": 1187 }, { "epoch": 4.013476521372921, "grad_norm": 0.09835304863889502, "learning_rate": 9.117021701596567e-06, "loss": 0.3446, "step": 1188 }, { "epoch": 4.01684565171615, "grad_norm": 0.08987244503280054, "learning_rate": 9.057150209229845e-06, "loss": 0.3513, "step": 1189 }, { "epoch": 4.020214782059381, "grad_norm": 0.10031177854257561, "learning_rate": 8.99745084481594e-06, "loss": 0.3516, "step": 1190 }, { "epoch": 4.023583912402611, "grad_norm": 0.10651297976200229, "learning_rate": 8.937923940448634e-06, "loss": 0.3489, "step": 1191 }, { "epoch": 4.026953042745841, "grad_norm": 0.08656835316363745, "learning_rate": 8.87856982726243e-06, "loss": 0.3402, "step": 1192 }, { "epoch": 4.030322173089071, "grad_norm": 0.0977560831877126, "learning_rate": 8.819388835430569e-06, "loss": 0.348, "step": 1193 }, { "epoch": 4.033691303432302, "grad_norm": 0.09746909055035731, "learning_rate": 8.7603812941633e-06, "loss": 0.3492, "step": 1194 }, { "epoch": 4.037060433775531, "grad_norm": 0.08395050874481182, "learning_rate": 8.701547531706018e-06, "loss": 0.3482, "step": 1195 }, { "epoch": 4.040429564118762, "grad_norm": 0.09139581639425662, "learning_rate": 8.642887875337376e-06, "loss": 0.3509, "step": 1196 }, { "epoch": 4.043798694461992, "grad_norm": 0.09015094643326858, "learning_rate": 8.584402651367556e-06, "loss": 0.3445, "step": 1197 }, { "epoch": 4.047167824805222, "grad_norm": 0.08067803096785321, "learning_rate": 8.526092185136394e-06, "loss": 0.345, "step": 1198 }, { "epoch": 4.050536955148452, "grad_norm": 0.08630631888609785, "learning_rate": 8.467956801011618e-06, "loss": 0.338, "step": 1199 }, { "epoch": 4.053906085491683, "grad_norm": 0.08433690244909006, "learning_rate": 8.409996822386972e-06, "loss": 0.343, "step": 1200 }, { "epoch": 4.057275215834912, "grad_norm": 0.07920044123514752, "learning_rate": 8.352212571680458e-06, "loss": 0.3473, "step": 1201 }, { "epoch": 4.060644346178143, "grad_norm": 0.07927154455223241, "learning_rate": 8.294604370332613e-06, "loss": 0.3482, "step": 1202 }, { "epoch": 4.064013476521373, "grad_norm": 0.08109057542606768, "learning_rate": 8.23717253880457e-06, "loss": 0.3428, "step": 1203 }, { "epoch": 4.067382606864603, "grad_norm": 0.08569342844895425, "learning_rate": 8.17991739657641e-06, "loss": 0.3474, "step": 1204 }, { "epoch": 4.070751737207833, "grad_norm": 0.08637139957757115, "learning_rate": 8.122839262145294e-06, "loss": 0.3467, "step": 1205 }, { "epoch": 4.074120867551064, "grad_norm": 0.07781808041765698, "learning_rate": 8.06593845302376e-06, "loss": 0.3395, "step": 1206 }, { "epoch": 4.077489997894293, "grad_norm": 0.08111376806052889, "learning_rate": 8.00921528573793e-06, "loss": 0.3389, "step": 1207 }, { "epoch": 4.080859128237524, "grad_norm": 0.08619767447901233, "learning_rate": 7.952670075825702e-06, "loss": 0.348, "step": 1208 }, { "epoch": 4.084228258580754, "grad_norm": 0.07737321565650793, "learning_rate": 7.896303137835084e-06, "loss": 0.3373, "step": 1209 }, { "epoch": 4.087597388923984, "grad_norm": 0.07775405743530504, "learning_rate": 7.840114785322384e-06, "loss": 0.3443, "step": 1210 }, { "epoch": 4.090966519267214, "grad_norm": 0.07816418598625743, "learning_rate": 7.78410533085046e-06, "loss": 0.345, "step": 1211 }, { "epoch": 4.094335649610445, "grad_norm": 0.08021420493935687, "learning_rate": 7.728275085987041e-06, "loss": 0.3445, "step": 1212 }, { "epoch": 4.097704779953674, "grad_norm": 0.07501876010838501, "learning_rate": 7.672624361302894e-06, "loss": 0.345, "step": 1213 }, { "epoch": 4.101073910296905, "grad_norm": 0.07616193917641446, "learning_rate": 7.6171534663702416e-06, "loss": 0.3451, "step": 1214 }, { "epoch": 4.104443040640135, "grad_norm": 0.08197274858236898, "learning_rate": 7.5618627097608835e-06, "loss": 0.3481, "step": 1215 }, { "epoch": 4.107812170983365, "grad_norm": 0.07483017111226394, "learning_rate": 7.50675239904457e-06, "loss": 0.3454, "step": 1216 }, { "epoch": 4.111181301326595, "grad_norm": 0.07441931083866478, "learning_rate": 7.451822840787279e-06, "loss": 0.3469, "step": 1217 }, { "epoch": 4.114550431669826, "grad_norm": 0.08142190767207858, "learning_rate": 7.397074340549508e-06, "loss": 0.3431, "step": 1218 }, { "epoch": 4.117919562013055, "grad_norm": 0.07876869644542178, "learning_rate": 7.342507202884577e-06, "loss": 0.3462, "step": 1219 }, { "epoch": 4.121288692356286, "grad_norm": 0.07845687277699909, "learning_rate": 7.288121731336901e-06, "loss": 0.3456, "step": 1220 }, { "epoch": 4.124657822699516, "grad_norm": 0.07817574483354851, "learning_rate": 7.233918228440324e-06, "loss": 0.3436, "step": 1221 }, { "epoch": 4.128026953042746, "grad_norm": 0.07876507958828823, "learning_rate": 7.1798969957165025e-06, "loss": 0.3493, "step": 1222 }, { "epoch": 4.131396083385976, "grad_norm": 0.07707210638891601, "learning_rate": 7.126058333673094e-06, "loss": 0.3402, "step": 1223 }, { "epoch": 4.134765213729207, "grad_norm": 0.07947117463971737, "learning_rate": 7.072402541802197e-06, "loss": 0.3478, "step": 1224 }, { "epoch": 4.138134344072436, "grad_norm": 0.07708906857469865, "learning_rate": 7.018929918578621e-06, "loss": 0.3457, "step": 1225 }, { "epoch": 4.141503474415666, "grad_norm": 0.08008450821251828, "learning_rate": 6.965640761458274e-06, "loss": 0.3414, "step": 1226 }, { "epoch": 4.144872604758897, "grad_norm": 0.07732322409168987, "learning_rate": 6.912535366876483e-06, "loss": 0.3427, "step": 1227 }, { "epoch": 4.148241735102126, "grad_norm": 0.07450575669616548, "learning_rate": 6.859614030246318e-06, "loss": 0.3477, "step": 1228 }, { "epoch": 4.151610865445357, "grad_norm": 0.08433118593640568, "learning_rate": 6.806877045957003e-06, "loss": 0.3425, "step": 1229 }, { "epoch": 4.154979995788587, "grad_norm": 0.07513389398253724, "learning_rate": 6.754324707372264e-06, "loss": 0.3443, "step": 1230 }, { "epoch": 4.158349126131817, "grad_norm": 0.07536890804885507, "learning_rate": 6.701957306828637e-06, "loss": 0.3438, "step": 1231 }, { "epoch": 4.161718256475047, "grad_norm": 0.07685668754719273, "learning_rate": 6.649775135633944e-06, "loss": 0.3401, "step": 1232 }, { "epoch": 4.165087386818278, "grad_norm": 0.07956673529792976, "learning_rate": 6.597778484065571e-06, "loss": 0.3503, "step": 1233 }, { "epoch": 4.168456517161507, "grad_norm": 0.07209527381971025, "learning_rate": 6.545967641368958e-06, "loss": 0.3434, "step": 1234 }, { "epoch": 4.171825647504738, "grad_norm": 0.07458918014634688, "learning_rate": 6.494342895755879e-06, "loss": 0.343, "step": 1235 }, { "epoch": 4.175194777847968, "grad_norm": 0.08077306421411162, "learning_rate": 6.4429045344029136e-06, "loss": 0.3513, "step": 1236 }, { "epoch": 4.178563908191198, "grad_norm": 0.08065308092284855, "learning_rate": 6.391652843449829e-06, "loss": 0.3434, "step": 1237 }, { "epoch": 4.181933038534428, "grad_norm": 0.0731775502872814, "learning_rate": 6.340588107997994e-06, "loss": 0.3443, "step": 1238 }, { "epoch": 4.185302168877659, "grad_norm": 0.07546567416391478, "learning_rate": 6.289710612108786e-06, "loss": 0.3434, "step": 1239 }, { "epoch": 4.188671299220888, "grad_norm": 0.07650397977406549, "learning_rate": 6.239020638801987e-06, "loss": 0.3452, "step": 1240 }, { "epoch": 4.192040429564119, "grad_norm": 0.07431679145535366, "learning_rate": 6.18851847005427e-06, "loss": 0.3484, "step": 1241 }, { "epoch": 4.195409559907349, "grad_norm": 0.07416827387620398, "learning_rate": 6.1382043867975836e-06, "loss": 0.3452, "step": 1242 }, { "epoch": 4.198778690250579, "grad_norm": 0.07754320392922942, "learning_rate": 6.088078668917572e-06, "loss": 0.3491, "step": 1243 }, { "epoch": 4.202147820593809, "grad_norm": 0.07827458851806732, "learning_rate": 6.038141595252094e-06, "loss": 0.3406, "step": 1244 }, { "epoch": 4.20551695093704, "grad_norm": 0.0725724426162921, "learning_rate": 5.9883934435895774e-06, "loss": 0.3496, "step": 1245 }, { "epoch": 4.208886081280269, "grad_norm": 0.0719909369345341, "learning_rate": 5.9388344906675485e-06, "loss": 0.3526, "step": 1246 }, { "epoch": 4.2122552116235, "grad_norm": 0.07567213228800986, "learning_rate": 5.889465012171069e-06, "loss": 0.3468, "step": 1247 }, { "epoch": 4.21562434196673, "grad_norm": 0.07098076354440293, "learning_rate": 5.840285282731173e-06, "loss": 0.3466, "step": 1248 }, { "epoch": 4.21899347230996, "grad_norm": 0.07019771893928237, "learning_rate": 5.791295575923382e-06, "loss": 0.3448, "step": 1249 }, { "epoch": 4.22236260265319, "grad_norm": 0.07471579252214251, "learning_rate": 5.742496164266174e-06, "loss": 0.3491, "step": 1250 }, { "epoch": 4.225731732996421, "grad_norm": 0.07236549423445121, "learning_rate": 5.693887319219422e-06, "loss": 0.3499, "step": 1251 }, { "epoch": 4.22910086333965, "grad_norm": 0.07134479537520134, "learning_rate": 5.645469311182958e-06, "loss": 0.3459, "step": 1252 }, { "epoch": 4.232469993682881, "grad_norm": 0.07072016749147457, "learning_rate": 5.597242409495018e-06, "loss": 0.3438, "step": 1253 }, { "epoch": 4.235839124026111, "grad_norm": 0.07179051070856982, "learning_rate": 5.549206882430773e-06, "loss": 0.3419, "step": 1254 }, { "epoch": 4.239208254369341, "grad_norm": 0.07302770625869862, "learning_rate": 5.501362997200787e-06, "loss": 0.3487, "step": 1255 }, { "epoch": 4.242577384712571, "grad_norm": 0.06976392401988353, "learning_rate": 5.453711019949581e-06, "loss": 0.344, "step": 1256 }, { "epoch": 4.245946515055802, "grad_norm": 0.07078499285712887, "learning_rate": 5.406251215754146e-06, "loss": 0.3465, "step": 1257 }, { "epoch": 4.249315645399031, "grad_norm": 0.07118826571789505, "learning_rate": 5.358983848622452e-06, "loss": 0.3504, "step": 1258 }, { "epoch": 4.252684775742262, "grad_norm": 0.0686563097499576, "learning_rate": 5.311909181491994e-06, "loss": 0.3433, "step": 1259 }, { "epoch": 4.256053906085492, "grad_norm": 0.06836729686980945, "learning_rate": 5.265027476228297e-06, "loss": 0.3428, "step": 1260 }, { "epoch": 4.259423036428722, "grad_norm": 0.07026205200909408, "learning_rate": 5.218338993623499e-06, "loss": 0.3475, "step": 1261 }, { "epoch": 4.262792166771952, "grad_norm": 0.07032323091306557, "learning_rate": 5.171843993394903e-06, "loss": 0.3431, "step": 1262 }, { "epoch": 4.2661612971151825, "grad_norm": 0.07423746533959613, "learning_rate": 5.125542734183473e-06, "loss": 0.3445, "step": 1263 }, { "epoch": 4.269530427458412, "grad_norm": 0.07841448579779874, "learning_rate": 5.079435473552474e-06, "loss": 0.3481, "step": 1264 }, { "epoch": 4.272899557801642, "grad_norm": 0.07040437269579536, "learning_rate": 5.033522467985985e-06, "loss": 0.3422, "step": 1265 }, { "epoch": 4.276268688144873, "grad_norm": 0.07271729651198641, "learning_rate": 4.987803972887482e-06, "loss": 0.3433, "step": 1266 }, { "epoch": 4.279637818488103, "grad_norm": 0.07717082685197238, "learning_rate": 4.9422802425784475e-06, "loss": 0.3459, "step": 1267 }, { "epoch": 4.283006948831333, "grad_norm": 0.07646859752104176, "learning_rate": 4.896951530296896e-06, "loss": 0.3487, "step": 1268 }, { "epoch": 4.286376079174563, "grad_norm": 0.07196146666335995, "learning_rate": 4.851818088196041e-06, "loss": 0.3451, "step": 1269 }, { "epoch": 4.289745209517793, "grad_norm": 0.07601088345941356, "learning_rate": 4.806880167342831e-06, "loss": 0.346, "step": 1270 }, { "epoch": 4.293114339861023, "grad_norm": 0.0730390084111676, "learning_rate": 4.762138017716571e-06, "loss": 0.3451, "step": 1271 }, { "epoch": 4.296483470204254, "grad_norm": 0.08370554873202815, "learning_rate": 4.7175918882075465e-06, "loss": 0.3413, "step": 1272 }, { "epoch": 4.299852600547483, "grad_norm": 0.07165140458981821, "learning_rate": 4.673242026615627e-06, "loss": 0.3413, "step": 1273 }, { "epoch": 4.303221730890714, "grad_norm": 0.07124644667052794, "learning_rate": 4.6290886796488946e-06, "loss": 0.3474, "step": 1274 }, { "epoch": 4.306590861233944, "grad_norm": 0.07331931362741691, "learning_rate": 4.58513209292224e-06, "loss": 0.3445, "step": 1275 }, { "epoch": 4.309959991577174, "grad_norm": 0.07237727500497035, "learning_rate": 4.54137251095605e-06, "loss": 0.3511, "step": 1276 }, { "epoch": 4.313329121920404, "grad_norm": 0.07038492284926416, "learning_rate": 4.4978101771748195e-06, "loss": 0.3429, "step": 1277 }, { "epoch": 4.316698252263635, "grad_norm": 0.07186746493744087, "learning_rate": 4.454445333905768e-06, "loss": 0.3423, "step": 1278 }, { "epoch": 4.320067382606864, "grad_norm": 0.07185532233373727, "learning_rate": 4.411278222377551e-06, "loss": 0.3416, "step": 1279 }, { "epoch": 4.323436512950095, "grad_norm": 0.0702075072689657, "learning_rate": 4.3683090827188666e-06, "loss": 0.3452, "step": 1280 }, { "epoch": 4.326805643293325, "grad_norm": 0.0752614715082349, "learning_rate": 4.325538153957158e-06, "loss": 0.3475, "step": 1281 }, { "epoch": 4.330174773636555, "grad_norm": 0.07050331941427515, "learning_rate": 4.282965674017265e-06, "loss": 0.3477, "step": 1282 }, { "epoch": 4.333543903979785, "grad_norm": 0.07219368807869528, "learning_rate": 4.240591879720084e-06, "loss": 0.3497, "step": 1283 }, { "epoch": 4.336913034323016, "grad_norm": 0.06956963675751204, "learning_rate": 4.198417006781283e-06, "loss": 0.3474, "step": 1284 }, { "epoch": 4.340282164666245, "grad_norm": 0.06960098578843016, "learning_rate": 4.156441289809983e-06, "loss": 0.3445, "step": 1285 }, { "epoch": 4.343651295009476, "grad_norm": 0.07648526368534525, "learning_rate": 4.114664962307439e-06, "loss": 0.3479, "step": 1286 }, { "epoch": 4.347020425352706, "grad_norm": 0.07088809269875901, "learning_rate": 4.073088256665742e-06, "loss": 0.3421, "step": 1287 }, { "epoch": 4.350389555695936, "grad_norm": 0.07273421779811111, "learning_rate": 4.031711404166525e-06, "loss": 0.344, "step": 1288 }, { "epoch": 4.353758686039166, "grad_norm": 0.07174713114445853, "learning_rate": 3.9905346349797234e-06, "loss": 0.3441, "step": 1289 }, { "epoch": 4.3571278163823965, "grad_norm": 0.07290897068132188, "learning_rate": 3.949558178162209e-06, "loss": 0.3462, "step": 1290 }, { "epoch": 4.360496946725626, "grad_norm": 0.07194649852054723, "learning_rate": 3.9087822616565984e-06, "loss": 0.3478, "step": 1291 }, { "epoch": 4.363866077068857, "grad_norm": 0.07337266992394913, "learning_rate": 3.86820711228991e-06, "loss": 0.3447, "step": 1292 }, { "epoch": 4.367235207412087, "grad_norm": 0.07030690021581439, "learning_rate": 3.827832955772372e-06, "loss": 0.3456, "step": 1293 }, { "epoch": 4.370604337755317, "grad_norm": 0.07201158711941352, "learning_rate": 3.7876600166961353e-06, "loss": 0.3465, "step": 1294 }, { "epoch": 4.373973468098547, "grad_norm": 0.07511999851456955, "learning_rate": 3.747688518534003e-06, "loss": 0.3509, "step": 1295 }, { "epoch": 4.3773425984417775, "grad_norm": 0.07172350904328591, "learning_rate": 3.707918683638223e-06, "loss": 0.345, "step": 1296 }, { "epoch": 4.380711728785007, "grad_norm": 0.0693885503387989, "learning_rate": 3.6683507332392476e-06, "loss": 0.3453, "step": 1297 }, { "epoch": 4.384080859128238, "grad_norm": 0.07019744686285931, "learning_rate": 3.628984887444462e-06, "loss": 0.3432, "step": 1298 }, { "epoch": 4.387449989471468, "grad_norm": 0.06892399615992918, "learning_rate": 3.589821365237023e-06, "loss": 0.3422, "step": 1299 }, { "epoch": 4.390819119814698, "grad_norm": 0.0711323225202878, "learning_rate": 3.550860384474568e-06, "loss": 0.3468, "step": 1300 }, { "epoch": 4.394188250157928, "grad_norm": 0.07222951484982641, "learning_rate": 3.5121021618881e-06, "loss": 0.3444, "step": 1301 }, { "epoch": 4.3975573805011585, "grad_norm": 0.07011816955357002, "learning_rate": 3.473546913080674e-06, "loss": 0.3417, "step": 1302 }, { "epoch": 4.400926510844388, "grad_norm": 0.06918135608237871, "learning_rate": 3.4351948525262625e-06, "loss": 0.3431, "step": 1303 }, { "epoch": 4.404295641187619, "grad_norm": 0.07183448949638974, "learning_rate": 3.397046193568558e-06, "loss": 0.3454, "step": 1304 }, { "epoch": 4.407664771530849, "grad_norm": 0.06841029875272973, "learning_rate": 3.3591011484197744e-06, "loss": 0.3471, "step": 1305 }, { "epoch": 4.411033901874079, "grad_norm": 0.07008578728288764, "learning_rate": 3.3213599281594688e-06, "loss": 0.3469, "step": 1306 }, { "epoch": 4.414403032217309, "grad_norm": 0.06784411674661273, "learning_rate": 3.28382274273336e-06, "loss": 0.3452, "step": 1307 }, { "epoch": 4.417772162560539, "grad_norm": 0.06727601165426443, "learning_rate": 3.246489800952155e-06, "loss": 0.3513, "step": 1308 }, { "epoch": 4.421141292903769, "grad_norm": 0.06930299868926686, "learning_rate": 3.209361310490451e-06, "loss": 0.344, "step": 1309 }, { "epoch": 4.424510423247, "grad_norm": 0.06983137546711997, "learning_rate": 3.172437477885475e-06, "loss": 0.3432, "step": 1310 }, { "epoch": 4.4278795535902296, "grad_norm": 0.06738405898147315, "learning_rate": 3.1357185085360233e-06, "loss": 0.3412, "step": 1311 }, { "epoch": 4.431248683933459, "grad_norm": 0.069114436702608, "learning_rate": 3.099204606701256e-06, "loss": 0.3438, "step": 1312 }, { "epoch": 4.43461781427669, "grad_norm": 0.07063250147224803, "learning_rate": 3.062895975499616e-06, "loss": 0.3449, "step": 1313 }, { "epoch": 4.43798694461992, "grad_norm": 0.06869203050534661, "learning_rate": 3.026792816907671e-06, "loss": 0.347, "step": 1314 }, { "epoch": 4.44135607496315, "grad_norm": 0.06790795340800003, "learning_rate": 2.9908953317589675e-06, "loss": 0.3511, "step": 1315 }, { "epoch": 4.44472520530638, "grad_norm": 0.06801706888897209, "learning_rate": 2.955203719742965e-06, "loss": 0.3499, "step": 1316 }, { "epoch": 4.4480943356496105, "grad_norm": 0.06703090567229934, "learning_rate": 2.9197181794038896e-06, "loss": 0.3409, "step": 1317 }, { "epoch": 4.45146346599284, "grad_norm": 0.06845785402581211, "learning_rate": 2.884438908139626e-06, "loss": 0.3451, "step": 1318 }, { "epoch": 4.454832596336071, "grad_norm": 0.06809288242514337, "learning_rate": 2.8493661022006615e-06, "loss": 0.349, "step": 1319 }, { "epoch": 4.458201726679301, "grad_norm": 0.06993068933675987, "learning_rate": 2.814499956688912e-06, "loss": 0.3457, "step": 1320 }, { "epoch": 4.461570857022531, "grad_norm": 0.06709969061038806, "learning_rate": 2.7798406655567565e-06, "loss": 0.3512, "step": 1321 }, { "epoch": 4.464939987365761, "grad_norm": 0.06978980053246452, "learning_rate": 2.7453884216058368e-06, "loss": 0.3452, "step": 1322 }, { "epoch": 4.4683091177089915, "grad_norm": 0.06806425403838408, "learning_rate": 2.7111434164860573e-06, "loss": 0.3489, "step": 1323 }, { "epoch": 4.471678248052221, "grad_norm": 0.07023315792460011, "learning_rate": 2.677105840694507e-06, "loss": 0.3484, "step": 1324 }, { "epoch": 4.475047378395452, "grad_norm": 0.0671632913864402, "learning_rate": 2.6432758835743854e-06, "loss": 0.3475, "step": 1325 }, { "epoch": 4.478416508738682, "grad_norm": 0.0668737342617598, "learning_rate": 2.6096537333139616e-06, "loss": 0.3402, "step": 1326 }, { "epoch": 4.481785639081912, "grad_norm": 0.06731813732301019, "learning_rate": 2.5762395769455183e-06, "loss": 0.3472, "step": 1327 }, { "epoch": 4.485154769425142, "grad_norm": 0.06962894223132757, "learning_rate": 2.5430336003443045e-06, "loss": 0.3411, "step": 1328 }, { "epoch": 4.4885238997683725, "grad_norm": 0.06651868659879541, "learning_rate": 2.5100359882275526e-06, "loss": 0.3463, "step": 1329 }, { "epoch": 4.491893030111602, "grad_norm": 0.06589574436809537, "learning_rate": 2.4772469241533648e-06, "loss": 0.3449, "step": 1330 }, { "epoch": 4.495262160454833, "grad_norm": 0.06851573366912253, "learning_rate": 2.444666590519775e-06, "loss": 0.3478, "step": 1331 }, { "epoch": 4.498631290798063, "grad_norm": 0.06812334086330306, "learning_rate": 2.4122951685636674e-06, "loss": 0.3493, "step": 1332 }, { "epoch": 4.502000421141293, "grad_norm": 0.06783544762672909, "learning_rate": 2.380132838359819e-06, "loss": 0.3458, "step": 1333 }, { "epoch": 4.505369551484523, "grad_norm": 0.06645851016955091, "learning_rate": 2.3481797788198745e-06, "loss": 0.3487, "step": 1334 }, { "epoch": 4.5087386818277535, "grad_norm": 0.06691716361429041, "learning_rate": 2.3164361676913406e-06, "loss": 0.3461, "step": 1335 }, { "epoch": 4.512107812170983, "grad_norm": 0.066561132769546, "learning_rate": 2.284902181556632e-06, "loss": 0.3451, "step": 1336 }, { "epoch": 4.515476942514214, "grad_norm": 0.06972464277014613, "learning_rate": 2.2535779958320614e-06, "loss": 0.3363, "step": 1337 }, { "epoch": 4.5188460728574436, "grad_norm": 0.06662582951723346, "learning_rate": 2.2224637847668484e-06, "loss": 0.3462, "step": 1338 }, { "epoch": 4.522215203200674, "grad_norm": 0.06683364588110276, "learning_rate": 2.1915597214422048e-06, "loss": 0.345, "step": 1339 }, { "epoch": 4.525584333543904, "grad_norm": 0.06973071855720024, "learning_rate": 2.1608659777703033e-06, "loss": 0.3486, "step": 1340 }, { "epoch": 4.5289534638871345, "grad_norm": 0.06547912030107868, "learning_rate": 2.130382724493405e-06, "loss": 0.3481, "step": 1341 }, { "epoch": 4.532322594230364, "grad_norm": 0.06796161455803124, "learning_rate": 2.100110131182813e-06, "loss": 0.3488, "step": 1342 }, { "epoch": 4.535691724573595, "grad_norm": 0.06643717641974535, "learning_rate": 2.070048366238e-06, "loss": 0.3453, "step": 1343 }, { "epoch": 4.5390608549168245, "grad_norm": 0.0657312313993076, "learning_rate": 2.0401975968856514e-06, "loss": 0.3364, "step": 1344 }, { "epoch": 4.542429985260055, "grad_norm": 0.0662991056630753, "learning_rate": 2.010557989178725e-06, "loss": 0.3456, "step": 1345 }, { "epoch": 4.545799115603285, "grad_norm": 0.06723548381525182, "learning_rate": 1.981129707995542e-06, "loss": 0.3428, "step": 1346 }, { "epoch": 4.549168245946515, "grad_norm": 0.06854275132803765, "learning_rate": 1.9519129170388496e-06, "loss": 0.3519, "step": 1347 }, { "epoch": 4.552537376289745, "grad_norm": 0.0687917997485082, "learning_rate": 1.9229077788349393e-06, "loss": 0.342, "step": 1348 }, { "epoch": 4.555906506632976, "grad_norm": 0.06728042882661939, "learning_rate": 1.8941144547327228e-06, "loss": 0.3513, "step": 1349 }, { "epoch": 4.5592756369762055, "grad_norm": 0.06733086071107253, "learning_rate": 1.865533104902828e-06, "loss": 0.3432, "step": 1350 }, { "epoch": 4.562644767319435, "grad_norm": 0.06653132755662035, "learning_rate": 1.8371638883367371e-06, "loss": 0.3455, "step": 1351 }, { "epoch": 4.566013897662666, "grad_norm": 0.07062467690102314, "learning_rate": 1.8090069628458583e-06, "loss": 0.3513, "step": 1352 }, { "epoch": 4.5693830280058965, "grad_norm": 0.06749739958232552, "learning_rate": 1.7810624850607007e-06, "loss": 0.3422, "step": 1353 }, { "epoch": 4.572752158349126, "grad_norm": 0.06715174264716953, "learning_rate": 1.7533306104299663e-06, "loss": 0.3427, "step": 1354 }, { "epoch": 4.576121288692356, "grad_norm": 0.06825607468688703, "learning_rate": 1.7258114932196824e-06, "loss": 0.3484, "step": 1355 }, { "epoch": 4.5794904190355865, "grad_norm": 0.0662384762896948, "learning_rate": 1.6985052865123641e-06, "loss": 0.344, "step": 1356 }, { "epoch": 4.582859549378816, "grad_norm": 0.06749795339121123, "learning_rate": 1.6714121422061636e-06, "loss": 0.348, "step": 1357 }, { "epoch": 4.586228679722047, "grad_norm": 0.06937799589584792, "learning_rate": 1.6445322110140116e-06, "loss": 0.3473, "step": 1358 }, { "epoch": 4.589597810065277, "grad_norm": 0.06748221547140407, "learning_rate": 1.617865642462766e-06, "loss": 0.3414, "step": 1359 }, { "epoch": 4.592966940408507, "grad_norm": 0.06814928775630703, "learning_rate": 1.59141258489242e-06, "loss": 0.345, "step": 1360 }, { "epoch": 4.596336070751737, "grad_norm": 0.07057379791962957, "learning_rate": 1.5651731854552466e-06, "loss": 0.3432, "step": 1361 }, { "epoch": 4.5997052010949675, "grad_norm": 0.06665029276024906, "learning_rate": 1.53914759011498e-06, "loss": 0.3524, "step": 1362 }, { "epoch": 4.603074331438197, "grad_norm": 0.06906650342043347, "learning_rate": 1.513335943646026e-06, "loss": 0.3457, "step": 1363 }, { "epoch": 4.606443461781428, "grad_norm": 0.06942705785663987, "learning_rate": 1.4877383896326269e-06, "loss": 0.3435, "step": 1364 }, { "epoch": 4.6098125921246575, "grad_norm": 0.06819335124159634, "learning_rate": 1.4623550704680889e-06, "loss": 0.3508, "step": 1365 }, { "epoch": 4.613181722467888, "grad_norm": 0.06742489592183823, "learning_rate": 1.4371861273539778e-06, "loss": 0.3457, "step": 1366 }, { "epoch": 4.616550852811118, "grad_norm": 0.064467972456891, "learning_rate": 1.4122317002993247e-06, "loss": 0.3437, "step": 1367 }, { "epoch": 4.6199199831543485, "grad_norm": 0.06450585611276803, "learning_rate": 1.3874919281198662e-06, "loss": 0.3471, "step": 1368 }, { "epoch": 4.623289113497578, "grad_norm": 0.06675137780602221, "learning_rate": 1.3629669484372722e-06, "loss": 0.3497, "step": 1369 }, { "epoch": 4.626658243840809, "grad_norm": 0.06713388756947067, "learning_rate": 1.3386568976783453e-06, "loss": 0.3423, "step": 1370 }, { "epoch": 4.6300273741840385, "grad_norm": 0.0647734710561896, "learning_rate": 1.3145619110743169e-06, "loss": 0.3451, "step": 1371 }, { "epoch": 4.633396504527269, "grad_norm": 0.06580879452568121, "learning_rate": 1.2906821226600453e-06, "loss": 0.3429, "step": 1372 }, { "epoch": 4.636765634870499, "grad_norm": 0.06578978457756152, "learning_rate": 1.2670176652733023e-06, "loss": 0.342, "step": 1373 }, { "epoch": 4.6401347652137295, "grad_norm": 0.06786565921397064, "learning_rate": 1.2435686705540228e-06, "loss": 0.3458, "step": 1374 }, { "epoch": 4.643503895556959, "grad_norm": 0.06730192180307096, "learning_rate": 1.2203352689435532e-06, "loss": 0.3505, "step": 1375 }, { "epoch": 4.64687302590019, "grad_norm": 0.06442684402191479, "learning_rate": 1.1973175896839684e-06, "loss": 0.3417, "step": 1376 }, { "epoch": 4.6502421562434195, "grad_norm": 0.06497046470832643, "learning_rate": 1.1745157608173253e-06, "loss": 0.3429, "step": 1377 }, { "epoch": 4.65361128658665, "grad_norm": 0.0655614246650691, "learning_rate": 1.1519299091849523e-06, "loss": 0.3405, "step": 1378 }, { "epoch": 4.65698041692988, "grad_norm": 0.06746924444935623, "learning_rate": 1.1295601604267348e-06, "loss": 0.347, "step": 1379 }, { "epoch": 4.6603495472731105, "grad_norm": 0.06671677812012947, "learning_rate": 1.1074066389804395e-06, "loss": 0.348, "step": 1380 }, { "epoch": 4.66371867761634, "grad_norm": 0.06798688584484958, "learning_rate": 1.0854694680810175e-06, "loss": 0.3468, "step": 1381 }, { "epoch": 4.667087807959571, "grad_norm": 0.06373690906496436, "learning_rate": 1.0637487697598937e-06, "loss": 0.3391, "step": 1382 }, { "epoch": 4.6704569383028005, "grad_norm": 0.06902986516002681, "learning_rate": 1.0422446648443142e-06, "loss": 0.3449, "step": 1383 }, { "epoch": 4.673826068646031, "grad_norm": 0.06783886040134948, "learning_rate": 1.0209572729566708e-06, "loss": 0.3469, "step": 1384 }, { "epoch": 4.677195198989261, "grad_norm": 0.06789415607732335, "learning_rate": 9.998867125138223e-07, "loss": 0.3483, "step": 1385 }, { "epoch": 4.680564329332491, "grad_norm": 0.06478682570392917, "learning_rate": 9.790331007264543e-07, "loss": 0.3465, "step": 1386 }, { "epoch": 4.683933459675721, "grad_norm": 0.06659198241596209, "learning_rate": 9.583965535983997e-07, "loss": 0.3377, "step": 1387 }, { "epoch": 4.687302590018952, "grad_norm": 0.06679774424195298, "learning_rate": 9.379771859260267e-07, "loss": 0.3474, "step": 1388 }, { "epoch": 4.6906717203621815, "grad_norm": 0.06562337466888649, "learning_rate": 9.177751112975853e-07, "loss": 0.3378, "step": 1389 }, { "epoch": 4.694040850705411, "grad_norm": 0.0643058634496552, "learning_rate": 8.977904420925543e-07, "loss": 0.3401, "step": 1390 }, { "epoch": 4.697409981048642, "grad_norm": 0.06520681777558435, "learning_rate": 8.780232894810558e-07, "loss": 0.3476, "step": 1391 }, { "epoch": 4.700779111391872, "grad_norm": 0.06652677782803126, "learning_rate": 8.584737634232154e-07, "loss": 0.3445, "step": 1392 }, { "epoch": 4.704148241735102, "grad_norm": 0.06513347952901734, "learning_rate": 8.391419726685446e-07, "loss": 0.3486, "step": 1393 }, { "epoch": 4.707517372078332, "grad_norm": 0.06577657248355921, "learning_rate": 8.200280247553461e-07, "loss": 0.3461, "step": 1394 }, { "epoch": 4.7108865024215625, "grad_norm": 0.06369190711960318, "learning_rate": 8.011320260101052e-07, "loss": 0.3478, "step": 1395 }, { "epoch": 4.714255632764792, "grad_norm": 0.06569207225402134, "learning_rate": 7.824540815469306e-07, "loss": 0.3496, "step": 1396 }, { "epoch": 4.717624763108023, "grad_norm": 0.0636558204987421, "learning_rate": 7.639942952669232e-07, "loss": 0.3462, "step": 1397 }, { "epoch": 4.7209938934512525, "grad_norm": 0.06451389941556673, "learning_rate": 7.457527698576217e-07, "loss": 0.3454, "step": 1398 }, { "epoch": 4.724363023794483, "grad_norm": 0.06490245056573639, "learning_rate": 7.277296067924377e-07, "loss": 0.345, "step": 1399 }, { "epoch": 4.727732154137713, "grad_norm": 0.06421211046867673, "learning_rate": 7.099249063300751e-07, "loss": 0.3509, "step": 1400 }, { "epoch": 4.7311012844809435, "grad_norm": 0.06376468633122387, "learning_rate": 6.923387675139958e-07, "loss": 0.3449, "step": 1401 }, { "epoch": 4.734470414824173, "grad_norm": 0.06306595288457956, "learning_rate": 6.749712881718306e-07, "loss": 0.3438, "step": 1402 }, { "epoch": 4.737839545167404, "grad_norm": 0.064531257088043, "learning_rate": 6.578225649148806e-07, "loss": 0.3459, "step": 1403 }, { "epoch": 4.7412086755106335, "grad_norm": 0.06475033645731526, "learning_rate": 6.408926931375403e-07, "loss": 0.3489, "step": 1404 }, { "epoch": 4.744577805853864, "grad_norm": 0.06725279891073008, "learning_rate": 6.241817670167961e-07, "loss": 0.3517, "step": 1405 }, { "epoch": 4.747946936197094, "grad_norm": 0.06576628806576036, "learning_rate": 6.076898795116792e-07, "loss": 0.3476, "step": 1406 }, { "epoch": 4.7513160665403245, "grad_norm": 0.06636084321383787, "learning_rate": 5.914171223627652e-07, "loss": 0.3431, "step": 1407 }, { "epoch": 4.754685196883554, "grad_norm": 0.06307439592979396, "learning_rate": 5.753635860916617e-07, "loss": 0.344, "step": 1408 }, { "epoch": 4.758054327226785, "grad_norm": 0.06354853186497929, "learning_rate": 5.595293600004948e-07, "loss": 0.3452, "step": 1409 }, { "epoch": 4.7614234575700145, "grad_norm": 0.06640861850363539, "learning_rate": 5.43914532171419e-07, "loss": 0.3498, "step": 1410 }, { "epoch": 4.764792587913245, "grad_norm": 0.06432227056221736, "learning_rate": 5.285191894661257e-07, "loss": 0.3448, "step": 1411 }, { "epoch": 4.768161718256475, "grad_norm": 0.0650298496723325, "learning_rate": 5.133434175253715e-07, "loss": 0.348, "step": 1412 }, { "epoch": 4.771530848599705, "grad_norm": 0.0642338741687956, "learning_rate": 4.983873007684769e-07, "loss": 0.3504, "step": 1413 }, { "epoch": 4.774899978942935, "grad_norm": 0.06597221985673193, "learning_rate": 4.83650922392882e-07, "loss": 0.3443, "step": 1414 }, { "epoch": 4.778269109286166, "grad_norm": 0.06414310328903884, "learning_rate": 4.691343643736579e-07, "loss": 0.3498, "step": 1415 }, { "epoch": 4.7816382396293955, "grad_norm": 0.06423727553913079, "learning_rate": 4.5483770746309383e-07, "loss": 0.3462, "step": 1416 }, { "epoch": 4.785007369972626, "grad_norm": 0.06712703203955196, "learning_rate": 4.4076103119018666e-07, "loss": 0.344, "step": 1417 }, { "epoch": 4.788376500315856, "grad_norm": 0.06406676946222813, "learning_rate": 4.269044138602585e-07, "loss": 0.3424, "step": 1418 }, { "epoch": 4.791745630659086, "grad_norm": 0.0650048525731774, "learning_rate": 4.132679325544775e-07, "loss": 0.3434, "step": 1419 }, { "epoch": 4.795114761002316, "grad_norm": 0.06381393163242242, "learning_rate": 3.998516631294491e-07, "loss": 0.3464, "step": 1420 }, { "epoch": 4.798483891345547, "grad_norm": 0.062168147457412865, "learning_rate": 3.866556802167942e-07, "loss": 0.3447, "step": 1421 }, { "epoch": 4.8018530216887765, "grad_norm": 0.06359774281703022, "learning_rate": 3.736800572227317e-07, "loss": 0.3452, "step": 1422 }, { "epoch": 4.805222152032007, "grad_norm": 0.06777082256384792, "learning_rate": 3.6092486632766543e-07, "loss": 0.3405, "step": 1423 }, { "epoch": 4.808591282375237, "grad_norm": 0.06518391137080269, "learning_rate": 3.483901784857846e-07, "loss": 0.3499, "step": 1424 }, { "epoch": 4.811960412718467, "grad_norm": 0.06360491257484012, "learning_rate": 3.3607606342467293e-07, "loss": 0.3464, "step": 1425 }, { "epoch": 4.815329543061697, "grad_norm": 0.0630016058736709, "learning_rate": 3.239825896449267e-07, "loss": 0.3493, "step": 1426 }, { "epoch": 4.818698673404928, "grad_norm": 0.06424370898677036, "learning_rate": 3.1210982441974623e-07, "loss": 0.3424, "step": 1427 }, { "epoch": 4.8220678037481575, "grad_norm": 0.06333420103209184, "learning_rate": 3.004578337945985e-07, "loss": 0.3444, "step": 1428 }, { "epoch": 4.825436934091387, "grad_norm": 0.06413449663730773, "learning_rate": 2.8902668258683043e-07, "loss": 0.3465, "step": 1429 }, { "epoch": 4.828806064434618, "grad_norm": 0.06372049815441223, "learning_rate": 2.778164343852918e-07, "loss": 0.3478, "step": 1430 }, { "epoch": 4.832175194777848, "grad_norm": 0.06414269762017184, "learning_rate": 2.668271515500287e-07, "loss": 0.3502, "step": 1431 }, { "epoch": 4.835544325121078, "grad_norm": 0.06533137367117652, "learning_rate": 2.5605889521188364e-07, "loss": 0.3491, "step": 1432 }, { "epoch": 4.838913455464308, "grad_norm": 0.06350312986484183, "learning_rate": 2.455117252721895e-07, "loss": 0.3453, "step": 1433 }, { "epoch": 4.8422825858075385, "grad_norm": 0.06475788404284327, "learning_rate": 2.351857004024316e-07, "loss": 0.3503, "step": 1434 }, { "epoch": 4.845651716150769, "grad_norm": 0.0631781774805789, "learning_rate": 2.2508087804390178e-07, "loss": 0.3446, "step": 1435 }, { "epoch": 4.849020846493999, "grad_norm": 0.06379282423784381, "learning_rate": 2.1519731440740487e-07, "loss": 0.3474, "step": 1436 }, { "epoch": 4.8523899768372285, "grad_norm": 0.06402172658556064, "learning_rate": 2.055350644729348e-07, "loss": 0.3511, "step": 1437 }, { "epoch": 4.855759107180459, "grad_norm": 0.06513215066751245, "learning_rate": 1.9609418198935916e-07, "loss": 0.3471, "step": 1438 }, { "epoch": 4.859128237523689, "grad_norm": 0.06283559414952865, "learning_rate": 1.8687471947413495e-07, "loss": 0.3446, "step": 1439 }, { "epoch": 4.862497367866919, "grad_norm": 0.06309493725276366, "learning_rate": 1.778767282130156e-07, "loss": 0.3431, "step": 1440 }, { "epoch": 4.865866498210149, "grad_norm": 0.06560809934411752, "learning_rate": 1.691002582597534e-07, "loss": 0.3526, "step": 1441 }, { "epoch": 4.86923562855338, "grad_norm": 0.06433417193452762, "learning_rate": 1.6054535843582854e-07, "loss": 0.3507, "step": 1442 }, { "epoch": 4.8726047588966095, "grad_norm": 0.06442999780392818, "learning_rate": 1.522120763301782e-07, "loss": 0.3492, "step": 1443 }, { "epoch": 4.87597388923984, "grad_norm": 0.06306148601810407, "learning_rate": 1.4410045829893915e-07, "loss": 0.3434, "step": 1444 }, { "epoch": 4.87934301958307, "grad_norm": 0.06308220046755993, "learning_rate": 1.3621054946517666e-07, "loss": 0.3445, "step": 1445 }, { "epoch": 4.8827121499263, "grad_norm": 0.06305097370353915, "learning_rate": 1.2854239371863142e-07, "loss": 0.3431, "step": 1446 }, { "epoch": 4.88608128026953, "grad_norm": 0.06293090962933129, "learning_rate": 1.2109603371548873e-07, "loss": 0.3397, "step": 1447 }, { "epoch": 4.889450410612761, "grad_norm": 0.06368330582611549, "learning_rate": 1.1387151087814297e-07, "loss": 0.3468, "step": 1448 }, { "epoch": 4.8928195409559905, "grad_norm": 0.0642396525858067, "learning_rate": 1.06868865394949e-07, "loss": 0.3419, "step": 1449 }, { "epoch": 4.896188671299221, "grad_norm": 0.06286580837917152, "learning_rate": 1.0008813622001345e-07, "loss": 0.3465, "step": 1450 }, { "epoch": 4.899557801642451, "grad_norm": 0.0646704999704258, "learning_rate": 9.352936107296817e-08, "loss": 0.3515, "step": 1451 }, { "epoch": 4.902926931985681, "grad_norm": 0.06254527612862122, "learning_rate": 8.719257643877044e-08, "loss": 0.3418, "step": 1452 }, { "epoch": 4.906296062328911, "grad_norm": 0.06265534232163783, "learning_rate": 8.107781756749866e-08, "loss": 0.3417, "step": 1453 }, { "epoch": 4.909665192672142, "grad_norm": 0.06417368994248919, "learning_rate": 7.51851184741481e-08, "loss": 0.3451, "step": 1454 }, { "epoch": 4.9130343230153715, "grad_norm": 0.06427635001716354, "learning_rate": 6.951451193844883e-08, "loss": 0.3517, "step": 1455 }, { "epoch": 4.916403453358602, "grad_norm": 0.06446286415220177, "learning_rate": 6.40660295046791e-08, "loss": 0.3499, "step": 1456 }, { "epoch": 4.919772583701832, "grad_norm": 0.06325304997383964, "learning_rate": 5.8839701481487875e-08, "loss": 0.3437, "step": 1457 }, { "epoch": 4.923141714045062, "grad_norm": 0.06376968784671593, "learning_rate": 5.3835556941743695e-08, "loss": 0.3423, "step": 1458 }, { "epoch": 4.926510844388292, "grad_norm": 0.06529781285688359, "learning_rate": 4.905362372234379e-08, "loss": 0.3492, "step": 1459 }, { "epoch": 4.929879974731523, "grad_norm": 0.06414078488995091, "learning_rate": 4.449392842408529e-08, "loss": 0.3479, "step": 1460 }, { "epoch": 4.9332491050747524, "grad_norm": 0.06362859239383568, "learning_rate": 4.015649641150976e-08, "loss": 0.3492, "step": 1461 }, { "epoch": 4.936618235417983, "grad_norm": 0.06341769294492185, "learning_rate": 3.6041351812743374e-08, "loss": 0.351, "step": 1462 }, { "epoch": 4.939987365761213, "grad_norm": 0.06486183719402762, "learning_rate": 3.21485175193903e-08, "loss": 0.3511, "step": 1463 }, { "epoch": 4.943356496104443, "grad_norm": 0.06360741943701602, "learning_rate": 2.8478015186399477e-08, "loss": 0.3471, "step": 1464 }, { "epoch": 4.946725626447673, "grad_norm": 0.06343696624954866, "learning_rate": 2.5029865231922524e-08, "loss": 0.3448, "step": 1465 }, { "epoch": 4.950094756790904, "grad_norm": 0.06343915127065658, "learning_rate": 2.1804086837229344e-08, "loss": 0.3416, "step": 1466 }, { "epoch": 4.953463887134133, "grad_norm": 0.06487303485827695, "learning_rate": 1.880069794657935e-08, "loss": 0.3444, "step": 1467 }, { "epoch": 4.956833017477363, "grad_norm": 0.062408603956769386, "learning_rate": 1.601971526713708e-08, "loss": 0.341, "step": 1468 }, { "epoch": 4.960202147820594, "grad_norm": 0.06255760369115392, "learning_rate": 1.3461154268865628e-08, "loss": 0.3445, "step": 1469 }, { "epoch": 4.963571278163824, "grad_norm": 0.062112638608570706, "learning_rate": 1.112502918445113e-08, "loss": 0.3391, "step": 1470 }, { "epoch": 4.966940408507054, "grad_norm": 0.06398681422452646, "learning_rate": 9.011353009222846e-09, "loss": 0.3455, "step": 1471 }, { "epoch": 4.970309538850284, "grad_norm": 0.0637738300165632, "learning_rate": 7.12013750107321e-09, "loss": 0.3438, "step": 1472 }, { "epoch": 4.973678669193514, "grad_norm": 0.06456086790149927, "learning_rate": 5.451393180400111e-09, "loss": 0.3486, "step": 1473 }, { "epoch": 4.977047799536745, "grad_norm": 0.06334490636848067, "learning_rate": 4.00512933004471e-09, "loss": 0.3456, "step": 1474 }, { "epoch": 4.980416929879975, "grad_norm": 0.06295292438577572, "learning_rate": 2.7813539952381563e-09, "loss": 0.3445, "step": 1475 }, { "epoch": 4.9837860602232045, "grad_norm": 0.0633108315280129, "learning_rate": 1.7800739835616143e-09, "loss": 0.3451, "step": 1476 }, { "epoch": 4.987155190566435, "grad_norm": 0.0630218856533905, "learning_rate": 1.0012948649018584e-09, "loss": 0.3497, "step": 1477 }, { "epoch": 4.990524320909666, "grad_norm": 0.06351551674205162, "learning_rate": 4.450209714379483e-10, "loss": 0.3382, "step": 1478 }, { "epoch": 4.993893451252895, "grad_norm": 0.06362931363383374, "learning_rate": 1.1125539757905756e-10, "loss": 0.3436, "step": 1479 }, { "epoch": 4.997262581596125, "grad_norm": 0.0635855435860357, "learning_rate": 0.0, "loss": 0.3456, "step": 1480 }, { "epoch": 4.997262581596125, "step": 1480, "total_flos": 3.94117975967185e+19, "train_loss": 0.06913654437741718, "train_runtime": 69116.03, "train_samples_per_second": 10.993, "train_steps_per_second": 0.021 } ], "logging_steps": 1, "max_steps": 1480, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.94117975967185e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }