diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10402 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.997262581596125, + "eval_steps": 500, + "global_step": 1480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0033691303432301536, + "grad_norm": 6.109744437799009, + "learning_rate": 5.405405405405406e-07, + "loss": 0.8395, + "step": 1 + }, + { + "epoch": 0.006738260686460307, + "grad_norm": 6.213278091341755, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.8539, + "step": 2 + }, + { + "epoch": 0.01010739102969046, + "grad_norm": 6.2309201627607536, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.862, + "step": 3 + }, + { + "epoch": 0.013476521372920615, + "grad_norm": 5.960777622679009, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.8418, + "step": 4 + }, + { + "epoch": 0.01684565171615077, + "grad_norm": 5.5645531135403825, + "learning_rate": 2.702702702702703e-06, + "loss": 0.8303, + "step": 5 + }, + { + "epoch": 0.02021478205938092, + "grad_norm": 4.368766112287261, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.7949, + "step": 6 + }, + { + "epoch": 0.023583912402611075, + "grad_norm": 2.4282762566778153, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.7538, + "step": 7 + }, + { + "epoch": 0.02695304274584123, + "grad_norm": 2.286826804105074, + "learning_rate": 4.324324324324325e-06, + "loss": 0.7537, + "step": 8 + }, + { + "epoch": 0.030322173089071383, + "grad_norm": 1.7230122995757746, + "learning_rate": 4.864864864864866e-06, + "loss": 0.7465, + "step": 9 + }, + { + "epoch": 0.03369130343230154, + "grad_norm": 4.178444915489739, + "learning_rate": 5.405405405405406e-06, + "loss": 0.7473, + "step": 10 + }, + { + "epoch": 0.03706043377553169, + "grad_norm": 4.418045830813186, + "learning_rate": 5.945945945945947e-06, + "loss": 0.7374, + "step": 11 + }, + { + "epoch": 0.04042956411876184, + "grad_norm": 4.465657225144453, + "learning_rate": 6.486486486486487e-06, + "loss": 0.7306, + "step": 12 + }, + { + "epoch": 0.043798694461992, + "grad_norm": 4.089235560381299, + "learning_rate": 7.027027027027028e-06, + "loss": 0.6953, + "step": 13 + }, + { + "epoch": 0.04716782480522215, + "grad_norm": 3.9135602100241798, + "learning_rate": 7.567567567567569e-06, + "loss": 0.6963, + "step": 14 + }, + { + "epoch": 0.05053695514845231, + "grad_norm": 2.799692430785198, + "learning_rate": 8.108108108108109e-06, + "loss": 0.6717, + "step": 15 + }, + { + "epoch": 0.05390608549168246, + "grad_norm": 1.7463496609273474, + "learning_rate": 8.64864864864865e-06, + "loss": 0.6544, + "step": 16 + }, + { + "epoch": 0.057275215834912616, + "grad_norm": 1.788941312078948, + "learning_rate": 9.189189189189191e-06, + "loss": 0.6521, + "step": 17 + }, + { + "epoch": 0.060644346178142766, + "grad_norm": 2.287622821921788, + "learning_rate": 9.729729729729732e-06, + "loss": 0.6429, + "step": 18 + }, + { + "epoch": 0.06401347652137292, + "grad_norm": 1.960430434087058, + "learning_rate": 1.027027027027027e-05, + "loss": 0.6392, + "step": 19 + }, + { + "epoch": 0.06738260686460308, + "grad_norm": 1.349903658959779, + "learning_rate": 1.0810810810810812e-05, + "loss": 0.6281, + "step": 20 + }, + { + "epoch": 0.07075173720783323, + "grad_norm": 1.2019647976807795, + "learning_rate": 1.1351351351351352e-05, + "loss": 0.6151, + "step": 21 + }, + { + "epoch": 0.07412086755106338, + "grad_norm": 0.9434380017683085, + "learning_rate": 1.1891891891891894e-05, + "loss": 0.604, + "step": 22 + }, + { + "epoch": 0.07748999789429353, + "grad_norm": 1.1709993819720563, + "learning_rate": 1.2432432432432433e-05, + "loss": 0.6027, + "step": 23 + }, + { + "epoch": 0.08085912823752368, + "grad_norm": 0.7729753637518647, + "learning_rate": 1.2972972972972975e-05, + "loss": 0.6009, + "step": 24 + }, + { + "epoch": 0.08422825858075385, + "grad_norm": 0.9253134596999972, + "learning_rate": 1.3513513513513515e-05, + "loss": 0.5784, + "step": 25 + }, + { + "epoch": 0.087597388923984, + "grad_norm": 0.7203325434817915, + "learning_rate": 1.4054054054054055e-05, + "loss": 0.5784, + "step": 26 + }, + { + "epoch": 0.09096651926721415, + "grad_norm": 0.7738717415397595, + "learning_rate": 1.4594594594594596e-05, + "loss": 0.5724, + "step": 27 + }, + { + "epoch": 0.0943356496104443, + "grad_norm": 0.756189527437592, + "learning_rate": 1.5135135135135138e-05, + "loss": 0.5751, + "step": 28 + }, + { + "epoch": 0.09770477995367446, + "grad_norm": 0.5619223236064955, + "learning_rate": 1.5675675675675676e-05, + "loss": 0.5632, + "step": 29 + }, + { + "epoch": 0.10107391029690461, + "grad_norm": 0.6416604397150266, + "learning_rate": 1.6216216216216218e-05, + "loss": 0.5624, + "step": 30 + }, + { + "epoch": 0.10444304064013477, + "grad_norm": 0.5213594465327983, + "learning_rate": 1.6756756756756757e-05, + "loss": 0.5618, + "step": 31 + }, + { + "epoch": 0.10781217098336492, + "grad_norm": 0.6096320957570693, + "learning_rate": 1.72972972972973e-05, + "loss": 0.5569, + "step": 32 + }, + { + "epoch": 0.11118130132659507, + "grad_norm": 0.45054132971017113, + "learning_rate": 1.783783783783784e-05, + "loss": 0.547, + "step": 33 + }, + { + "epoch": 0.11455043166982523, + "grad_norm": 0.4599454116974351, + "learning_rate": 1.8378378378378383e-05, + "loss": 0.5484, + "step": 34 + }, + { + "epoch": 0.11791956201305538, + "grad_norm": 0.5174247359394964, + "learning_rate": 1.891891891891892e-05, + "loss": 0.5493, + "step": 35 + }, + { + "epoch": 0.12128869235628553, + "grad_norm": 0.37366751205930016, + "learning_rate": 1.9459459459459463e-05, + "loss": 0.5391, + "step": 36 + }, + { + "epoch": 0.12465782269951568, + "grad_norm": 0.48653413640763127, + "learning_rate": 2e-05, + "loss": 0.5403, + "step": 37 + }, + { + "epoch": 0.12802695304274583, + "grad_norm": 0.3651703371460431, + "learning_rate": 2.054054054054054e-05, + "loss": 0.5432, + "step": 38 + }, + { + "epoch": 0.131396083385976, + "grad_norm": 0.5149214857154895, + "learning_rate": 2.1081081081081082e-05, + "loss": 0.5351, + "step": 39 + }, + { + "epoch": 0.13476521372920616, + "grad_norm": 0.3233162021565579, + "learning_rate": 2.1621621621621624e-05, + "loss": 0.5358, + "step": 40 + }, + { + "epoch": 0.1381343440724363, + "grad_norm": 0.3994220446903501, + "learning_rate": 2.2162162162162163e-05, + "loss": 0.5215, + "step": 41 + }, + { + "epoch": 0.14150347441566646, + "grad_norm": 0.3613182742062023, + "learning_rate": 2.2702702702702705e-05, + "loss": 0.5306, + "step": 42 + }, + { + "epoch": 0.1448726047588966, + "grad_norm": 0.3444241276930956, + "learning_rate": 2.3243243243243243e-05, + "loss": 0.5255, + "step": 43 + }, + { + "epoch": 0.14824173510212676, + "grad_norm": 0.4493831957153716, + "learning_rate": 2.378378378378379e-05, + "loss": 0.525, + "step": 44 + }, + { + "epoch": 0.15161086544535693, + "grad_norm": 0.44813168016549754, + "learning_rate": 2.4324324324324327e-05, + "loss": 0.5182, + "step": 45 + }, + { + "epoch": 0.15497999578858707, + "grad_norm": 0.6714598609090919, + "learning_rate": 2.4864864864864866e-05, + "loss": 0.5236, + "step": 46 + }, + { + "epoch": 0.15834912613181723, + "grad_norm": 0.8782480079092542, + "learning_rate": 2.5405405405405404e-05, + "loss": 0.5189, + "step": 47 + }, + { + "epoch": 0.16171825647504737, + "grad_norm": 0.933082396567328, + "learning_rate": 2.594594594594595e-05, + "loss": 0.5136, + "step": 48 + }, + { + "epoch": 0.16508738681827753, + "grad_norm": 0.770716538259162, + "learning_rate": 2.6486486486486488e-05, + "loss": 0.515, + "step": 49 + }, + { + "epoch": 0.1684565171615077, + "grad_norm": 0.5162725525339301, + "learning_rate": 2.702702702702703e-05, + "loss": 0.5102, + "step": 50 + }, + { + "epoch": 0.17182564750473783, + "grad_norm": 1.015431854648304, + "learning_rate": 2.756756756756757e-05, + "loss": 0.5155, + "step": 51 + }, + { + "epoch": 0.175194777847968, + "grad_norm": 0.9581799901627799, + "learning_rate": 2.810810810810811e-05, + "loss": 0.5144, + "step": 52 + }, + { + "epoch": 0.17856390819119813, + "grad_norm": 0.7963519939531664, + "learning_rate": 2.8648648648648653e-05, + "loss": 0.5097, + "step": 53 + }, + { + "epoch": 0.1819330385344283, + "grad_norm": 1.2154735765132731, + "learning_rate": 2.918918918918919e-05, + "loss": 0.5, + "step": 54 + }, + { + "epoch": 0.18530216887765846, + "grad_norm": 0.5396425181695459, + "learning_rate": 2.972972972972973e-05, + "loss": 0.5028, + "step": 55 + }, + { + "epoch": 0.1886712992208886, + "grad_norm": 1.02978967009914, + "learning_rate": 3.0270270270270275e-05, + "loss": 0.5081, + "step": 56 + }, + { + "epoch": 0.19204042956411876, + "grad_norm": 0.8713408712796716, + "learning_rate": 3.081081081081082e-05, + "loss": 0.5117, + "step": 57 + }, + { + "epoch": 0.19540955990734893, + "grad_norm": 0.5039355010657219, + "learning_rate": 3.135135135135135e-05, + "loss": 0.5062, + "step": 58 + }, + { + "epoch": 0.19877869025057907, + "grad_norm": 0.8903957505195225, + "learning_rate": 3.1891891891891894e-05, + "loss": 0.5065, + "step": 59 + }, + { + "epoch": 0.20214782059380923, + "grad_norm": 0.6438414779422286, + "learning_rate": 3.2432432432432436e-05, + "loss": 0.505, + "step": 60 + }, + { + "epoch": 0.20551695093703937, + "grad_norm": 0.7196844068315907, + "learning_rate": 3.297297297297298e-05, + "loss": 0.5027, + "step": 61 + }, + { + "epoch": 0.20888608128026953, + "grad_norm": 0.768470871542238, + "learning_rate": 3.351351351351351e-05, + "loss": 0.4952, + "step": 62 + }, + { + "epoch": 0.2122552116234997, + "grad_norm": 1.2385285643157924, + "learning_rate": 3.4054054054054055e-05, + "loss": 0.4998, + "step": 63 + }, + { + "epoch": 0.21562434196672983, + "grad_norm": 1.2926026603680456, + "learning_rate": 3.45945945945946e-05, + "loss": 0.5059, + "step": 64 + }, + { + "epoch": 0.21899347230996, + "grad_norm": 0.5270463926051342, + "learning_rate": 3.513513513513514e-05, + "loss": 0.4882, + "step": 65 + }, + { + "epoch": 0.22236260265319013, + "grad_norm": 0.9232242256936453, + "learning_rate": 3.567567567567568e-05, + "loss": 0.5001, + "step": 66 + }, + { + "epoch": 0.2257317329964203, + "grad_norm": 1.1802567310747412, + "learning_rate": 3.6216216216216216e-05, + "loss": 0.4988, + "step": 67 + }, + { + "epoch": 0.22910086333965046, + "grad_norm": 0.6824243910642153, + "learning_rate": 3.6756756756756765e-05, + "loss": 0.4968, + "step": 68 + }, + { + "epoch": 0.2324699936828806, + "grad_norm": 1.1492977773270214, + "learning_rate": 3.72972972972973e-05, + "loss": 0.492, + "step": 69 + }, + { + "epoch": 0.23583912402611076, + "grad_norm": 0.6936995427226362, + "learning_rate": 3.783783783783784e-05, + "loss": 0.484, + "step": 70 + }, + { + "epoch": 0.2392082543693409, + "grad_norm": 0.8248787799708727, + "learning_rate": 3.837837837837838e-05, + "loss": 0.4875, + "step": 71 + }, + { + "epoch": 0.24257738471257106, + "grad_norm": 0.9148972625023225, + "learning_rate": 3.8918918918918926e-05, + "loss": 0.4936, + "step": 72 + }, + { + "epoch": 0.24594651505580123, + "grad_norm": 0.9090987008379405, + "learning_rate": 3.945945945945946e-05, + "loss": 0.4915, + "step": 73 + }, + { + "epoch": 0.24931564539903137, + "grad_norm": 1.3515657782891444, + "learning_rate": 4e-05, + "loss": 0.4898, + "step": 74 + }, + { + "epoch": 0.2526847757422615, + "grad_norm": 0.9152494686227338, + "learning_rate": 4.0540540540540545e-05, + "loss": 0.492, + "step": 75 + }, + { + "epoch": 0.25605390608549167, + "grad_norm": 1.5681418879764368, + "learning_rate": 4.108108108108108e-05, + "loss": 0.4927, + "step": 76 + }, + { + "epoch": 0.25942303642872183, + "grad_norm": 0.7357882817582275, + "learning_rate": 4.162162162162163e-05, + "loss": 0.4916, + "step": 77 + }, + { + "epoch": 0.262792166771952, + "grad_norm": 1.8207664011692413, + "learning_rate": 4.2162162162162164e-05, + "loss": 0.496, + "step": 78 + }, + { + "epoch": 0.26616129711518216, + "grad_norm": 1.1821288274944997, + "learning_rate": 4.2702702702702706e-05, + "loss": 0.4853, + "step": 79 + }, + { + "epoch": 0.2695304274584123, + "grad_norm": 1.6328730676456176, + "learning_rate": 4.324324324324325e-05, + "loss": 0.4942, + "step": 80 + }, + { + "epoch": 0.27289955780164243, + "grad_norm": 1.5462148913519038, + "learning_rate": 4.3783783783783783e-05, + "loss": 0.5016, + "step": 81 + }, + { + "epoch": 0.2762686881448726, + "grad_norm": 1.0513902425500052, + "learning_rate": 4.4324324324324325e-05, + "loss": 0.4816, + "step": 82 + }, + { + "epoch": 0.27963781848810276, + "grad_norm": 1.6621005750940228, + "learning_rate": 4.4864864864864874e-05, + "loss": 0.4999, + "step": 83 + }, + { + "epoch": 0.2830069488313329, + "grad_norm": 1.0166569610477454, + "learning_rate": 4.540540540540541e-05, + "loss": 0.4909, + "step": 84 + }, + { + "epoch": 0.2863760791745631, + "grad_norm": 1.3532816330257298, + "learning_rate": 4.594594594594595e-05, + "loss": 0.4853, + "step": 85 + }, + { + "epoch": 0.2897452095177932, + "grad_norm": 1.1921249261435982, + "learning_rate": 4.6486486486486486e-05, + "loss": 0.4907, + "step": 86 + }, + { + "epoch": 0.29311433986102337, + "grad_norm": 0.7446095855395451, + "learning_rate": 4.702702702702703e-05, + "loss": 0.4829, + "step": 87 + }, + { + "epoch": 0.29648347020425353, + "grad_norm": 1.13704598401882, + "learning_rate": 4.756756756756758e-05, + "loss": 0.4923, + "step": 88 + }, + { + "epoch": 0.2998526005474837, + "grad_norm": 0.9773983706335432, + "learning_rate": 4.810810810810811e-05, + "loss": 0.4925, + "step": 89 + }, + { + "epoch": 0.30322173089071386, + "grad_norm": 1.405804106766561, + "learning_rate": 4.8648648648648654e-05, + "loss": 0.4988, + "step": 90 + }, + { + "epoch": 0.30659086123394397, + "grad_norm": 1.2120207198526078, + "learning_rate": 4.9189189189189196e-05, + "loss": 0.493, + "step": 91 + }, + { + "epoch": 0.30995999157717413, + "grad_norm": 1.209860911666279, + "learning_rate": 4.972972972972973e-05, + "loss": 0.4871, + "step": 92 + }, + { + "epoch": 0.3133291219204043, + "grad_norm": 1.1742086251606596, + "learning_rate": 5.027027027027027e-05, + "loss": 0.4892, + "step": 93 + }, + { + "epoch": 0.31669825226363446, + "grad_norm": 0.9715216714272247, + "learning_rate": 5.081081081081081e-05, + "loss": 0.4869, + "step": 94 + }, + { + "epoch": 0.3200673826068646, + "grad_norm": 1.1287315453457836, + "learning_rate": 5.135135135135136e-05, + "loss": 0.483, + "step": 95 + }, + { + "epoch": 0.32343651295009473, + "grad_norm": 1.3753216500561123, + "learning_rate": 5.18918918918919e-05, + "loss": 0.5011, + "step": 96 + }, + { + "epoch": 0.3268056432933249, + "grad_norm": 1.0595415038900966, + "learning_rate": 5.2432432432432434e-05, + "loss": 0.4872, + "step": 97 + }, + { + "epoch": 0.33017477363655506, + "grad_norm": 0.993959867595984, + "learning_rate": 5.2972972972972976e-05, + "loss": 0.4874, + "step": 98 + }, + { + "epoch": 0.33354390397978523, + "grad_norm": 0.9824629794475075, + "learning_rate": 5.3513513513513525e-05, + "loss": 0.4813, + "step": 99 + }, + { + "epoch": 0.3369130343230154, + "grad_norm": 1.1711893920755474, + "learning_rate": 5.405405405405406e-05, + "loss": 0.4799, + "step": 100 + }, + { + "epoch": 0.3402821646662455, + "grad_norm": 0.6759536241348655, + "learning_rate": 5.45945945945946e-05, + "loss": 0.472, + "step": 101 + }, + { + "epoch": 0.34365129500947567, + "grad_norm": 0.7703133612780192, + "learning_rate": 5.513513513513514e-05, + "loss": 0.484, + "step": 102 + }, + { + "epoch": 0.34702042535270583, + "grad_norm": 0.7769913812178919, + "learning_rate": 5.567567567567568e-05, + "loss": 0.4758, + "step": 103 + }, + { + "epoch": 0.350389555695936, + "grad_norm": 0.8455984342427874, + "learning_rate": 5.621621621621622e-05, + "loss": 0.4807, + "step": 104 + }, + { + "epoch": 0.35375868603916616, + "grad_norm": 1.0743068767474058, + "learning_rate": 5.6756756756756757e-05, + "loss": 0.4845, + "step": 105 + }, + { + "epoch": 0.35712781638239627, + "grad_norm": 1.098648574354644, + "learning_rate": 5.7297297297297305e-05, + "loss": 0.4912, + "step": 106 + }, + { + "epoch": 0.36049694672562643, + "grad_norm": 1.6110582797161879, + "learning_rate": 5.783783783783785e-05, + "loss": 0.4975, + "step": 107 + }, + { + "epoch": 0.3638660770688566, + "grad_norm": 0.8739341305512099, + "learning_rate": 5.837837837837838e-05, + "loss": 0.4785, + "step": 108 + }, + { + "epoch": 0.36723520741208676, + "grad_norm": 1.267103393588983, + "learning_rate": 5.8918918918918924e-05, + "loss": 0.4922, + "step": 109 + }, + { + "epoch": 0.3706043377553169, + "grad_norm": 1.2547562478396024, + "learning_rate": 5.945945945945946e-05, + "loss": 0.4865, + "step": 110 + }, + { + "epoch": 0.37397346809854703, + "grad_norm": 1.1629837530431066, + "learning_rate": 6.000000000000001e-05, + "loss": 0.4815, + "step": 111 + }, + { + "epoch": 0.3773425984417772, + "grad_norm": 0.8450218760805568, + "learning_rate": 6.054054054054055e-05, + "loss": 0.4795, + "step": 112 + }, + { + "epoch": 0.38071172878500736, + "grad_norm": 1.116338785518496, + "learning_rate": 6.108108108108108e-05, + "loss": 0.4806, + "step": 113 + }, + { + "epoch": 0.38408085912823753, + "grad_norm": 0.8964893535442878, + "learning_rate": 6.162162162162163e-05, + "loss": 0.4756, + "step": 114 + }, + { + "epoch": 0.3874499894714677, + "grad_norm": 0.5881211331403472, + "learning_rate": 6.216216216216216e-05, + "loss": 0.4732, + "step": 115 + }, + { + "epoch": 0.39081911981469786, + "grad_norm": 0.7134703598403237, + "learning_rate": 6.27027027027027e-05, + "loss": 0.4788, + "step": 116 + }, + { + "epoch": 0.39418825015792797, + "grad_norm": 0.5897113644451194, + "learning_rate": 6.324324324324325e-05, + "loss": 0.4728, + "step": 117 + }, + { + "epoch": 0.39755738050115813, + "grad_norm": 0.6261295369849983, + "learning_rate": 6.378378378378379e-05, + "loss": 0.4773, + "step": 118 + }, + { + "epoch": 0.4009265108443883, + "grad_norm": 0.6923893019220413, + "learning_rate": 6.432432432432433e-05, + "loss": 0.4762, + "step": 119 + }, + { + "epoch": 0.40429564118761846, + "grad_norm": 0.9773203912887567, + "learning_rate": 6.486486486486487e-05, + "loss": 0.4835, + "step": 120 + }, + { + "epoch": 0.4076647715308486, + "grad_norm": 1.2845541454753984, + "learning_rate": 6.540540540540541e-05, + "loss": 0.4742, + "step": 121 + }, + { + "epoch": 0.41103390187407873, + "grad_norm": 0.7789854415201711, + "learning_rate": 6.594594594594596e-05, + "loss": 0.4693, + "step": 122 + }, + { + "epoch": 0.4144030322173089, + "grad_norm": 0.8431373845569413, + "learning_rate": 6.648648648648648e-05, + "loss": 0.4831, + "step": 123 + }, + { + "epoch": 0.41777216256053906, + "grad_norm": 0.8809025032679428, + "learning_rate": 6.702702702702703e-05, + "loss": 0.48, + "step": 124 + }, + { + "epoch": 0.4211412929037692, + "grad_norm": 0.7787166669908304, + "learning_rate": 6.756756756756758e-05, + "loss": 0.4861, + "step": 125 + }, + { + "epoch": 0.4245104232469994, + "grad_norm": 0.7332477510232297, + "learning_rate": 6.810810810810811e-05, + "loss": 0.478, + "step": 126 + }, + { + "epoch": 0.4278795535902295, + "grad_norm": 1.3066224708152618, + "learning_rate": 6.864864864864865e-05, + "loss": 0.4813, + "step": 127 + }, + { + "epoch": 0.43124868393345966, + "grad_norm": 0.9836726211958252, + "learning_rate": 6.91891891891892e-05, + "loss": 0.4714, + "step": 128 + }, + { + "epoch": 0.43461781427668983, + "grad_norm": 0.9119614187689974, + "learning_rate": 6.972972972972974e-05, + "loss": 0.4769, + "step": 129 + }, + { + "epoch": 0.43798694461992, + "grad_norm": 0.6243806457837586, + "learning_rate": 7.027027027027028e-05, + "loss": 0.4794, + "step": 130 + }, + { + "epoch": 0.44135607496315016, + "grad_norm": 0.7687415551391915, + "learning_rate": 7.081081081081081e-05, + "loss": 0.4732, + "step": 131 + }, + { + "epoch": 0.44472520530638027, + "grad_norm": 1.0092750754274926, + "learning_rate": 7.135135135135136e-05, + "loss": 0.4776, + "step": 132 + }, + { + "epoch": 0.44809433564961043, + "grad_norm": 0.8380704714410115, + "learning_rate": 7.18918918918919e-05, + "loss": 0.4648, + "step": 133 + }, + { + "epoch": 0.4514634659928406, + "grad_norm": 0.6767690752757246, + "learning_rate": 7.243243243243243e-05, + "loss": 0.4609, + "step": 134 + }, + { + "epoch": 0.45483259633607076, + "grad_norm": 0.9804619524504721, + "learning_rate": 7.297297297297297e-05, + "loss": 0.4713, + "step": 135 + }, + { + "epoch": 0.4582017266793009, + "grad_norm": 1.3776587580151205, + "learning_rate": 7.351351351351353e-05, + "loss": 0.4794, + "step": 136 + }, + { + "epoch": 0.46157085702253103, + "grad_norm": 0.5714903502719861, + "learning_rate": 7.405405405405406e-05, + "loss": 0.4693, + "step": 137 + }, + { + "epoch": 0.4649399873657612, + "grad_norm": 1.1165381199232975, + "learning_rate": 7.45945945945946e-05, + "loss": 0.4789, + "step": 138 + }, + { + "epoch": 0.46830911770899136, + "grad_norm": 1.1391177830520112, + "learning_rate": 7.513513513513514e-05, + "loss": 0.4763, + "step": 139 + }, + { + "epoch": 0.4716782480522215, + "grad_norm": 0.8405758310660678, + "learning_rate": 7.567567567567568e-05, + "loss": 0.4708, + "step": 140 + }, + { + "epoch": 0.4750473783954517, + "grad_norm": 0.673259823629261, + "learning_rate": 7.621621621621623e-05, + "loss": 0.4709, + "step": 141 + }, + { + "epoch": 0.4784165087386818, + "grad_norm": 0.9085593163135716, + "learning_rate": 7.675675675675675e-05, + "loss": 0.4658, + "step": 142 + }, + { + "epoch": 0.48178563908191196, + "grad_norm": 0.9065492978219933, + "learning_rate": 7.729729729729731e-05, + "loss": 0.4661, + "step": 143 + }, + { + "epoch": 0.48515476942514213, + "grad_norm": 0.8751584723634406, + "learning_rate": 7.783783783783785e-05, + "loss": 0.4725, + "step": 144 + }, + { + "epoch": 0.4885238997683723, + "grad_norm": 0.6907562652250656, + "learning_rate": 7.837837837837838e-05, + "loss": 0.4684, + "step": 145 + }, + { + "epoch": 0.49189303011160246, + "grad_norm": 0.5990641326148477, + "learning_rate": 7.891891891891892e-05, + "loss": 0.4672, + "step": 146 + }, + { + "epoch": 0.4952621604548326, + "grad_norm": 0.6994191437855128, + "learning_rate": 7.945945945945946e-05, + "loss": 0.4662, + "step": 147 + }, + { + "epoch": 0.49863129079806273, + "grad_norm": 0.5573598940486624, + "learning_rate": 8e-05, + "loss": 0.4665, + "step": 148 + }, + { + "epoch": 0.502000421141293, + "grad_norm": 0.6145912929008095, + "learning_rate": 7.999988874460243e-05, + "loss": 0.4669, + "step": 149 + }, + { + "epoch": 0.505369551484523, + "grad_norm": 0.8011807879753905, + "learning_rate": 7.999955497902857e-05, + "loss": 0.4669, + "step": 150 + }, + { + "epoch": 0.5087386818277532, + "grad_norm": 0.8616234404683479, + "learning_rate": 7.99989987051351e-05, + "loss": 0.4721, + "step": 151 + }, + { + "epoch": 0.5121078121709833, + "grad_norm": 0.7813814403741567, + "learning_rate": 7.999821992601645e-05, + "loss": 0.4753, + "step": 152 + }, + { + "epoch": 0.5154769425142135, + "grad_norm": 0.8208221005516424, + "learning_rate": 7.999721864600476e-05, + "loss": 0.4648, + "step": 153 + }, + { + "epoch": 0.5188460728574437, + "grad_norm": 0.7471274236204338, + "learning_rate": 7.999599487066996e-05, + "loss": 0.4665, + "step": 154 + }, + { + "epoch": 0.5222152032006738, + "grad_norm": 0.6025705010343646, + "learning_rate": 7.999454860681961e-05, + "loss": 0.4646, + "step": 155 + }, + { + "epoch": 0.525584333543904, + "grad_norm": 0.6278670733672859, + "learning_rate": 7.999287986249894e-05, + "loss": 0.4582, + "step": 156 + }, + { + "epoch": 0.5289534638871342, + "grad_norm": 0.7363381482182718, + "learning_rate": 7.999098864699078e-05, + "loss": 0.4644, + "step": 157 + }, + { + "epoch": 0.5323225942303643, + "grad_norm": 0.5321478176964342, + "learning_rate": 7.998887497081555e-05, + "loss": 0.4558, + "step": 158 + }, + { + "epoch": 0.5356917245735945, + "grad_norm": 0.5084748356369074, + "learning_rate": 7.998653884573114e-05, + "loss": 0.4576, + "step": 159 + }, + { + "epoch": 0.5390608549168246, + "grad_norm": 0.44466968144745794, + "learning_rate": 7.998398028473287e-05, + "loss": 0.4628, + "step": 160 + }, + { + "epoch": 0.5424299852600547, + "grad_norm": 0.5300560878644925, + "learning_rate": 7.998119930205342e-05, + "loss": 0.4587, + "step": 161 + }, + { + "epoch": 0.5457991156032849, + "grad_norm": 0.4482671223105369, + "learning_rate": 7.997819591316278e-05, + "loss": 0.4595, + "step": 162 + }, + { + "epoch": 0.549168245946515, + "grad_norm": 0.3831134887002804, + "learning_rate": 7.997497013476808e-05, + "loss": 0.4621, + "step": 163 + }, + { + "epoch": 0.5525373762897452, + "grad_norm": 0.42236120459010645, + "learning_rate": 7.99715219848136e-05, + "loss": 0.4574, + "step": 164 + }, + { + "epoch": 0.5559065066329754, + "grad_norm": 0.4457523958461928, + "learning_rate": 7.996785148248062e-05, + "loss": 0.4597, + "step": 165 + }, + { + "epoch": 0.5592756369762055, + "grad_norm": 0.5006917210647484, + "learning_rate": 7.996395864818727e-05, + "loss": 0.4594, + "step": 166 + }, + { + "epoch": 0.5626447673194357, + "grad_norm": 0.5999241658726214, + "learning_rate": 7.995984350358851e-05, + "loss": 0.4578, + "step": 167 + }, + { + "epoch": 0.5660138976626659, + "grad_norm": 0.7291489485043735, + "learning_rate": 7.995550607157592e-05, + "loss": 0.4538, + "step": 168 + }, + { + "epoch": 0.569383028005896, + "grad_norm": 0.7577464603442905, + "learning_rate": 7.995094637627767e-05, + "loss": 0.4507, + "step": 169 + }, + { + "epoch": 0.5727521583491262, + "grad_norm": 0.5373115669466836, + "learning_rate": 7.994616444305826e-05, + "loss": 0.4602, + "step": 170 + }, + { + "epoch": 0.5761212886923562, + "grad_norm": 0.5783937366804819, + "learning_rate": 7.994116029851852e-05, + "loss": 0.4621, + "step": 171 + }, + { + "epoch": 0.5794904190355864, + "grad_norm": 0.7289647138839453, + "learning_rate": 7.993593397049533e-05, + "loss": 0.4569, + "step": 172 + }, + { + "epoch": 0.5828595493788166, + "grad_norm": 0.7726864760162053, + "learning_rate": 7.993048548806155e-05, + "loss": 0.4609, + "step": 173 + }, + { + "epoch": 0.5862286797220467, + "grad_norm": 0.7101749816908381, + "learning_rate": 7.992481488152585e-05, + "loss": 0.4628, + "step": 174 + }, + { + "epoch": 0.5895978100652769, + "grad_norm": 0.7787526674806393, + "learning_rate": 7.991892218243251e-05, + "loss": 0.4664, + "step": 175 + }, + { + "epoch": 0.5929669404085071, + "grad_norm": 0.9193285112654672, + "learning_rate": 7.991280742356124e-05, + "loss": 0.4583, + "step": 176 + }, + { + "epoch": 0.5963360707517372, + "grad_norm": 0.863766394540256, + "learning_rate": 7.990647063892704e-05, + "loss": 0.4532, + "step": 177 + }, + { + "epoch": 0.5997052010949674, + "grad_norm": 0.7969950754484971, + "learning_rate": 7.989991186378e-05, + "loss": 0.4649, + "step": 178 + }, + { + "epoch": 0.6030743314381976, + "grad_norm": 0.9175228695778532, + "learning_rate": 7.989313113460506e-05, + "loss": 0.4598, + "step": 179 + }, + { + "epoch": 0.6064434617814277, + "grad_norm": 1.189324294096932, + "learning_rate": 7.988612848912186e-05, + "loss": 0.4616, + "step": 180 + }, + { + "epoch": 0.6098125921246578, + "grad_norm": 0.5502633850375939, + "learning_rate": 7.987890396628451e-05, + "loss": 0.4506, + "step": 181 + }, + { + "epoch": 0.6131817224678879, + "grad_norm": 0.6418325300837303, + "learning_rate": 7.987145760628138e-05, + "loss": 0.4589, + "step": 182 + }, + { + "epoch": 0.6165508528111181, + "grad_norm": 0.8487957991579048, + "learning_rate": 7.986378945053483e-05, + "loss": 0.4534, + "step": 183 + }, + { + "epoch": 0.6199199831543483, + "grad_norm": 0.729090543693198, + "learning_rate": 7.985589954170107e-05, + "loss": 0.4502, + "step": 184 + }, + { + "epoch": 0.6232891134975784, + "grad_norm": 0.564140229622775, + "learning_rate": 7.984778792366983e-05, + "loss": 0.4561, + "step": 185 + }, + { + "epoch": 0.6266582438408086, + "grad_norm": 0.5489014465662102, + "learning_rate": 7.983945464156419e-05, + "loss": 0.4511, + "step": 186 + }, + { + "epoch": 0.6300273741840388, + "grad_norm": 0.4439092473485429, + "learning_rate": 7.983089974174026e-05, + "loss": 0.4592, + "step": 187 + }, + { + "epoch": 0.6333965045272689, + "grad_norm": 0.4899343556871492, + "learning_rate": 7.982212327178699e-05, + "loss": 0.4576, + "step": 188 + }, + { + "epoch": 0.6367656348704991, + "grad_norm": 0.4429930723656228, + "learning_rate": 7.981312528052587e-05, + "loss": 0.4527, + "step": 189 + }, + { + "epoch": 0.6401347652137293, + "grad_norm": 0.3517045134537643, + "learning_rate": 7.980390581801064e-05, + "loss": 0.4533, + "step": 190 + }, + { + "epoch": 0.6435038955569593, + "grad_norm": 0.35596532078238685, + "learning_rate": 7.979446493552708e-05, + "loss": 0.4512, + "step": 191 + }, + { + "epoch": 0.6468730259001895, + "grad_norm": 0.4117813215790183, + "learning_rate": 7.97848026855926e-05, + "loss": 0.4427, + "step": 192 + }, + { + "epoch": 0.6502421562434196, + "grad_norm": 0.42570694906406503, + "learning_rate": 7.977491912195611e-05, + "loss": 0.4559, + "step": 193 + }, + { + "epoch": 0.6536112865866498, + "grad_norm": 0.32926038316817535, + "learning_rate": 7.976481429959758e-05, + "loss": 0.4525, + "step": 194 + }, + { + "epoch": 0.65698041692988, + "grad_norm": 0.3352588049162969, + "learning_rate": 7.975448827472782e-05, + "loss": 0.4465, + "step": 195 + }, + { + "epoch": 0.6603495472731101, + "grad_norm": 0.3121745237951815, + "learning_rate": 7.974394110478813e-05, + "loss": 0.4504, + "step": 196 + }, + { + "epoch": 0.6637186776163403, + "grad_norm": 0.3514443346936628, + "learning_rate": 7.973317284844998e-05, + "loss": 0.4543, + "step": 197 + }, + { + "epoch": 0.6670878079595705, + "grad_norm": 0.36563500765518064, + "learning_rate": 7.972218356561471e-05, + "loss": 0.4466, + "step": 198 + }, + { + "epoch": 0.6704569383028006, + "grad_norm": 0.36993328537084813, + "learning_rate": 7.971097331741318e-05, + "loss": 0.447, + "step": 199 + }, + { + "epoch": 0.6738260686460308, + "grad_norm": 0.4218574088374599, + "learning_rate": 7.96995421662054e-05, + "loss": 0.4456, + "step": 200 + }, + { + "epoch": 0.677195198989261, + "grad_norm": 0.5127127798248658, + "learning_rate": 7.968789017558026e-05, + "loss": 0.4367, + "step": 201 + }, + { + "epoch": 0.680564329332491, + "grad_norm": 0.5533862982628416, + "learning_rate": 7.967601741035507e-05, + "loss": 0.4464, + "step": 202 + }, + { + "epoch": 0.6839334596757212, + "grad_norm": 0.5128169904646379, + "learning_rate": 7.966392393657533e-05, + "loss": 0.4493, + "step": 203 + }, + { + "epoch": 0.6873025900189513, + "grad_norm": 0.47256773564418525, + "learning_rate": 7.965160982151422e-05, + "loss": 0.4536, + "step": 204 + }, + { + "epoch": 0.6906717203621815, + "grad_norm": 0.452879409095403, + "learning_rate": 7.963907513367234e-05, + "loss": 0.4589, + "step": 205 + }, + { + "epoch": 0.6940408507054117, + "grad_norm": 0.455219584683228, + "learning_rate": 7.962631994277728e-05, + "loss": 0.4414, + "step": 206 + }, + { + "epoch": 0.6974099810486418, + "grad_norm": 0.47863589957769587, + "learning_rate": 7.961334431978321e-05, + "loss": 0.4486, + "step": 207 + }, + { + "epoch": 0.700779111391872, + "grad_norm": 0.5110385780704738, + "learning_rate": 7.960014833687055e-05, + "loss": 0.4495, + "step": 208 + }, + { + "epoch": 0.7041482417351022, + "grad_norm": 0.4683257451933529, + "learning_rate": 7.958673206744553e-05, + "loss": 0.4522, + "step": 209 + }, + { + "epoch": 0.7075173720783323, + "grad_norm": 0.4506553993940309, + "learning_rate": 7.957309558613974e-05, + "loss": 0.4452, + "step": 210 + }, + { + "epoch": 0.7108865024215625, + "grad_norm": 0.4526028368594711, + "learning_rate": 7.955923896880982e-05, + "loss": 0.4456, + "step": 211 + }, + { + "epoch": 0.7142556327647925, + "grad_norm": 0.5212859488073646, + "learning_rate": 7.954516229253691e-05, + "loss": 0.4482, + "step": 212 + }, + { + "epoch": 0.7176247631080227, + "grad_norm": 0.4908480827080424, + "learning_rate": 7.953086563562635e-05, + "loss": 0.4404, + "step": 213 + }, + { + "epoch": 0.7209938934512529, + "grad_norm": 0.43474906852801837, + "learning_rate": 7.951634907760713e-05, + "loss": 0.4415, + "step": 214 + }, + { + "epoch": 0.724363023794483, + "grad_norm": 0.5465543422325746, + "learning_rate": 7.950161269923153e-05, + "loss": 0.453, + "step": 215 + }, + { + "epoch": 0.7277321541377132, + "grad_norm": 0.5191090578880476, + "learning_rate": 7.948665658247463e-05, + "loss": 0.4511, + "step": 216 + }, + { + "epoch": 0.7311012844809434, + "grad_norm": 0.41056922017028197, + "learning_rate": 7.947148081053388e-05, + "loss": 0.4428, + "step": 217 + }, + { + "epoch": 0.7344704148241735, + "grad_norm": 0.4280367756173325, + "learning_rate": 7.945608546782858e-05, + "loss": 0.4552, + "step": 218 + }, + { + "epoch": 0.7378395451674037, + "grad_norm": 0.44143498781875934, + "learning_rate": 7.944047063999952e-05, + "loss": 0.4461, + "step": 219 + }, + { + "epoch": 0.7412086755106339, + "grad_norm": 0.4671020826488003, + "learning_rate": 7.942463641390834e-05, + "loss": 0.433, + "step": 220 + }, + { + "epoch": 0.744577805853864, + "grad_norm": 0.4802991806753108, + "learning_rate": 7.940858287763724e-05, + "loss": 0.4487, + "step": 221 + }, + { + "epoch": 0.7479469361970941, + "grad_norm": 0.4271821601132076, + "learning_rate": 7.939231012048833e-05, + "loss": 0.4509, + "step": 222 + }, + { + "epoch": 0.7513160665403242, + "grad_norm": 0.38123610223687315, + "learning_rate": 7.93758182329832e-05, + "loss": 0.4372, + "step": 223 + }, + { + "epoch": 0.7546851968835544, + "grad_norm": 0.465830487650423, + "learning_rate": 7.935910730686246e-05, + "loss": 0.4444, + "step": 224 + }, + { + "epoch": 0.7580543272267846, + "grad_norm": 0.5651393352119582, + "learning_rate": 7.934217743508513e-05, + "loss": 0.4468, + "step": 225 + }, + { + "epoch": 0.7614234575700147, + "grad_norm": 0.6526912705793722, + "learning_rate": 7.932502871182818e-05, + "loss": 0.4509, + "step": 226 + }, + { + "epoch": 0.7647925879132449, + "grad_norm": 0.7684525411435036, + "learning_rate": 7.930766123248602e-05, + "loss": 0.4475, + "step": 227 + }, + { + "epoch": 0.7681617182564751, + "grad_norm": 0.8868257582573387, + "learning_rate": 7.929007509366994e-05, + "loss": 0.4486, + "step": 228 + }, + { + "epoch": 0.7715308485997052, + "grad_norm": 0.9592751619745519, + "learning_rate": 7.927227039320758e-05, + "loss": 0.442, + "step": 229 + }, + { + "epoch": 0.7748999789429354, + "grad_norm": 0.8928159966805775, + "learning_rate": 7.925424723014239e-05, + "loss": 0.4541, + "step": 230 + }, + { + "epoch": 0.7782691092861656, + "grad_norm": 0.7880900131568054, + "learning_rate": 7.923600570473308e-05, + "loss": 0.4514, + "step": 231 + }, + { + "epoch": 0.7816382396293957, + "grad_norm": 0.4783123604515285, + "learning_rate": 7.921754591845307e-05, + "loss": 0.4442, + "step": 232 + }, + { + "epoch": 0.7850073699726258, + "grad_norm": 0.4520386015737669, + "learning_rate": 7.91988679739899e-05, + "loss": 0.448, + "step": 233 + }, + { + "epoch": 0.7883765003158559, + "grad_norm": 0.6605527609379506, + "learning_rate": 7.917997197524467e-05, + "loss": 0.4435, + "step": 234 + }, + { + "epoch": 0.7917456306590861, + "grad_norm": 0.7089385732745206, + "learning_rate": 7.916085802733147e-05, + "loss": 0.4449, + "step": 235 + }, + { + "epoch": 0.7951147610023163, + "grad_norm": 0.5904512970852802, + "learning_rate": 7.914152623657678e-05, + "loss": 0.448, + "step": 236 + }, + { + "epoch": 0.7984838913455464, + "grad_norm": 0.5165195483185807, + "learning_rate": 7.912197671051894e-05, + "loss": 0.4475, + "step": 237 + }, + { + "epoch": 0.8018530216887766, + "grad_norm": 0.47278629514591364, + "learning_rate": 7.910220955790746e-05, + "loss": 0.447, + "step": 238 + }, + { + "epoch": 0.8052221520320068, + "grad_norm": 0.4466680465677497, + "learning_rate": 7.908222488870243e-05, + "loss": 0.4471, + "step": 239 + }, + { + "epoch": 0.8085912823752369, + "grad_norm": 0.40052321749076436, + "learning_rate": 7.906202281407398e-05, + "loss": 0.4453, + "step": 240 + }, + { + "epoch": 0.8119604127184671, + "grad_norm": 0.3808574042244712, + "learning_rate": 7.90416034464016e-05, + "loss": 0.4467, + "step": 241 + }, + { + "epoch": 0.8153295430616972, + "grad_norm": 0.3009379630644614, + "learning_rate": 7.902096689927355e-05, + "loss": 0.4405, + "step": 242 + }, + { + "epoch": 0.8186986734049273, + "grad_norm": 0.4006333439696202, + "learning_rate": 7.900011328748619e-05, + "loss": 0.441, + "step": 243 + }, + { + "epoch": 0.8220678037481575, + "grad_norm": 0.36250537572683333, + "learning_rate": 7.897904272704333e-05, + "loss": 0.4382, + "step": 244 + }, + { + "epoch": 0.8254369340913876, + "grad_norm": 0.37232144501481734, + "learning_rate": 7.895775533515569e-05, + "loss": 0.4455, + "step": 245 + }, + { + "epoch": 0.8288060644346178, + "grad_norm": 0.4169869556836039, + "learning_rate": 7.893625123024011e-05, + "loss": 0.4356, + "step": 246 + }, + { + "epoch": 0.832175194777848, + "grad_norm": 0.3864353557408192, + "learning_rate": 7.891453053191898e-05, + "loss": 0.4435, + "step": 247 + }, + { + "epoch": 0.8355443251210781, + "grad_norm": 0.3608352846793135, + "learning_rate": 7.889259336101957e-05, + "loss": 0.4462, + "step": 248 + }, + { + "epoch": 0.8389134554643083, + "grad_norm": 0.32373631118958723, + "learning_rate": 7.887043983957327e-05, + "loss": 0.4375, + "step": 249 + }, + { + "epoch": 0.8422825858075385, + "grad_norm": 0.26424914090383317, + "learning_rate": 7.884807009081506e-05, + "loss": 0.4375, + "step": 250 + }, + { + "epoch": 0.8456517161507686, + "grad_norm": 0.22444081020907958, + "learning_rate": 7.882548423918268e-05, + "loss": 0.4413, + "step": 251 + }, + { + "epoch": 0.8490208464939988, + "grad_norm": 0.26045857383329957, + "learning_rate": 7.880268241031604e-05, + "loss": 0.4317, + "step": 252 + }, + { + "epoch": 0.8523899768372288, + "grad_norm": 0.30550339254012787, + "learning_rate": 7.877966473105645e-05, + "loss": 0.4458, + "step": 253 + }, + { + "epoch": 0.855759107180459, + "grad_norm": 0.34559528308231324, + "learning_rate": 7.875643132944599e-05, + "loss": 0.4403, + "step": 254 + }, + { + "epoch": 0.8591282375236892, + "grad_norm": 0.35710994685108394, + "learning_rate": 7.873298233472671e-05, + "loss": 0.4394, + "step": 255 + }, + { + "epoch": 0.8624973678669193, + "grad_norm": 0.42956681122910056, + "learning_rate": 7.870931787733996e-05, + "loss": 0.4403, + "step": 256 + }, + { + "epoch": 0.8658664982101495, + "grad_norm": 0.5626197718228877, + "learning_rate": 7.868543808892569e-05, + "loss": 0.4387, + "step": 257 + }, + { + "epoch": 0.8692356285533797, + "grad_norm": 0.6076789146858117, + "learning_rate": 7.866134310232167e-05, + "loss": 0.4439, + "step": 258 + }, + { + "epoch": 0.8726047588966098, + "grad_norm": 0.5742280027785791, + "learning_rate": 7.863703305156273e-05, + "loss": 0.4455, + "step": 259 + }, + { + "epoch": 0.87597388923984, + "grad_norm": 0.5069317059933754, + "learning_rate": 7.861250807188014e-05, + "loss": 0.4476, + "step": 260 + }, + { + "epoch": 0.8793430195830702, + "grad_norm": 0.4288223928021788, + "learning_rate": 7.858776829970069e-05, + "loss": 0.4379, + "step": 261 + }, + { + "epoch": 0.8827121499263003, + "grad_norm": 0.5442592728854474, + "learning_rate": 7.856281387264603e-05, + "loss": 0.4379, + "step": 262 + }, + { + "epoch": 0.8860812802695305, + "grad_norm": 0.5638482346313414, + "learning_rate": 7.853764492953192e-05, + "loss": 0.4444, + "step": 263 + }, + { + "epoch": 0.8894504106127605, + "grad_norm": 0.4523819114426828, + "learning_rate": 7.851226161036739e-05, + "loss": 0.4394, + "step": 264 + }, + { + "epoch": 0.8928195409559907, + "grad_norm": 0.5349306408767115, + "learning_rate": 7.848666405635398e-05, + "loss": 0.441, + "step": 265 + }, + { + "epoch": 0.8961886712992209, + "grad_norm": 0.5452142089194884, + "learning_rate": 7.846085240988503e-05, + "loss": 0.4483, + "step": 266 + }, + { + "epoch": 0.899557801642451, + "grad_norm": 0.4222443920522887, + "learning_rate": 7.843482681454476e-05, + "loss": 0.4407, + "step": 267 + }, + { + "epoch": 0.9029269319856812, + "grad_norm": 0.5310106072977896, + "learning_rate": 7.840858741510758e-05, + "loss": 0.4442, + "step": 268 + }, + { + "epoch": 0.9062960623289114, + "grad_norm": 0.5876077411696179, + "learning_rate": 7.838213435753724e-05, + "loss": 0.4438, + "step": 269 + }, + { + "epoch": 0.9096651926721415, + "grad_norm": 0.6100738415200538, + "learning_rate": 7.835546778898599e-05, + "loss": 0.4465, + "step": 270 + }, + { + "epoch": 0.9130343230153717, + "grad_norm": 0.6760561504138676, + "learning_rate": 7.832858785779383e-05, + "loss": 0.4338, + "step": 271 + }, + { + "epoch": 0.9164034533586018, + "grad_norm": 0.45392830007094576, + "learning_rate": 7.830149471348763e-05, + "loss": 0.431, + "step": 272 + }, + { + "epoch": 0.919772583701832, + "grad_norm": 0.30596440551036547, + "learning_rate": 7.827418850678034e-05, + "loss": 0.4396, + "step": 273 + }, + { + "epoch": 0.9231417140450621, + "grad_norm": 0.4969999175377505, + "learning_rate": 7.824666938957004e-05, + "loss": 0.4375, + "step": 274 + }, + { + "epoch": 0.9265108443882922, + "grad_norm": 0.5437640388773309, + "learning_rate": 7.82189375149393e-05, + "loss": 0.444, + "step": 275 + }, + { + "epoch": 0.9298799747315224, + "grad_norm": 0.4134501055661062, + "learning_rate": 7.819099303715414e-05, + "loss": 0.4385, + "step": 276 + }, + { + "epoch": 0.9332491050747526, + "grad_norm": 0.3810051790575615, + "learning_rate": 7.816283611166328e-05, + "loss": 0.4339, + "step": 277 + }, + { + "epoch": 0.9366182354179827, + "grad_norm": 0.4135193612689647, + "learning_rate": 7.813446689509718e-05, + "loss": 0.4413, + "step": 278 + }, + { + "epoch": 0.9399873657612129, + "grad_norm": 0.5154216890519913, + "learning_rate": 7.810588554526728e-05, + "loss": 0.4409, + "step": 279 + }, + { + "epoch": 0.943356496104443, + "grad_norm": 0.5335234306967277, + "learning_rate": 7.807709222116506e-05, + "loss": 0.4392, + "step": 280 + }, + { + "epoch": 0.9467256264476732, + "grad_norm": 0.4582890089443176, + "learning_rate": 7.804808708296116e-05, + "loss": 0.44, + "step": 281 + }, + { + "epoch": 0.9500947567909034, + "grad_norm": 0.41636142631229706, + "learning_rate": 7.801887029200448e-05, + "loss": 0.4359, + "step": 282 + }, + { + "epoch": 0.9534638871341335, + "grad_norm": 0.3777680522962764, + "learning_rate": 7.798944201082128e-05, + "loss": 0.4305, + "step": 283 + }, + { + "epoch": 0.9568330174773636, + "grad_norm": 0.31197040692277506, + "learning_rate": 7.795980240311436e-05, + "loss": 0.4378, + "step": 284 + }, + { + "epoch": 0.9602021478205938, + "grad_norm": 0.2615719658181643, + "learning_rate": 7.7929951633762e-05, + "loss": 0.4349, + "step": 285 + }, + { + "epoch": 0.9635712781638239, + "grad_norm": 0.27255928093352183, + "learning_rate": 7.789988986881719e-05, + "loss": 0.4324, + "step": 286 + }, + { + "epoch": 0.9669404085070541, + "grad_norm": 0.3086259327892651, + "learning_rate": 7.78696172755066e-05, + "loss": 0.4338, + "step": 287 + }, + { + "epoch": 0.9703095388502843, + "grad_norm": 0.3128738492807504, + "learning_rate": 7.78391340222297e-05, + "loss": 0.4327, + "step": 288 + }, + { + "epoch": 0.9736786691935144, + "grad_norm": 0.28991557468061835, + "learning_rate": 7.78084402785578e-05, + "loss": 0.4368, + "step": 289 + }, + { + "epoch": 0.9770477995367446, + "grad_norm": 0.3462635013389902, + "learning_rate": 7.777753621523316e-05, + "loss": 0.4376, + "step": 290 + }, + { + "epoch": 0.9804169298799748, + "grad_norm": 0.41703460759212563, + "learning_rate": 7.774642200416795e-05, + "loss": 0.4364, + "step": 291 + }, + { + "epoch": 0.9837860602232049, + "grad_norm": 0.5058437435233563, + "learning_rate": 7.771509781844338e-05, + "loss": 0.4392, + "step": 292 + }, + { + "epoch": 0.9871551905664351, + "grad_norm": 0.49478795685868665, + "learning_rate": 7.768356383230868e-05, + "loss": 0.4387, + "step": 293 + }, + { + "epoch": 0.9905243209096652, + "grad_norm": 0.4745986454402833, + "learning_rate": 7.765182022118014e-05, + "loss": 0.435, + "step": 294 + }, + { + "epoch": 0.9938934512528953, + "grad_norm": 0.4611674206006931, + "learning_rate": 7.761986716164019e-05, + "loss": 0.4379, + "step": 295 + }, + { + "epoch": 0.9972625815961255, + "grad_norm": 0.42674160555276347, + "learning_rate": 7.758770483143634e-05, + "loss": 0.4408, + "step": 296 + }, + { + "epoch": 1.0033691303432302, + "grad_norm": 0.4680506210026581, + "learning_rate": 7.755533340948024e-05, + "loss": 0.4223, + "step": 297 + }, + { + "epoch": 1.0067382606864603, + "grad_norm": 0.5238195474514908, + "learning_rate": 7.752275307584664e-05, + "loss": 0.4295, + "step": 298 + }, + { + "epoch": 1.0101073910296905, + "grad_norm": 0.5889650401759404, + "learning_rate": 7.748996401177244e-05, + "loss": 0.4275, + "step": 299 + }, + { + "epoch": 1.0134765213729207, + "grad_norm": 0.5507636965946558, + "learning_rate": 7.745696639965569e-05, + "loss": 0.4194, + "step": 300 + }, + { + "epoch": 1.0168456517161508, + "grad_norm": 0.6157938140990948, + "learning_rate": 7.742376042305449e-05, + "loss": 0.433, + "step": 301 + }, + { + "epoch": 1.020214782059381, + "grad_norm": 0.6252887191974348, + "learning_rate": 7.739034626668605e-05, + "loss": 0.4262, + "step": 302 + }, + { + "epoch": 1.0235839124026112, + "grad_norm": 0.5880567265579987, + "learning_rate": 7.735672411642562e-05, + "loss": 0.4233, + "step": 303 + }, + { + "epoch": 1.0269530427458413, + "grad_norm": 0.5726576390809455, + "learning_rate": 7.732289415930549e-05, + "loss": 0.424, + "step": 304 + }, + { + "epoch": 1.0303221730890715, + "grad_norm": 0.47054865900441445, + "learning_rate": 7.728885658351395e-05, + "loss": 0.4176, + "step": 305 + }, + { + "epoch": 1.0336913034323016, + "grad_norm": 0.4115743947585953, + "learning_rate": 7.725461157839417e-05, + "loss": 0.4292, + "step": 306 + }, + { + "epoch": 1.0370604337755316, + "grad_norm": 0.4210186740776416, + "learning_rate": 7.722015933444325e-05, + "loss": 0.4247, + "step": 307 + }, + { + "epoch": 1.0404295641187618, + "grad_norm": 0.36198066116445515, + "learning_rate": 7.71855000433111e-05, + "loss": 0.4193, + "step": 308 + }, + { + "epoch": 1.043798694461992, + "grad_norm": 0.43778422486602975, + "learning_rate": 7.715063389779936e-05, + "loss": 0.4238, + "step": 309 + }, + { + "epoch": 1.047167824805222, + "grad_norm": 0.45920696669429717, + "learning_rate": 7.711556109186039e-05, + "loss": 0.4237, + "step": 310 + }, + { + "epoch": 1.0505369551484522, + "grad_norm": 0.3341781881526272, + "learning_rate": 7.708028182059612e-05, + "loss": 0.4239, + "step": 311 + }, + { + "epoch": 1.0539060854916824, + "grad_norm": 0.3082296798332506, + "learning_rate": 7.704479628025704e-05, + "loss": 0.4167, + "step": 312 + }, + { + "epoch": 1.0572752158349126, + "grad_norm": 0.33284929605340835, + "learning_rate": 7.700910466824104e-05, + "loss": 0.4233, + "step": 313 + }, + { + "epoch": 1.0606443461781427, + "grad_norm": 0.333894193250551, + "learning_rate": 7.697320718309235e-05, + "loss": 0.4177, + "step": 314 + }, + { + "epoch": 1.064013476521373, + "grad_norm": 0.39528163268670363, + "learning_rate": 7.69371040245004e-05, + "loss": 0.4188, + "step": 315 + }, + { + "epoch": 1.067382606864603, + "grad_norm": 0.28394188370498197, + "learning_rate": 7.690079539329875e-05, + "loss": 0.4129, + "step": 316 + }, + { + "epoch": 1.0707517372078332, + "grad_norm": 0.2953142618107928, + "learning_rate": 7.686428149146398e-05, + "loss": 0.4188, + "step": 317 + }, + { + "epoch": 1.0741208675510634, + "grad_norm": 0.2804966323905774, + "learning_rate": 7.682756252211453e-05, + "loss": 0.4171, + "step": 318 + }, + { + "epoch": 1.0774899978942936, + "grad_norm": 0.2510744302434169, + "learning_rate": 7.679063868950955e-05, + "loss": 0.4182, + "step": 319 + }, + { + "epoch": 1.0808591282375237, + "grad_norm": 0.307553691452299, + "learning_rate": 7.675351019904785e-05, + "loss": 0.4177, + "step": 320 + }, + { + "epoch": 1.084228258580754, + "grad_norm": 0.30605445723544605, + "learning_rate": 7.671617725726666e-05, + "loss": 0.4158, + "step": 321 + }, + { + "epoch": 1.087597388923984, + "grad_norm": 0.3011711568174157, + "learning_rate": 7.667864007184054e-05, + "loss": 0.4141, + "step": 322 + }, + { + "epoch": 1.0909665192672142, + "grad_norm": 0.286345264353555, + "learning_rate": 7.664089885158023e-05, + "loss": 0.4187, + "step": 323 + }, + { + "epoch": 1.0943356496104444, + "grad_norm": 0.3592297464995333, + "learning_rate": 7.660295380643144e-05, + "loss": 0.4175, + "step": 324 + }, + { + "epoch": 1.0977047799536745, + "grad_norm": 0.4650620300291997, + "learning_rate": 7.656480514747374e-05, + "loss": 0.4258, + "step": 325 + }, + { + "epoch": 1.1010739102969047, + "grad_norm": 0.6004744804102117, + "learning_rate": 7.652645308691933e-05, + "loss": 0.419, + "step": 326 + }, + { + "epoch": 1.1044430406401347, + "grad_norm": 0.6574099091252659, + "learning_rate": 7.648789783811191e-05, + "loss": 0.4217, + "step": 327 + }, + { + "epoch": 1.1078121709833648, + "grad_norm": 0.6456572845078092, + "learning_rate": 7.644913961552544e-05, + "loss": 0.4207, + "step": 328 + }, + { + "epoch": 1.111181301326595, + "grad_norm": 0.5305128144292774, + "learning_rate": 7.641017863476298e-05, + "loss": 0.4215, + "step": 329 + }, + { + "epoch": 1.1145504316698251, + "grad_norm": 0.3067892240749629, + "learning_rate": 7.637101511255554e-05, + "loss": 0.4127, + "step": 330 + }, + { + "epoch": 1.1179195620130553, + "grad_norm": 0.3619393126280428, + "learning_rate": 7.633164926676076e-05, + "loss": 0.4144, + "step": 331 + }, + { + "epoch": 1.1212886923562855, + "grad_norm": 0.5099026443948101, + "learning_rate": 7.629208131636179e-05, + "loss": 0.4247, + "step": 332 + }, + { + "epoch": 1.1246578226995156, + "grad_norm": 0.4600109550186491, + "learning_rate": 7.625231148146601e-05, + "loss": 0.4277, + "step": 333 + }, + { + "epoch": 1.1280269530427458, + "grad_norm": 0.34171771897702874, + "learning_rate": 7.621233998330387e-05, + "loss": 0.4111, + "step": 334 + }, + { + "epoch": 1.131396083385976, + "grad_norm": 0.3788094599826585, + "learning_rate": 7.617216704422763e-05, + "loss": 0.4238, + "step": 335 + }, + { + "epoch": 1.1347652137292061, + "grad_norm": 0.38930545757784435, + "learning_rate": 7.61317928877101e-05, + "loss": 0.4266, + "step": 336 + }, + { + "epoch": 1.1381343440724363, + "grad_norm": 0.3433858843038104, + "learning_rate": 7.609121773834341e-05, + "loss": 0.4113, + "step": 337 + }, + { + "epoch": 1.1415034744156665, + "grad_norm": 0.3664698026597823, + "learning_rate": 7.605044182183779e-05, + "loss": 0.4215, + "step": 338 + }, + { + "epoch": 1.1448726047588966, + "grad_norm": 0.3669871707797489, + "learning_rate": 7.600946536502028e-05, + "loss": 0.4187, + "step": 339 + }, + { + "epoch": 1.1482417351021268, + "grad_norm": 0.36360389026471707, + "learning_rate": 7.596828859583347e-05, + "loss": 0.4179, + "step": 340 + }, + { + "epoch": 1.151610865445357, + "grad_norm": 0.34303435626525425, + "learning_rate": 7.592691174333426e-05, + "loss": 0.4166, + "step": 341 + }, + { + "epoch": 1.1549799957885871, + "grad_norm": 0.37256557828106257, + "learning_rate": 7.588533503769257e-05, + "loss": 0.4181, + "step": 342 + }, + { + "epoch": 1.1583491261318173, + "grad_norm": 0.41467336010702505, + "learning_rate": 7.584355871019002e-05, + "loss": 0.4195, + "step": 343 + }, + { + "epoch": 1.1617182564750475, + "grad_norm": 0.37682711741626357, + "learning_rate": 7.580158299321872e-05, + "loss": 0.4226, + "step": 344 + }, + { + "epoch": 1.1650873868182776, + "grad_norm": 0.2646890963052802, + "learning_rate": 7.575940812027993e-05, + "loss": 0.4094, + "step": 345 + }, + { + "epoch": 1.1684565171615078, + "grad_norm": 0.23766486308489482, + "learning_rate": 7.571703432598275e-05, + "loss": 0.42, + "step": 346 + }, + { + "epoch": 1.171825647504738, + "grad_norm": 0.23844909696838593, + "learning_rate": 7.567446184604285e-05, + "loss": 0.4189, + "step": 347 + }, + { + "epoch": 1.175194777847968, + "grad_norm": 0.23287909504956197, + "learning_rate": 7.563169091728115e-05, + "loss": 0.4123, + "step": 348 + }, + { + "epoch": 1.178563908191198, + "grad_norm": 0.21692865660818864, + "learning_rate": 7.558872177762246e-05, + "loss": 0.4193, + "step": 349 + }, + { + "epoch": 1.1819330385344282, + "grad_norm": 0.2191653011493883, + "learning_rate": 7.554555466609425e-05, + "loss": 0.4271, + "step": 350 + }, + { + "epoch": 1.1853021688776584, + "grad_norm": 0.23748843252543808, + "learning_rate": 7.550218982282518e-05, + "loss": 0.4196, + "step": 351 + }, + { + "epoch": 1.1886712992208885, + "grad_norm": 0.24616974908904704, + "learning_rate": 7.545862748904394e-05, + "loss": 0.4146, + "step": 352 + }, + { + "epoch": 1.1920404295641187, + "grad_norm": 0.25196821392623664, + "learning_rate": 7.541486790707776e-05, + "loss": 0.4266, + "step": 353 + }, + { + "epoch": 1.1954095599073489, + "grad_norm": 0.2470207075626988, + "learning_rate": 7.537091132035111e-05, + "loss": 0.4148, + "step": 354 + }, + { + "epoch": 1.198778690250579, + "grad_norm": 0.2314470630644042, + "learning_rate": 7.532675797338438e-05, + "loss": 0.4033, + "step": 355 + }, + { + "epoch": 1.2021478205938092, + "grad_norm": 0.23746828407475515, + "learning_rate": 7.528240811179245e-05, + "loss": 0.4203, + "step": 356 + }, + { + "epoch": 1.2055169509370394, + "grad_norm": 0.28754749236703137, + "learning_rate": 7.523786198228344e-05, + "loss": 0.4182, + "step": 357 + }, + { + "epoch": 1.2088860812802695, + "grad_norm": 0.3151739472415091, + "learning_rate": 7.519311983265718e-05, + "loss": 0.4222, + "step": 358 + }, + { + "epoch": 1.2122552116234997, + "grad_norm": 0.34818120293706006, + "learning_rate": 7.514818191180397e-05, + "loss": 0.4162, + "step": 359 + }, + { + "epoch": 1.2156243419667299, + "grad_norm": 0.39359185740609565, + "learning_rate": 7.510304846970311e-05, + "loss": 0.4179, + "step": 360 + }, + { + "epoch": 1.21899347230996, + "grad_norm": 0.49169697889587877, + "learning_rate": 7.505771975742157e-05, + "loss": 0.42, + "step": 361 + }, + { + "epoch": 1.2223626026531902, + "grad_norm": 0.6588501716182329, + "learning_rate": 7.501219602711253e-05, + "loss": 0.4207, + "step": 362 + }, + { + "epoch": 1.2257317329964204, + "grad_norm": 0.6704211038154936, + "learning_rate": 7.496647753201403e-05, + "loss": 0.419, + "step": 363 + }, + { + "epoch": 1.2291008633396505, + "grad_norm": 0.5759654898267196, + "learning_rate": 7.492056452644753e-05, + "loss": 0.418, + "step": 364 + }, + { + "epoch": 1.2324699936828807, + "grad_norm": 0.46697984648682656, + "learning_rate": 7.487445726581654e-05, + "loss": 0.4202, + "step": 365 + }, + { + "epoch": 1.2358391240261108, + "grad_norm": 0.4072897949316555, + "learning_rate": 7.48281560066051e-05, + "loss": 0.416, + "step": 366 + }, + { + "epoch": 1.2392082543693408, + "grad_norm": 0.3677992679979327, + "learning_rate": 7.47816610063765e-05, + "loss": 0.4184, + "step": 367 + }, + { + "epoch": 1.242577384712571, + "grad_norm": 0.39067043330522583, + "learning_rate": 7.473497252377171e-05, + "loss": 0.4246, + "step": 368 + }, + { + "epoch": 1.2459465150558011, + "grad_norm": 0.4371263217453357, + "learning_rate": 7.468809081850802e-05, + "loss": 0.4154, + "step": 369 + }, + { + "epoch": 1.2493156453990313, + "grad_norm": 0.466667275005644, + "learning_rate": 7.464101615137756e-05, + "loss": 0.4221, + "step": 370 + }, + { + "epoch": 1.2526847757422614, + "grad_norm": 0.40517656554168363, + "learning_rate": 7.459374878424585e-05, + "loss": 0.4149, + "step": 371 + }, + { + "epoch": 1.2560539060854916, + "grad_norm": 0.318301658777578, + "learning_rate": 7.454628898005043e-05, + "loss": 0.4117, + "step": 372 + }, + { + "epoch": 1.2594230364287218, + "grad_norm": 0.27016373739597804, + "learning_rate": 7.449863700279923e-05, + "loss": 0.4151, + "step": 373 + }, + { + "epoch": 1.262792166771952, + "grad_norm": 0.27155592369639425, + "learning_rate": 7.445079311756924e-05, + "loss": 0.4121, + "step": 374 + }, + { + "epoch": 1.266161297115182, + "grad_norm": 0.3009069725735746, + "learning_rate": 7.440275759050499e-05, + "loss": 0.4209, + "step": 375 + }, + { + "epoch": 1.2695304274584123, + "grad_norm": 0.2981596620618354, + "learning_rate": 7.435453068881706e-05, + "loss": 0.4127, + "step": 376 + }, + { + "epoch": 1.2728995578016424, + "grad_norm": 0.323823588891271, + "learning_rate": 7.430611268078059e-05, + "loss": 0.4097, + "step": 377 + }, + { + "epoch": 1.2762686881448726, + "grad_norm": 0.3872478492699807, + "learning_rate": 7.425750383573384e-05, + "loss": 0.4142, + "step": 378 + }, + { + "epoch": 1.2796378184881028, + "grad_norm": 0.38340080550258643, + "learning_rate": 7.420870442407662e-05, + "loss": 0.4158, + "step": 379 + }, + { + "epoch": 1.283006948831333, + "grad_norm": 0.33826799912854405, + "learning_rate": 7.415971471726884e-05, + "loss": 0.4181, + "step": 380 + }, + { + "epoch": 1.286376079174563, + "grad_norm": 0.3412122527192401, + "learning_rate": 7.411053498782893e-05, + "loss": 0.4115, + "step": 381 + }, + { + "epoch": 1.2897452095177933, + "grad_norm": 0.339753253979875, + "learning_rate": 7.406116550933246e-05, + "loss": 0.414, + "step": 382 + }, + { + "epoch": 1.2931143398610234, + "grad_norm": 0.27036940059101494, + "learning_rate": 7.401160655641044e-05, + "loss": 0.4134, + "step": 383 + }, + { + "epoch": 1.2964834702042536, + "grad_norm": 0.26771109575539487, + "learning_rate": 7.396185840474792e-05, + "loss": 0.4145, + "step": 384 + }, + { + "epoch": 1.2998526005474837, + "grad_norm": 0.26853385874933655, + "learning_rate": 7.391192133108243e-05, + "loss": 0.4196, + "step": 385 + }, + { + "epoch": 1.303221730890714, + "grad_norm": 0.25978626330709237, + "learning_rate": 7.386179561320243e-05, + "loss": 0.4179, + "step": 386 + }, + { + "epoch": 1.306590861233944, + "grad_norm": 0.27982894045702544, + "learning_rate": 7.381148152994573e-05, + "loss": 0.4134, + "step": 387 + }, + { + "epoch": 1.3099599915771742, + "grad_norm": 0.2323947422520883, + "learning_rate": 7.376097936119803e-05, + "loss": 0.4125, + "step": 388 + }, + { + "epoch": 1.3133291219204044, + "grad_norm": 0.30635653504253, + "learning_rate": 7.371028938789122e-05, + "loss": 0.4169, + "step": 389 + }, + { + "epoch": 1.3166982522636346, + "grad_norm": 0.30800676867326193, + "learning_rate": 7.365941189200201e-05, + "loss": 0.4124, + "step": 390 + }, + { + "epoch": 1.3200673826068647, + "grad_norm": 0.26291762070095065, + "learning_rate": 7.360834715655019e-05, + "loss": 0.4163, + "step": 391 + }, + { + "epoch": 1.3234365129500947, + "grad_norm": 0.28430697703638136, + "learning_rate": 7.35570954655971e-05, + "loss": 0.4126, + "step": 392 + }, + { + "epoch": 1.3268056432933248, + "grad_norm": 0.26422175633500083, + "learning_rate": 7.350565710424414e-05, + "loss": 0.4089, + "step": 393 + }, + { + "epoch": 1.330174773636555, + "grad_norm": 0.2426759795255368, + "learning_rate": 7.345403235863105e-05, + "loss": 0.4164, + "step": 394 + }, + { + "epoch": 1.3335439039797852, + "grad_norm": 0.25948928094236345, + "learning_rate": 7.340222151593443e-05, + "loss": 0.4184, + "step": 395 + }, + { + "epoch": 1.3369130343230153, + "grad_norm": 0.3377516247942669, + "learning_rate": 7.335022486436608e-05, + "loss": 0.4169, + "step": 396 + }, + { + "epoch": 1.3402821646662455, + "grad_norm": 0.39062210665305114, + "learning_rate": 7.329804269317137e-05, + "loss": 0.4212, + "step": 397 + }, + { + "epoch": 1.3436512950094757, + "grad_norm": 0.43725817266447464, + "learning_rate": 7.324567529262775e-05, + "loss": 0.4162, + "step": 398 + }, + { + "epoch": 1.3470204253527058, + "grad_norm": 0.3940951458992484, + "learning_rate": 7.319312295404301e-05, + "loss": 0.4109, + "step": 399 + }, + { + "epoch": 1.350389555695936, + "grad_norm": 0.30866362073984877, + "learning_rate": 7.31403859697537e-05, + "loss": 0.4138, + "step": 400 + }, + { + "epoch": 1.3537586860391662, + "grad_norm": 0.26875958761791713, + "learning_rate": 7.308746463312353e-05, + "loss": 0.417, + "step": 401 + }, + { + "epoch": 1.3571278163823963, + "grad_norm": 0.3115107888080639, + "learning_rate": 7.303435923854172e-05, + "loss": 0.4122, + "step": 402 + }, + { + "epoch": 1.3604969467256265, + "grad_norm": 0.36714492695394935, + "learning_rate": 7.298107008142139e-05, + "loss": 0.4159, + "step": 403 + }, + { + "epoch": 1.3638660770688567, + "grad_norm": 0.3981685397894353, + "learning_rate": 7.292759745819781e-05, + "loss": 0.4133, + "step": 404 + }, + { + "epoch": 1.3672352074120868, + "grad_norm": 0.3069454557345131, + "learning_rate": 7.287394166632691e-05, + "loss": 0.4208, + "step": 405 + }, + { + "epoch": 1.370604337755317, + "grad_norm": 0.24748441489038914, + "learning_rate": 7.282010300428351e-05, + "loss": 0.4104, + "step": 406 + }, + { + "epoch": 1.373973468098547, + "grad_norm": 0.2118791200536055, + "learning_rate": 7.276608177155968e-05, + "loss": 0.4124, + "step": 407 + }, + { + "epoch": 1.377342598441777, + "grad_norm": 0.24963361520147168, + "learning_rate": 7.271187826866312e-05, + "loss": 0.4149, + "step": 408 + }, + { + "epoch": 1.3807117287850073, + "grad_norm": 0.31609061336459937, + "learning_rate": 7.265749279711543e-05, + "loss": 0.4266, + "step": 409 + }, + { + "epoch": 1.3840808591282374, + "grad_norm": 0.35611885888992273, + "learning_rate": 7.260292565945049e-05, + "loss": 0.4144, + "step": 410 + }, + { + "epoch": 1.3874499894714676, + "grad_norm": 0.36813941496856034, + "learning_rate": 7.254817715921273e-05, + "loss": 0.4148, + "step": 411 + }, + { + "epoch": 1.3908191198146977, + "grad_norm": 0.3386310771485794, + "learning_rate": 7.249324760095544e-05, + "loss": 0.4157, + "step": 412 + }, + { + "epoch": 1.394188250157928, + "grad_norm": 0.3286044748961816, + "learning_rate": 7.243813729023913e-05, + "loss": 0.418, + "step": 413 + }, + { + "epoch": 1.397557380501158, + "grad_norm": 0.36106557337509826, + "learning_rate": 7.238284653362977e-05, + "loss": 0.4127, + "step": 414 + }, + { + "epoch": 1.4009265108443882, + "grad_norm": 0.3713183412724868, + "learning_rate": 7.232737563869711e-05, + "loss": 0.4223, + "step": 415 + }, + { + "epoch": 1.4042956411876184, + "grad_norm": 0.4108948812505769, + "learning_rate": 7.227172491401299e-05, + "loss": 0.4159, + "step": 416 + }, + { + "epoch": 1.4076647715308486, + "grad_norm": 0.42746635756913826, + "learning_rate": 7.221589466914955e-05, + "loss": 0.4183, + "step": 417 + }, + { + "epoch": 1.4110339018740787, + "grad_norm": 0.4281754377696307, + "learning_rate": 7.215988521467763e-05, + "loss": 0.4143, + "step": 418 + }, + { + "epoch": 1.414403032217309, + "grad_norm": 0.34581190375042925, + "learning_rate": 7.210369686216492e-05, + "loss": 0.4232, + "step": 419 + }, + { + "epoch": 1.417772162560539, + "grad_norm": 0.24817011068233216, + "learning_rate": 7.204732992417431e-05, + "loss": 0.4203, + "step": 420 + }, + { + "epoch": 1.4211412929037692, + "grad_norm": 0.2703015486109723, + "learning_rate": 7.199078471426208e-05, + "loss": 0.4188, + "step": 421 + }, + { + "epoch": 1.4245104232469994, + "grad_norm": 0.3376907597722382, + "learning_rate": 7.193406154697625e-05, + "loss": 0.4123, + "step": 422 + }, + { + "epoch": 1.4278795535902296, + "grad_norm": 0.35688284736368614, + "learning_rate": 7.187716073785471e-05, + "loss": 0.4073, + "step": 423 + }, + { + "epoch": 1.4312486839334597, + "grad_norm": 0.29210262958830335, + "learning_rate": 7.18200826034236e-05, + "loss": 0.4155, + "step": 424 + }, + { + "epoch": 1.4346178142766899, + "grad_norm": 0.20624868853539452, + "learning_rate": 7.176282746119544e-05, + "loss": 0.4082, + "step": 425 + }, + { + "epoch": 1.43798694461992, + "grad_norm": 0.21431087254932987, + "learning_rate": 7.17053956296674e-05, + "loss": 0.4072, + "step": 426 + }, + { + "epoch": 1.4413560749631502, + "grad_norm": 0.25982900003092185, + "learning_rate": 7.164778742831954e-05, + "loss": 0.4113, + "step": 427 + }, + { + "epoch": 1.4447252053063804, + "grad_norm": 0.3503298873194117, + "learning_rate": 7.159000317761305e-05, + "loss": 0.4128, + "step": 428 + }, + { + "epoch": 1.4480943356496105, + "grad_norm": 0.4693051629184559, + "learning_rate": 7.153204319898839e-05, + "loss": 0.4138, + "step": 429 + }, + { + "epoch": 1.4514634659928407, + "grad_norm": 0.502991287048126, + "learning_rate": 7.14739078148636e-05, + "loss": 0.4157, + "step": 430 + }, + { + "epoch": 1.4548325963360709, + "grad_norm": 0.5001041791172387, + "learning_rate": 7.141559734863245e-05, + "loss": 0.4082, + "step": 431 + }, + { + "epoch": 1.458201726679301, + "grad_norm": 0.4696810029288007, + "learning_rate": 7.135711212466264e-05, + "loss": 0.4198, + "step": 432 + }, + { + "epoch": 1.461570857022531, + "grad_norm": 0.43034902073433023, + "learning_rate": 7.1298452468294e-05, + "loss": 0.4165, + "step": 433 + }, + { + "epoch": 1.4649399873657611, + "grad_norm": 0.4022839654121198, + "learning_rate": 7.123961870583671e-05, + "loss": 0.4096, + "step": 434 + }, + { + "epoch": 1.4683091177089913, + "grad_norm": 0.3107712308577315, + "learning_rate": 7.118061116456944e-05, + "loss": 0.4137, + "step": 435 + }, + { + "epoch": 1.4716782480522215, + "grad_norm": 0.3140180702883453, + "learning_rate": 7.112143017273759e-05, + "loss": 0.4108, + "step": 436 + }, + { + "epoch": 1.4750473783954516, + "grad_norm": 0.40495663409539695, + "learning_rate": 7.106207605955136e-05, + "loss": 0.4166, + "step": 437 + }, + { + "epoch": 1.4784165087386818, + "grad_norm": 0.4652370041483942, + "learning_rate": 7.100254915518408e-05, + "loss": 0.414, + "step": 438 + }, + { + "epoch": 1.481785639081912, + "grad_norm": 0.41391982007664, + "learning_rate": 7.094284979077015e-05, + "loss": 0.4131, + "step": 439 + }, + { + "epoch": 1.4851547694251421, + "grad_norm": 0.34516805959620245, + "learning_rate": 7.088297829840346e-05, + "loss": 0.4129, + "step": 440 + }, + { + "epoch": 1.4885238997683723, + "grad_norm": 0.32652038382328485, + "learning_rate": 7.08229350111353e-05, + "loss": 0.413, + "step": 441 + }, + { + "epoch": 1.4918930301116025, + "grad_norm": 0.22506092814882847, + "learning_rate": 7.076272026297268e-05, + "loss": 0.4127, + "step": 442 + }, + { + "epoch": 1.4952621604548326, + "grad_norm": 0.2282536847065667, + "learning_rate": 7.070233438887639e-05, + "loss": 0.4071, + "step": 443 + }, + { + "epoch": 1.4986312907980628, + "grad_norm": 0.2446847320184482, + "learning_rate": 7.064177772475912e-05, + "loss": 0.4138, + "step": 444 + }, + { + "epoch": 1.502000421141293, + "grad_norm": 0.25152698752852437, + "learning_rate": 7.05810506074837e-05, + "loss": 0.4141, + "step": 445 + }, + { + "epoch": 1.505369551484523, + "grad_norm": 0.2548217617366647, + "learning_rate": 7.052015337486109e-05, + "loss": 0.4098, + "step": 446 + }, + { + "epoch": 1.508738681827753, + "grad_norm": 0.2731777853595498, + "learning_rate": 7.045908636564858e-05, + "loss": 0.4118, + "step": 447 + }, + { + "epoch": 1.5121078121709832, + "grad_norm": 0.3121024086238583, + "learning_rate": 7.03978499195479e-05, + "loss": 0.4111, + "step": 448 + }, + { + "epoch": 1.5154769425142134, + "grad_norm": 0.28013154989340816, + "learning_rate": 7.03364443772033e-05, + "loss": 0.4123, + "step": 449 + }, + { + "epoch": 1.5188460728574436, + "grad_norm": 0.20045789950968235, + "learning_rate": 7.027487008019969e-05, + "loss": 0.41, + "step": 450 + }, + { + "epoch": 1.5222152032006737, + "grad_norm": 0.1935253416836786, + "learning_rate": 7.021312737106068e-05, + "loss": 0.4184, + "step": 451 + }, + { + "epoch": 1.5255843335439039, + "grad_norm": 0.2182563856327327, + "learning_rate": 7.015121659324678e-05, + "loss": 0.4121, + "step": 452 + }, + { + "epoch": 1.528953463887134, + "grad_norm": 0.20129933934815375, + "learning_rate": 7.00891380911534e-05, + "loss": 0.4136, + "step": 453 + }, + { + "epoch": 1.5323225942303642, + "grad_norm": 0.19011310030838788, + "learning_rate": 7.002689221010897e-05, + "loss": 0.4113, + "step": 454 + }, + { + "epoch": 1.5356917245735944, + "grad_norm": 0.19585723303180483, + "learning_rate": 6.9964479296373e-05, + "loss": 0.4139, + "step": 455 + }, + { + "epoch": 1.5390608549168245, + "grad_norm": 0.1740680287737997, + "learning_rate": 6.990189969713416e-05, + "loss": 0.4141, + "step": 456 + }, + { + "epoch": 1.5424299852600547, + "grad_norm": 0.2068670733390012, + "learning_rate": 6.983915376050833e-05, + "loss": 0.4093, + "step": 457 + }, + { + "epoch": 1.5457991156032849, + "grad_norm": 0.2583283837253456, + "learning_rate": 6.977624183553676e-05, + "loss": 0.4192, + "step": 458 + }, + { + "epoch": 1.549168245946515, + "grad_norm": 0.28194252885557924, + "learning_rate": 6.971316427218399e-05, + "loss": 0.412, + "step": 459 + }, + { + "epoch": 1.5525373762897452, + "grad_norm": 0.27071463569696164, + "learning_rate": 6.964992142133602e-05, + "loss": 0.4207, + "step": 460 + }, + { + "epoch": 1.5559065066329754, + "grad_norm": 0.27470579632282327, + "learning_rate": 6.958651363479822e-05, + "loss": 0.4165, + "step": 461 + }, + { + "epoch": 1.5592756369762055, + "grad_norm": 0.2703402600040993, + "learning_rate": 6.952294126529356e-05, + "loss": 0.4134, + "step": 462 + }, + { + "epoch": 1.5626447673194357, + "grad_norm": 0.26465479604538705, + "learning_rate": 6.94592046664605e-05, + "loss": 0.4136, + "step": 463 + }, + { + "epoch": 1.5660138976626659, + "grad_norm": 0.31132857636043815, + "learning_rate": 6.939530419285104e-05, + "loss": 0.4163, + "step": 464 + }, + { + "epoch": 1.569383028005896, + "grad_norm": 0.4012221142274, + "learning_rate": 6.933124019992884e-05, + "loss": 0.4138, + "step": 465 + }, + { + "epoch": 1.5727521583491262, + "grad_norm": 0.5021621002447393, + "learning_rate": 6.926701304406713e-05, + "loss": 0.4105, + "step": 466 + }, + { + "epoch": 1.5761212886923563, + "grad_norm": 0.5905418251776803, + "learning_rate": 6.920262308254683e-05, + "loss": 0.4147, + "step": 467 + }, + { + "epoch": 1.5794904190355865, + "grad_norm": 0.6182317762023337, + "learning_rate": 6.913807067355445e-05, + "loss": 0.4128, + "step": 468 + }, + { + "epoch": 1.5828595493788167, + "grad_norm": 0.4945917435433832, + "learning_rate": 6.907335617618018e-05, + "loss": 0.4167, + "step": 469 + }, + { + "epoch": 1.5862286797220468, + "grad_norm": 0.3166116083838581, + "learning_rate": 6.90084799504159e-05, + "loss": 0.4136, + "step": 470 + }, + { + "epoch": 1.589597810065277, + "grad_norm": 0.2848441164225104, + "learning_rate": 6.894344235715311e-05, + "loss": 0.4127, + "step": 471 + }, + { + "epoch": 1.5929669404085072, + "grad_norm": 0.35210847111444277, + "learning_rate": 6.887824375818099e-05, + "loss": 0.4125, + "step": 472 + }, + { + "epoch": 1.5963360707517373, + "grad_norm": 0.36122192833869504, + "learning_rate": 6.881288451618431e-05, + "loss": 0.4175, + "step": 473 + }, + { + "epoch": 1.5997052010949675, + "grad_norm": 0.30874010342588315, + "learning_rate": 6.874736499474154e-05, + "loss": 0.4123, + "step": 474 + }, + { + "epoch": 1.6030743314381977, + "grad_norm": 0.2415425383601781, + "learning_rate": 6.868168555832266e-05, + "loss": 0.409, + "step": 475 + }, + { + "epoch": 1.6064434617814278, + "grad_norm": 0.2777593930598247, + "learning_rate": 6.861584657228728e-05, + "loss": 0.4109, + "step": 476 + }, + { + "epoch": 1.6098125921246578, + "grad_norm": 0.2552160489277856, + "learning_rate": 6.854984840288253e-05, + "loss": 0.4063, + "step": 477 + }, + { + "epoch": 1.613181722467888, + "grad_norm": 0.21292379117303817, + "learning_rate": 6.848369141724104e-05, + "loss": 0.4113, + "step": 478 + }, + { + "epoch": 1.616550852811118, + "grad_norm": 0.25826725556041485, + "learning_rate": 6.841737598337886e-05, + "loss": 0.4162, + "step": 479 + }, + { + "epoch": 1.6199199831543483, + "grad_norm": 0.24587379643844692, + "learning_rate": 6.835090247019354e-05, + "loss": 0.4098, + "step": 480 + }, + { + "epoch": 1.6232891134975784, + "grad_norm": 0.22506059025604672, + "learning_rate": 6.828427124746191e-05, + "loss": 0.4177, + "step": 481 + }, + { + "epoch": 1.6266582438408086, + "grad_norm": 0.2625291980432951, + "learning_rate": 6.821748268583813e-05, + "loss": 0.4138, + "step": 482 + }, + { + "epoch": 1.6300273741840388, + "grad_norm": 0.2899682108073399, + "learning_rate": 6.815053715685161e-05, + "loss": 0.4112, + "step": 483 + }, + { + "epoch": 1.633396504527269, + "grad_norm": 0.24684733944107418, + "learning_rate": 6.808343503290491e-05, + "loss": 0.4084, + "step": 484 + }, + { + "epoch": 1.636765634870499, + "grad_norm": 0.22856568944562095, + "learning_rate": 6.80161766872717e-05, + "loss": 0.4099, + "step": 485 + }, + { + "epoch": 1.6401347652137293, + "grad_norm": 0.2528553309235842, + "learning_rate": 6.79487624940947e-05, + "loss": 0.4074, + "step": 486 + }, + { + "epoch": 1.6435038955569592, + "grad_norm": 0.24954291821287325, + "learning_rate": 6.788119282838355e-05, + "loss": 0.4156, + "step": 487 + }, + { + "epoch": 1.6468730259001894, + "grad_norm": 0.2486958212588815, + "learning_rate": 6.781346806601273e-05, + "loss": 0.4148, + "step": 488 + }, + { + "epoch": 1.6502421562434195, + "grad_norm": 0.20838834765340428, + "learning_rate": 6.774558858371952e-05, + "loss": 0.4107, + "step": 489 + }, + { + "epoch": 1.6536112865866497, + "grad_norm": 0.157993940020379, + "learning_rate": 6.767755475910185e-05, + "loss": 0.4112, + "step": 490 + }, + { + "epoch": 1.6569804169298799, + "grad_norm": 0.24383891745288697, + "learning_rate": 6.760936697061626e-05, + "loss": 0.4117, + "step": 491 + }, + { + "epoch": 1.66034954727311, + "grad_norm": 0.28630859094765176, + "learning_rate": 6.754102559757569e-05, + "loss": 0.4108, + "step": 492 + }, + { + "epoch": 1.6637186776163402, + "grad_norm": 0.2744705368738465, + "learning_rate": 6.74725310201475e-05, + "loss": 0.4068, + "step": 493 + }, + { + "epoch": 1.6670878079595703, + "grad_norm": 0.2832510381791776, + "learning_rate": 6.740388361935125e-05, + "loss": 0.4072, + "step": 494 + }, + { + "epoch": 1.6704569383028005, + "grad_norm": 0.2988249231230451, + "learning_rate": 6.733508377705661e-05, + "loss": 0.4077, + "step": 495 + }, + { + "epoch": 1.6738260686460307, + "grad_norm": 0.24557523045791532, + "learning_rate": 6.726613187598132e-05, + "loss": 0.416, + "step": 496 + }, + { + "epoch": 1.6771951989892608, + "grad_norm": 0.21450213834423756, + "learning_rate": 6.71970282996889e-05, + "loss": 0.4099, + "step": 497 + }, + { + "epoch": 1.680564329332491, + "grad_norm": 0.2564463597465919, + "learning_rate": 6.712777343258666e-05, + "loss": 0.4113, + "step": 498 + }, + { + "epoch": 1.6839334596757212, + "grad_norm": 0.28973958295073354, + "learning_rate": 6.705836765992348e-05, + "loss": 0.4173, + "step": 499 + }, + { + "epoch": 1.6873025900189513, + "grad_norm": 0.3093418967185147, + "learning_rate": 6.698881136778771e-05, + "loss": 0.4173, + "step": 500 + }, + { + "epoch": 1.6906717203621815, + "grad_norm": 0.30710292961925306, + "learning_rate": 6.691910494310499e-05, + "loss": 0.4202, + "step": 501 + }, + { + "epoch": 1.6940408507054117, + "grad_norm": 0.298386372490933, + "learning_rate": 6.684924877363613e-05, + "loss": 0.4063, + "step": 502 + }, + { + "epoch": 1.6974099810486418, + "grad_norm": 0.31358421654801716, + "learning_rate": 6.67792432479749e-05, + "loss": 0.4117, + "step": 503 + }, + { + "epoch": 1.700779111391872, + "grad_norm": 0.34684913918298366, + "learning_rate": 6.670908875554594e-05, + "loss": 0.4103, + "step": 504 + }, + { + "epoch": 1.7041482417351022, + "grad_norm": 0.3071849696400485, + "learning_rate": 6.663878568660258e-05, + "loss": 0.4064, + "step": 505 + }, + { + "epoch": 1.7075173720783323, + "grad_norm": 0.25934260311596186, + "learning_rate": 6.656833443222458e-05, + "loss": 0.4026, + "step": 506 + }, + { + "epoch": 1.7108865024215625, + "grad_norm": 0.254331135385578, + "learning_rate": 6.649773538431605e-05, + "loss": 0.4123, + "step": 507 + }, + { + "epoch": 1.7142556327647926, + "grad_norm": 0.2696672284837906, + "learning_rate": 6.642698893560327e-05, + "loss": 0.4135, + "step": 508 + }, + { + "epoch": 1.7176247631080228, + "grad_norm": 0.3170338993835499, + "learning_rate": 6.635609547963243e-05, + "loss": 0.4078, + "step": 509 + }, + { + "epoch": 1.720993893451253, + "grad_norm": 0.34598694657993484, + "learning_rate": 6.628505541076755e-05, + "loss": 0.4143, + "step": 510 + }, + { + "epoch": 1.7243630237944831, + "grad_norm": 0.3659302514618013, + "learning_rate": 6.621386912418816e-05, + "loss": 0.413, + "step": 511 + }, + { + "epoch": 1.7277321541377133, + "grad_norm": 0.3036155922766547, + "learning_rate": 6.614253701588718e-05, + "loss": 0.413, + "step": 512 + }, + { + "epoch": 1.7311012844809435, + "grad_norm": 0.26442302840915777, + "learning_rate": 6.607105948266872e-05, + "loss": 0.4141, + "step": 513 + }, + { + "epoch": 1.7344704148241736, + "grad_norm": 0.2820703196464, + "learning_rate": 6.599943692214587e-05, + "loss": 0.4154, + "step": 514 + }, + { + "epoch": 1.7378395451674038, + "grad_norm": 0.2716579783783052, + "learning_rate": 6.592766973273843e-05, + "loss": 0.418, + "step": 515 + }, + { + "epoch": 1.741208675510634, + "grad_norm": 0.2320214556767005, + "learning_rate": 6.585575831367078e-05, + "loss": 0.4136, + "step": 516 + }, + { + "epoch": 1.7445778058538641, + "grad_norm": 0.20790915888905742, + "learning_rate": 6.578370306496957e-05, + "loss": 0.4126, + "step": 517 + }, + { + "epoch": 1.747946936197094, + "grad_norm": 0.2165582926633229, + "learning_rate": 6.571150438746157e-05, + "loss": 0.4112, + "step": 518 + }, + { + "epoch": 1.7513160665403242, + "grad_norm": 0.24261057128754013, + "learning_rate": 6.563916268277144e-05, + "loss": 0.413, + "step": 519 + }, + { + "epoch": 1.7546851968835544, + "grad_norm": 0.2755800264624728, + "learning_rate": 6.55666783533194e-05, + "loss": 0.4166, + "step": 520 + }, + { + "epoch": 1.7580543272267846, + "grad_norm": 0.28813858434017786, + "learning_rate": 6.549405180231911e-05, + "loss": 0.404, + "step": 521 + }, + { + "epoch": 1.7614234575700147, + "grad_norm": 0.24090919880210407, + "learning_rate": 6.542128343377536e-05, + "loss": 0.4075, + "step": 522 + }, + { + "epoch": 1.764792587913245, + "grad_norm": 0.21389800108034238, + "learning_rate": 6.534837365248185e-05, + "loss": 0.4124, + "step": 523 + }, + { + "epoch": 1.768161718256475, + "grad_norm": 0.2562042134322129, + "learning_rate": 6.527532286401889e-05, + "loss": 0.4174, + "step": 524 + }, + { + "epoch": 1.7715308485997052, + "grad_norm": 0.2571401145743441, + "learning_rate": 6.520213147475123e-05, + "loss": 0.4144, + "step": 525 + }, + { + "epoch": 1.7748999789429354, + "grad_norm": 0.2423820773625362, + "learning_rate": 6.51287998918257e-05, + "loss": 0.4046, + "step": 526 + }, + { + "epoch": 1.7782691092861656, + "grad_norm": 0.2310131148631897, + "learning_rate": 6.505532852316904e-05, + "loss": 0.407, + "step": 527 + }, + { + "epoch": 1.7816382396293957, + "grad_norm": 0.2467085051059651, + "learning_rate": 6.498171777748557e-05, + "loss": 0.4134, + "step": 528 + }, + { + "epoch": 1.7850073699726257, + "grad_norm": 0.2429312927228722, + "learning_rate": 6.49079680642549e-05, + "loss": 0.4136, + "step": 529 + }, + { + "epoch": 1.7883765003158558, + "grad_norm": 0.18962286619000535, + "learning_rate": 6.483407979372975e-05, + "loss": 0.4094, + "step": 530 + }, + { + "epoch": 1.791745630659086, + "grad_norm": 0.17276030637120937, + "learning_rate": 6.476005337693355e-05, + "loss": 0.4127, + "step": 531 + }, + { + "epoch": 1.7951147610023162, + "grad_norm": 0.1991873488324741, + "learning_rate": 6.468588922565822e-05, + "loss": 0.407, + "step": 532 + }, + { + "epoch": 1.7984838913455463, + "grad_norm": 0.23230143768755912, + "learning_rate": 6.461158775246186e-05, + "loss": 0.4069, + "step": 533 + }, + { + "epoch": 1.8018530216887765, + "grad_norm": 0.25362081452848795, + "learning_rate": 6.453714937066648e-05, + "loss": 0.4089, + "step": 534 + }, + { + "epoch": 1.8052221520320066, + "grad_norm": 0.20024317986028692, + "learning_rate": 6.446257449435566e-05, + "loss": 0.4062, + "step": 535 + }, + { + "epoch": 1.8085912823752368, + "grad_norm": 0.16636181558776822, + "learning_rate": 6.438786353837228e-05, + "loss": 0.4061, + "step": 536 + }, + { + "epoch": 1.811960412718467, + "grad_norm": 0.20687002125002474, + "learning_rate": 6.43130169183162e-05, + "loss": 0.4131, + "step": 537 + }, + { + "epoch": 1.8153295430616971, + "grad_norm": 0.2568138645034864, + "learning_rate": 6.423803505054193e-05, + "loss": 0.411, + "step": 538 + }, + { + "epoch": 1.8186986734049273, + "grad_norm": 0.3369872578212292, + "learning_rate": 6.416291835215636e-05, + "loss": 0.4077, + "step": 539 + }, + { + "epoch": 1.8220678037481575, + "grad_norm": 0.41379320932213953, + "learning_rate": 6.408766724101638e-05, + "loss": 0.4077, + "step": 540 + }, + { + "epoch": 1.8254369340913876, + "grad_norm": 0.43767998472550695, + "learning_rate": 6.401228213572663e-05, + "loss": 0.4151, + "step": 541 + }, + { + "epoch": 1.8288060644346178, + "grad_norm": 0.4536984763596022, + "learning_rate": 6.393676345563708e-05, + "loss": 0.42, + "step": 542 + }, + { + "epoch": 1.832175194777848, + "grad_norm": 0.4692529959956868, + "learning_rate": 6.386111162084078e-05, + "loss": 0.4002, + "step": 543 + }, + { + "epoch": 1.8355443251210781, + "grad_norm": 0.34237321055490366, + "learning_rate": 6.378532705217148e-05, + "loss": 0.406, + "step": 544 + }, + { + "epoch": 1.8389134554643083, + "grad_norm": 0.2659729255014706, + "learning_rate": 6.370941017120127e-05, + "loss": 0.4135, + "step": 545 + }, + { + "epoch": 1.8422825858075385, + "grad_norm": 0.32797296963486666, + "learning_rate": 6.363336140023833e-05, + "loss": 0.4088, + "step": 546 + }, + { + "epoch": 1.8456517161507686, + "grad_norm": 0.35579650932418716, + "learning_rate": 6.355718116232444e-05, + "loss": 0.4093, + "step": 547 + }, + { + "epoch": 1.8490208464939988, + "grad_norm": 0.2907411351475013, + "learning_rate": 6.348086988123274e-05, + "loss": 0.4116, + "step": 548 + }, + { + "epoch": 1.852389976837229, + "grad_norm": 0.2732388318681213, + "learning_rate": 6.340442798146535e-05, + "loss": 0.4091, + "step": 549 + }, + { + "epoch": 1.855759107180459, + "grad_norm": 0.35761144655913124, + "learning_rate": 6.332785588825094e-05, + "loss": 0.4037, + "step": 550 + }, + { + "epoch": 1.8591282375236893, + "grad_norm": 0.3014328362434633, + "learning_rate": 6.325115402754245e-05, + "loss": 0.4072, + "step": 551 + }, + { + "epoch": 1.8624973678669194, + "grad_norm": 0.2340334979203501, + "learning_rate": 6.317432282601469e-05, + "loss": 0.403, + "step": 552 + }, + { + "epoch": 1.8658664982101496, + "grad_norm": 0.33855256005840595, + "learning_rate": 6.309736271106193e-05, + "loss": 0.4106, + "step": 553 + }, + { + "epoch": 1.8692356285533798, + "grad_norm": 0.31482993852294594, + "learning_rate": 6.302027411079562e-05, + "loss": 0.4079, + "step": 554 + }, + { + "epoch": 1.87260475889661, + "grad_norm": 0.21683415129270545, + "learning_rate": 6.294305745404185e-05, + "loss": 0.4032, + "step": 555 + }, + { + "epoch": 1.87597388923984, + "grad_norm": 0.209469978649313, + "learning_rate": 6.286571317033915e-05, + "loss": 0.4088, + "step": 556 + }, + { + "epoch": 1.8793430195830703, + "grad_norm": 0.2816343476274617, + "learning_rate": 6.278824168993596e-05, + "loss": 0.4126, + "step": 557 + }, + { + "epoch": 1.8827121499263004, + "grad_norm": 0.32252631746288557, + "learning_rate": 6.271064344378832e-05, + "loss": 0.4086, + "step": 558 + }, + { + "epoch": 1.8860812802695306, + "grad_norm": 0.2900131891387989, + "learning_rate": 6.263291886355738e-05, + "loss": 0.4086, + "step": 559 + }, + { + "epoch": 1.8894504106127605, + "grad_norm": 0.26445922268042416, + "learning_rate": 6.255506838160711e-05, + "loss": 0.4093, + "step": 560 + }, + { + "epoch": 1.8928195409559907, + "grad_norm": 0.2561028521945913, + "learning_rate": 6.247709243100185e-05, + "loss": 0.4136, + "step": 561 + }, + { + "epoch": 1.8961886712992209, + "grad_norm": 0.23899571940882475, + "learning_rate": 6.239899144550383e-05, + "loss": 0.4058, + "step": 562 + }, + { + "epoch": 1.899557801642451, + "grad_norm": 0.2338421290415243, + "learning_rate": 6.232076585957087e-05, + "loss": 0.4074, + "step": 563 + }, + { + "epoch": 1.9029269319856812, + "grad_norm": 0.18752299712254275, + "learning_rate": 6.224241610835391e-05, + "loss": 0.4096, + "step": 564 + }, + { + "epoch": 1.9062960623289114, + "grad_norm": 0.19324708447438393, + "learning_rate": 6.216394262769459e-05, + "loss": 0.4096, + "step": 565 + }, + { + "epoch": 1.9096651926721415, + "grad_norm": 0.21276012461948887, + "learning_rate": 6.208534585412282e-05, + "loss": 0.4033, + "step": 566 + }, + { + "epoch": 1.9130343230153717, + "grad_norm": 0.18970083289771164, + "learning_rate": 6.200662622485435e-05, + "loss": 0.4054, + "step": 567 + }, + { + "epoch": 1.9164034533586018, + "grad_norm": 0.1696360552220803, + "learning_rate": 6.19277841777884e-05, + "loss": 0.4069, + "step": 568 + }, + { + "epoch": 1.919772583701832, + "grad_norm": 0.19478504599245822, + "learning_rate": 6.18488201515051e-05, + "loss": 0.4054, + "step": 569 + }, + { + "epoch": 1.923141714045062, + "grad_norm": 0.16721019486842992, + "learning_rate": 6.176973458526317e-05, + "loss": 0.4142, + "step": 570 + }, + { + "epoch": 1.9265108443882921, + "grad_norm": 0.18059816629328238, + "learning_rate": 6.169052791899742e-05, + "loss": 0.4047, + "step": 571 + }, + { + "epoch": 1.9298799747315223, + "grad_norm": 0.2125539453111369, + "learning_rate": 6.161120059331628e-05, + "loss": 0.4074, + "step": 572 + }, + { + "epoch": 1.9332491050747524, + "grad_norm": 0.19087275687720429, + "learning_rate": 6.153175304949946e-05, + "loss": 0.411, + "step": 573 + }, + { + "epoch": 1.9366182354179826, + "grad_norm": 0.18049162279809125, + "learning_rate": 6.14521857294953e-05, + "loss": 0.4055, + "step": 574 + }, + { + "epoch": 1.9399873657612128, + "grad_norm": 0.17375875826436044, + "learning_rate": 6.137249907591855e-05, + "loss": 0.4065, + "step": 575 + }, + { + "epoch": 1.943356496104443, + "grad_norm": 0.1739704448036202, + "learning_rate": 6.129269353204769e-05, + "loss": 0.4055, + "step": 576 + }, + { + "epoch": 1.946725626447673, + "grad_norm": 0.18538527661707113, + "learning_rate": 6.121276954182261e-05, + "loss": 0.4097, + "step": 577 + }, + { + "epoch": 1.9500947567909033, + "grad_norm": 0.15156397322647622, + "learning_rate": 6.113272754984206e-05, + "loss": 0.4061, + "step": 578 + }, + { + "epoch": 1.9534638871341334, + "grad_norm": 0.18018187705246097, + "learning_rate": 6.105256800136125e-05, + "loss": 0.4086, + "step": 579 + }, + { + "epoch": 1.9568330174773636, + "grad_norm": 0.1842284584819115, + "learning_rate": 6.0972291342289274e-05, + "loss": 0.413, + "step": 580 + }, + { + "epoch": 1.9602021478205938, + "grad_norm": 0.20065268901018266, + "learning_rate": 6.0891898019186726e-05, + "loss": 0.4068, + "step": 581 + }, + { + "epoch": 1.963571278163824, + "grad_norm": 0.20725303582942523, + "learning_rate": 6.081138847926317e-05, + "loss": 0.4102, + "step": 582 + }, + { + "epoch": 1.966940408507054, + "grad_norm": 0.19644421357341532, + "learning_rate": 6.0730763170374636e-05, + "loss": 0.4053, + "step": 583 + }, + { + "epoch": 1.9703095388502843, + "grad_norm": 0.20950085034614344, + "learning_rate": 6.065002254102116e-05, + "loss": 0.4043, + "step": 584 + }, + { + "epoch": 1.9736786691935144, + "grad_norm": 0.22898989423400687, + "learning_rate": 6.056916704034429e-05, + "loss": 0.4038, + "step": 585 + }, + { + "epoch": 1.9770477995367446, + "grad_norm": 0.2379556008347109, + "learning_rate": 6.048819711812457e-05, + "loss": 0.4075, + "step": 586 + }, + { + "epoch": 1.9804169298799748, + "grad_norm": 0.23608922426333814, + "learning_rate": 6.040711322477906e-05, + "loss": 0.4074, + "step": 587 + }, + { + "epoch": 1.983786060223205, + "grad_norm": 0.2036587578092891, + "learning_rate": 6.032591581135878e-05, + "loss": 0.4116, + "step": 588 + }, + { + "epoch": 1.987155190566435, + "grad_norm": 0.1851902404809834, + "learning_rate": 6.024460532954626e-05, + "loss": 0.4015, + "step": 589 + }, + { + "epoch": 1.9905243209096652, + "grad_norm": 0.18802588423448818, + "learning_rate": 6.0163182231652985e-05, + "loss": 0.4054, + "step": 590 + }, + { + "epoch": 1.9938934512528954, + "grad_norm": 0.22345260630855865, + "learning_rate": 6.008164697061695e-05, + "loss": 0.4055, + "step": 591 + }, + { + "epoch": 1.9972625815961256, + "grad_norm": 0.23969549917986255, + "learning_rate": 6.000000000000001e-05, + "loss": 0.4015, + "step": 592 + }, + { + "epoch": 2.00336913034323, + "grad_norm": 0.2867299003150961, + "learning_rate": 5.991824177398549e-05, + "loss": 0.3913, + "step": 593 + }, + { + "epoch": 2.0067382606864603, + "grad_norm": 0.34375862252314415, + "learning_rate": 5.983637274737558e-05, + "loss": 0.391, + "step": 594 + }, + { + "epoch": 2.0101073910296905, + "grad_norm": 0.3635152444198319, + "learning_rate": 5.975439337558886e-05, + "loss": 0.3799, + "step": 595 + }, + { + "epoch": 2.0134765213729207, + "grad_norm": 0.3422619581016819, + "learning_rate": 5.967230411465768e-05, + "loss": 0.388, + "step": 596 + }, + { + "epoch": 2.016845651716151, + "grad_norm": 0.32857568135445225, + "learning_rate": 5.9590105421225715e-05, + "loss": 0.3873, + "step": 597 + }, + { + "epoch": 2.020214782059381, + "grad_norm": 0.34465546224144156, + "learning_rate": 5.950779775254539e-05, + "loss": 0.3864, + "step": 598 + }, + { + "epoch": 2.023583912402611, + "grad_norm": 0.3318091541966093, + "learning_rate": 5.9425381566475316e-05, + "loss": 0.3901, + "step": 599 + }, + { + "epoch": 2.0269530427458413, + "grad_norm": 0.3211852458337534, + "learning_rate": 5.934285732147778e-05, + "loss": 0.3865, + "step": 600 + }, + { + "epoch": 2.0303221730890715, + "grad_norm": 0.28372803606540153, + "learning_rate": 5.9260225476616157e-05, + "loss": 0.3809, + "step": 601 + }, + { + "epoch": 2.0336913034323016, + "grad_norm": 0.26378333051858827, + "learning_rate": 5.91774864915524e-05, + "loss": 0.3825, + "step": 602 + }, + { + "epoch": 2.037060433775532, + "grad_norm": 0.2699942011391507, + "learning_rate": 5.909464082654442e-05, + "loss": 0.3814, + "step": 603 + }, + { + "epoch": 2.040429564118762, + "grad_norm": 0.32423565538212784, + "learning_rate": 5.90116889424436e-05, + "loss": 0.3949, + "step": 604 + }, + { + "epoch": 2.043798694461992, + "grad_norm": 0.3504800062724603, + "learning_rate": 5.8928631300692185e-05, + "loss": 0.3919, + "step": 605 + }, + { + "epoch": 2.0471678248052223, + "grad_norm": 0.28670213447600656, + "learning_rate": 5.884546836332072e-05, + "loss": 0.3848, + "step": 606 + }, + { + "epoch": 2.0505369551484525, + "grad_norm": 0.24765267252916567, + "learning_rate": 5.8762200592945484e-05, + "loss": 0.3862, + "step": 607 + }, + { + "epoch": 2.0539060854916826, + "grad_norm": 0.25397158563496697, + "learning_rate": 5.867882845276593e-05, + "loss": 0.384, + "step": 608 + }, + { + "epoch": 2.057275215834913, + "grad_norm": 0.19777815923412465, + "learning_rate": 5.859535240656208e-05, + "loss": 0.385, + "step": 609 + }, + { + "epoch": 2.060644346178143, + "grad_norm": 0.25257499668230105, + "learning_rate": 5.851177291869197e-05, + "loss": 0.3902, + "step": 610 + }, + { + "epoch": 2.064013476521373, + "grad_norm": 0.23438152088089984, + "learning_rate": 5.842809045408905e-05, + "loss": 0.3828, + "step": 611 + }, + { + "epoch": 2.0673826068646033, + "grad_norm": 0.24579596547862945, + "learning_rate": 5.834430547825964e-05, + "loss": 0.3895, + "step": 612 + }, + { + "epoch": 2.070751737207833, + "grad_norm": 0.254567202187919, + "learning_rate": 5.826041845728026e-05, + "loss": 0.3884, + "step": 613 + }, + { + "epoch": 2.074120867551063, + "grad_norm": 0.26694805867978466, + "learning_rate": 5.8176429857795104e-05, + "loss": 0.3884, + "step": 614 + }, + { + "epoch": 2.0774899978942933, + "grad_norm": 0.292686078529123, + "learning_rate": 5.809234014701342e-05, + "loss": 0.3869, + "step": 615 + }, + { + "epoch": 2.0808591282375235, + "grad_norm": 0.2543773210365024, + "learning_rate": 5.8008149792706936e-05, + "loss": 0.3841, + "step": 616 + }, + { + "epoch": 2.0842282585807537, + "grad_norm": 0.23117543050120432, + "learning_rate": 5.7923859263207205e-05, + "loss": 0.3839, + "step": 617 + }, + { + "epoch": 2.087597388923984, + "grad_norm": 0.32949270894440474, + "learning_rate": 5.783946902740304e-05, + "loss": 0.3848, + "step": 618 + }, + { + "epoch": 2.090966519267214, + "grad_norm": 0.3487344164810163, + "learning_rate": 5.7754979554737924e-05, + "loss": 0.3841, + "step": 619 + }, + { + "epoch": 2.094335649610444, + "grad_norm": 0.23249972606551436, + "learning_rate": 5.767039131520733e-05, + "loss": 0.3808, + "step": 620 + }, + { + "epoch": 2.0977047799536743, + "grad_norm": 0.1642526127565639, + "learning_rate": 5.758570477935618e-05, + "loss": 0.3852, + "step": 621 + }, + { + "epoch": 2.1010739102969045, + "grad_norm": 0.22737138050339126, + "learning_rate": 5.750092041827618e-05, + "loss": 0.3862, + "step": 622 + }, + { + "epoch": 2.1044430406401347, + "grad_norm": 0.22187422496371617, + "learning_rate": 5.7416038703603216e-05, + "loss": 0.39, + "step": 623 + }, + { + "epoch": 2.107812170983365, + "grad_norm": 0.1976542359852637, + "learning_rate": 5.7331060107514754e-05, + "loss": 0.3828, + "step": 624 + }, + { + "epoch": 2.111181301326595, + "grad_norm": 0.22929255732564582, + "learning_rate": 5.724598510272714e-05, + "loss": 0.3865, + "step": 625 + }, + { + "epoch": 2.114550431669825, + "grad_norm": 0.2281829564525587, + "learning_rate": 5.716081416249307e-05, + "loss": 0.3834, + "step": 626 + }, + { + "epoch": 2.1179195620130553, + "grad_norm": 0.1711530750792344, + "learning_rate": 5.707554776059886e-05, + "loss": 0.3864, + "step": 627 + }, + { + "epoch": 2.1212886923562855, + "grad_norm": 0.1952598465412235, + "learning_rate": 5.699018637136192e-05, + "loss": 0.3853, + "step": 628 + }, + { + "epoch": 2.1246578226995156, + "grad_norm": 0.21178404694012465, + "learning_rate": 5.6904730469627985e-05, + "loss": 0.394, + "step": 629 + }, + { + "epoch": 2.128026953042746, + "grad_norm": 0.2291084803798316, + "learning_rate": 5.681918053076858e-05, + "loss": 0.3851, + "step": 630 + }, + { + "epoch": 2.131396083385976, + "grad_norm": 0.2550272051240587, + "learning_rate": 5.673353703067832e-05, + "loss": 0.3872, + "step": 631 + }, + { + "epoch": 2.134765213729206, + "grad_norm": 0.2497998419444254, + "learning_rate": 5.664780044577231e-05, + "loss": 0.3881, + "step": 632 + }, + { + "epoch": 2.1381343440724363, + "grad_norm": 0.2222082480877385, + "learning_rate": 5.6561971252983424e-05, + "loss": 0.388, + "step": 633 + }, + { + "epoch": 2.1415034744156665, + "grad_norm": 0.18680744639544267, + "learning_rate": 5.6476049929759714e-05, + "loss": 0.3891, + "step": 634 + }, + { + "epoch": 2.1448726047588966, + "grad_norm": 0.21245971460544757, + "learning_rate": 5.6390036954061726e-05, + "loss": 0.3863, + "step": 635 + }, + { + "epoch": 2.148241735102127, + "grad_norm": 0.2162219122370638, + "learning_rate": 5.6303932804359857e-05, + "loss": 0.3909, + "step": 636 + }, + { + "epoch": 2.151610865445357, + "grad_norm": 0.15581628741660436, + "learning_rate": 5.621773795963166e-05, + "loss": 0.3879, + "step": 637 + }, + { + "epoch": 2.154979995788587, + "grad_norm": 0.22990888646168536, + "learning_rate": 5.613145289935926e-05, + "loss": 0.3882, + "step": 638 + }, + { + "epoch": 2.1583491261318173, + "grad_norm": 0.24959544004712048, + "learning_rate": 5.6045078103526545e-05, + "loss": 0.3799, + "step": 639 + }, + { + "epoch": 2.1617182564750475, + "grad_norm": 0.2308113655952683, + "learning_rate": 5.595861405261666e-05, + "loss": 0.3879, + "step": 640 + }, + { + "epoch": 2.1650873868182776, + "grad_norm": 0.2092244335914582, + "learning_rate": 5.58720612276092e-05, + "loss": 0.3871, + "step": 641 + }, + { + "epoch": 2.168456517161508, + "grad_norm": 0.2134067897632055, + "learning_rate": 5.578542010997764e-05, + "loss": 0.3822, + "step": 642 + }, + { + "epoch": 2.171825647504738, + "grad_norm": 0.20839647987055449, + "learning_rate": 5.569869118168655e-05, + "loss": 0.3848, + "step": 643 + }, + { + "epoch": 2.175194777847968, + "grad_norm": 0.16985344503865618, + "learning_rate": 5.561187492518903e-05, + "loss": 0.3858, + "step": 644 + }, + { + "epoch": 2.1785639081911983, + "grad_norm": 0.20941799721128232, + "learning_rate": 5.5524971823423905e-05, + "loss": 0.392, + "step": 645 + }, + { + "epoch": 2.1819330385344284, + "grad_norm": 0.21048667694813664, + "learning_rate": 5.5437982359813156e-05, + "loss": 0.3837, + "step": 646 + }, + { + "epoch": 2.1853021688776586, + "grad_norm": 0.17246060013503955, + "learning_rate": 5.5350907018259135e-05, + "loss": 0.3863, + "step": 647 + }, + { + "epoch": 2.1886712992208888, + "grad_norm": 0.1808917523018754, + "learning_rate": 5.526374628314195e-05, + "loss": 0.3873, + "step": 648 + }, + { + "epoch": 2.192040429564119, + "grad_norm": 0.16962189075007583, + "learning_rate": 5.5176500639316693e-05, + "loss": 0.3806, + "step": 649 + }, + { + "epoch": 2.195409559907349, + "grad_norm": 0.15829489129124838, + "learning_rate": 5.50891705721108e-05, + "loss": 0.3912, + "step": 650 + }, + { + "epoch": 2.1987786902505793, + "grad_norm": 0.20128590320313494, + "learning_rate": 5.5001756567321355e-05, + "loss": 0.3792, + "step": 651 + }, + { + "epoch": 2.2021478205938094, + "grad_norm": 0.218877863583923, + "learning_rate": 5.4914259111212355e-05, + "loss": 0.3865, + "step": 652 + }, + { + "epoch": 2.2055169509370396, + "grad_norm": 0.17606235529471279, + "learning_rate": 5.482667869051199e-05, + "loss": 0.3917, + "step": 653 + }, + { + "epoch": 2.2088860812802693, + "grad_norm": 0.14890556371643418, + "learning_rate": 5.473901579241e-05, + "loss": 0.38, + "step": 654 + }, + { + "epoch": 2.2122552116235, + "grad_norm": 0.1654643380961197, + "learning_rate": 5.4651270904554915e-05, + "loss": 0.394, + "step": 655 + }, + { + "epoch": 2.2156243419667296, + "grad_norm": 0.1570214426630876, + "learning_rate": 5.4563444515051354e-05, + "loss": 0.3854, + "step": 656 + }, + { + "epoch": 2.21899347230996, + "grad_norm": 0.1691883131216727, + "learning_rate": 5.44755371124573e-05, + "loss": 0.3851, + "step": 657 + }, + { + "epoch": 2.22236260265319, + "grad_norm": 0.17557198906026328, + "learning_rate": 5.438754918578144e-05, + "loss": 0.3913, + "step": 658 + }, + { + "epoch": 2.22573173299642, + "grad_norm": 0.16768631591392807, + "learning_rate": 5.429948122448031e-05, + "loss": 0.386, + "step": 659 + }, + { + "epoch": 2.2291008633396503, + "grad_norm": 0.14731731125382688, + "learning_rate": 5.4211333718455756e-05, + "loss": 0.3922, + "step": 660 + }, + { + "epoch": 2.2324699936828805, + "grad_norm": 0.17746489461476853, + "learning_rate": 5.4123107158052034e-05, + "loss": 0.387, + "step": 661 + }, + { + "epoch": 2.2358391240261106, + "grad_norm": 0.1903089984499793, + "learning_rate": 5.4034802034053223e-05, + "loss": 0.3833, + "step": 662 + }, + { + "epoch": 2.239208254369341, + "grad_norm": 0.17184011460057994, + "learning_rate": 5.394641883768041e-05, + "loss": 0.39, + "step": 663 + }, + { + "epoch": 2.242577384712571, + "grad_norm": 0.20233097347593668, + "learning_rate": 5.3857958060588955e-05, + "loss": 0.3891, + "step": 664 + }, + { + "epoch": 2.245946515055801, + "grad_norm": 0.21958650033217517, + "learning_rate": 5.3769420194865806e-05, + "loss": 0.3856, + "step": 665 + }, + { + "epoch": 2.2493156453990313, + "grad_norm": 0.18358377095064263, + "learning_rate": 5.368080573302676e-05, + "loss": 0.3828, + "step": 666 + }, + { + "epoch": 2.2526847757422614, + "grad_norm": 0.17979672984272335, + "learning_rate": 5.359211516801365e-05, + "loss": 0.3804, + "step": 667 + }, + { + "epoch": 2.2560539060854916, + "grad_norm": 0.16294334924828324, + "learning_rate": 5.3503348993191706e-05, + "loss": 0.3825, + "step": 668 + }, + { + "epoch": 2.2594230364287218, + "grad_norm": 0.1508454226549176, + "learning_rate": 5.34145077023467e-05, + "loss": 0.385, + "step": 669 + }, + { + "epoch": 2.262792166771952, + "grad_norm": 0.15470462637665758, + "learning_rate": 5.332559178968231e-05, + "loss": 0.3778, + "step": 670 + }, + { + "epoch": 2.266161297115182, + "grad_norm": 0.1359656397629021, + "learning_rate": 5.3236601749817296e-05, + "loss": 0.3896, + "step": 671 + }, + { + "epoch": 2.2695304274584123, + "grad_norm": 0.15226695399087686, + "learning_rate": 5.314753807778276e-05, + "loss": 0.3874, + "step": 672 + }, + { + "epoch": 2.2728995578016424, + "grad_norm": 0.14503332183422835, + "learning_rate": 5.3058401269019415e-05, + "loss": 0.3878, + "step": 673 + }, + { + "epoch": 2.2762686881448726, + "grad_norm": 0.15318787409886342, + "learning_rate": 5.296919181937485e-05, + "loss": 0.3857, + "step": 674 + }, + { + "epoch": 2.2796378184881028, + "grad_norm": 0.16971373493795616, + "learning_rate": 5.2879910225100655e-05, + "loss": 0.3855, + "step": 675 + }, + { + "epoch": 2.283006948831333, + "grad_norm": 0.1654804092839339, + "learning_rate": 5.279055698284982e-05, + "loss": 0.3877, + "step": 676 + }, + { + "epoch": 2.286376079174563, + "grad_norm": 0.1505186583674958, + "learning_rate": 5.270113258967386e-05, + "loss": 0.3832, + "step": 677 + }, + { + "epoch": 2.2897452095177933, + "grad_norm": 0.16676666984467559, + "learning_rate": 5.261163754302011e-05, + "loss": 0.386, + "step": 678 + }, + { + "epoch": 2.2931143398610234, + "grad_norm": 0.18567032268425918, + "learning_rate": 5.2522072340728896e-05, + "loss": 0.3907, + "step": 679 + }, + { + "epoch": 2.2964834702042536, + "grad_norm": 0.1765483695468527, + "learning_rate": 5.2432437481030855e-05, + "loss": 0.3882, + "step": 680 + }, + { + "epoch": 2.2998526005474837, + "grad_norm": 0.165430115440251, + "learning_rate": 5.234273346254406e-05, + "loss": 0.3946, + "step": 681 + }, + { + "epoch": 2.303221730890714, + "grad_norm": 0.1690494896953244, + "learning_rate": 5.225296078427135e-05, + "loss": 0.3857, + "step": 682 + }, + { + "epoch": 2.306590861233944, + "grad_norm": 0.201198083663681, + "learning_rate": 5.216311994559744e-05, + "loss": 0.389, + "step": 683 + }, + { + "epoch": 2.3099599915771742, + "grad_norm": 0.20812621009650192, + "learning_rate": 5.207321144628628e-05, + "loss": 0.3865, + "step": 684 + }, + { + "epoch": 2.3133291219204044, + "grad_norm": 0.21426999240641148, + "learning_rate": 5.198323578647813e-05, + "loss": 0.3867, + "step": 685 + }, + { + "epoch": 2.3166982522636346, + "grad_norm": 0.213657425755296, + "learning_rate": 5.18931934666869e-05, + "loss": 0.3922, + "step": 686 + }, + { + "epoch": 2.3200673826068647, + "grad_norm": 0.17137164943244815, + "learning_rate": 5.180308498779728e-05, + "loss": 0.3789, + "step": 687 + }, + { + "epoch": 2.323436512950095, + "grad_norm": 0.18022826820320403, + "learning_rate": 5.171291085106202e-05, + "loss": 0.3815, + "step": 688 + }, + { + "epoch": 2.326805643293325, + "grad_norm": 0.1755115364994259, + "learning_rate": 5.162267155809908e-05, + "loss": 0.389, + "step": 689 + }, + { + "epoch": 2.3301747736365552, + "grad_norm": 0.2011673377143987, + "learning_rate": 5.153236761088888e-05, + "loss": 0.3894, + "step": 690 + }, + { + "epoch": 2.3335439039797854, + "grad_norm": 0.2305809255417625, + "learning_rate": 5.14419995117715e-05, + "loss": 0.3811, + "step": 691 + }, + { + "epoch": 2.3369130343230156, + "grad_norm": 0.2115835801437973, + "learning_rate": 5.135156776344389e-05, + "loss": 0.3892, + "step": 692 + }, + { + "epoch": 2.3402821646662453, + "grad_norm": 0.19470845993737926, + "learning_rate": 5.126107286895702e-05, + "loss": 0.3832, + "step": 693 + }, + { + "epoch": 2.343651295009476, + "grad_norm": 0.16438102517886552, + "learning_rate": 5.117051533171321e-05, + "loss": 0.3863, + "step": 694 + }, + { + "epoch": 2.3470204253527056, + "grad_norm": 0.17475480058915455, + "learning_rate": 5.1079895655463177e-05, + "loss": 0.3859, + "step": 695 + }, + { + "epoch": 2.350389555695936, + "grad_norm": 0.18741810484417695, + "learning_rate": 5.098921434430333e-05, + "loss": 0.3825, + "step": 696 + }, + { + "epoch": 2.353758686039166, + "grad_norm": 0.1687881382681767, + "learning_rate": 5.0898471902672917e-05, + "loss": 0.3758, + "step": 697 + }, + { + "epoch": 2.357127816382396, + "grad_norm": 0.18436298872908952, + "learning_rate": 5.080766883535129e-05, + "loss": 0.3852, + "step": 698 + }, + { + "epoch": 2.3604969467256263, + "grad_norm": 0.19845837669577285, + "learning_rate": 5.0716805647455006e-05, + "loss": 0.3854, + "step": 699 + }, + { + "epoch": 2.3638660770688564, + "grad_norm": 0.18343761135804904, + "learning_rate": 5.062588284443505e-05, + "loss": 0.3825, + "step": 700 + }, + { + "epoch": 2.3672352074120866, + "grad_norm": 0.13923107512819735, + "learning_rate": 5.053490093207408e-05, + "loss": 0.3797, + "step": 701 + }, + { + "epoch": 2.3706043377553168, + "grad_norm": 0.1783129344294203, + "learning_rate": 5.0443860416483536e-05, + "loss": 0.3813, + "step": 702 + }, + { + "epoch": 2.373973468098547, + "grad_norm": 0.2047126526455967, + "learning_rate": 5.0352761804100835e-05, + "loss": 0.3869, + "step": 703 + }, + { + "epoch": 2.377342598441777, + "grad_norm": 0.18677317936073162, + "learning_rate": 5.026160560168661e-05, + "loss": 0.3829, + "step": 704 + }, + { + "epoch": 2.3807117287850073, + "grad_norm": 0.15858411985283818, + "learning_rate": 5.0170392316321826e-05, + "loss": 0.3906, + "step": 705 + }, + { + "epoch": 2.3840808591282374, + "grad_norm": 0.1542922309469812, + "learning_rate": 5.0079122455405014e-05, + "loss": 0.3898, + "step": 706 + }, + { + "epoch": 2.3874499894714676, + "grad_norm": 0.16034757146153225, + "learning_rate": 4.9987796526649394e-05, + "loss": 0.3856, + "step": 707 + }, + { + "epoch": 2.3908191198146977, + "grad_norm": 0.17396513204876746, + "learning_rate": 4.989641503808011e-05, + "loss": 0.3845, + "step": 708 + }, + { + "epoch": 2.394188250157928, + "grad_norm": 0.14385199298465493, + "learning_rate": 4.9804978498031326e-05, + "loss": 0.383, + "step": 709 + }, + { + "epoch": 2.397557380501158, + "grad_norm": 0.1424278412585639, + "learning_rate": 4.971348741514349e-05, + "loss": 0.3923, + "step": 710 + }, + { + "epoch": 2.4009265108443882, + "grad_norm": 0.18492577887926495, + "learning_rate": 4.962194229836045e-05, + "loss": 0.3841, + "step": 711 + }, + { + "epoch": 2.4042956411876184, + "grad_norm": 0.1732020596072231, + "learning_rate": 4.95303436569266e-05, + "loss": 0.3915, + "step": 712 + }, + { + "epoch": 2.4076647715308486, + "grad_norm": 0.12301305622548196, + "learning_rate": 4.943869200038413e-05, + "loss": 0.384, + "step": 713 + }, + { + "epoch": 2.4110339018740787, + "grad_norm": 0.18053993824097098, + "learning_rate": 4.934698783857011e-05, + "loss": 0.3817, + "step": 714 + }, + { + "epoch": 2.414403032217309, + "grad_norm": 0.21725687137817615, + "learning_rate": 4.9255231681613674e-05, + "loss": 0.3887, + "step": 715 + }, + { + "epoch": 2.417772162560539, + "grad_norm": 0.17070860183839026, + "learning_rate": 4.91634240399332e-05, + "loss": 0.3842, + "step": 716 + }, + { + "epoch": 2.4211412929037692, + "grad_norm": 0.16062080472612222, + "learning_rate": 4.907156542423351e-05, + "loss": 0.3753, + "step": 717 + }, + { + "epoch": 2.4245104232469994, + "grad_norm": 0.16452143222682503, + "learning_rate": 4.8979656345502904e-05, + "loss": 0.3819, + "step": 718 + }, + { + "epoch": 2.4278795535902296, + "grad_norm": 0.17121464354448115, + "learning_rate": 4.888769731501047e-05, + "loss": 0.3829, + "step": 719 + }, + { + "epoch": 2.4312486839334597, + "grad_norm": 0.1588530781256576, + "learning_rate": 4.8795688844303114e-05, + "loss": 0.3872, + "step": 720 + }, + { + "epoch": 2.43461781427669, + "grad_norm": 0.15259487087295576, + "learning_rate": 4.870363144520279e-05, + "loss": 0.3878, + "step": 721 + }, + { + "epoch": 2.43798694461992, + "grad_norm": 0.15808052014003177, + "learning_rate": 4.861152562980362e-05, + "loss": 0.3827, + "step": 722 + }, + { + "epoch": 2.44135607496315, + "grad_norm": 0.18095527833139824, + "learning_rate": 4.851937191046906e-05, + "loss": 0.3828, + "step": 723 + }, + { + "epoch": 2.4447252053063804, + "grad_norm": 0.17700515235134065, + "learning_rate": 4.8427170799829055e-05, + "loss": 0.3849, + "step": 724 + }, + { + "epoch": 2.4480943356496105, + "grad_norm": 0.15108262997817984, + "learning_rate": 4.833492281077717e-05, + "loss": 0.3827, + "step": 725 + }, + { + "epoch": 2.4514634659928407, + "grad_norm": 0.14610122044801815, + "learning_rate": 4.824262845646771e-05, + "loss": 0.3891, + "step": 726 + }, + { + "epoch": 2.454832596336071, + "grad_norm": 0.17949690552168968, + "learning_rate": 4.815028825031295e-05, + "loss": 0.3824, + "step": 727 + }, + { + "epoch": 2.458201726679301, + "grad_norm": 0.17860414349949053, + "learning_rate": 4.805790270598021e-05, + "loss": 0.3859, + "step": 728 + }, + { + "epoch": 2.461570857022531, + "grad_norm": 0.15714664302158635, + "learning_rate": 4.796547233738901e-05, + "loss": 0.3805, + "step": 729 + }, + { + "epoch": 2.4649399873657614, + "grad_norm": 0.13409742518350323, + "learning_rate": 4.787299765870822e-05, + "loss": 0.3894, + "step": 730 + }, + { + "epoch": 2.4683091177089915, + "grad_norm": 0.1375698590454868, + "learning_rate": 4.77804791843532e-05, + "loss": 0.3885, + "step": 731 + }, + { + "epoch": 2.4716782480522217, + "grad_norm": 0.1382618240475382, + "learning_rate": 4.768791742898292e-05, + "loss": 0.3875, + "step": 732 + }, + { + "epoch": 2.475047378395452, + "grad_norm": 0.1398622806337096, + "learning_rate": 4.7595312907497135e-05, + "loss": 0.3853, + "step": 733 + }, + { + "epoch": 2.4784165087386816, + "grad_norm": 0.14539506330457003, + "learning_rate": 4.7502666135033486e-05, + "loss": 0.3935, + "step": 734 + }, + { + "epoch": 2.481785639081912, + "grad_norm": 0.13109075183048932, + "learning_rate": 4.7409977626964666e-05, + "loss": 0.3848, + "step": 735 + }, + { + "epoch": 2.485154769425142, + "grad_norm": 0.12988278807806955, + "learning_rate": 4.731724789889547e-05, + "loss": 0.3839, + "step": 736 + }, + { + "epoch": 2.4885238997683725, + "grad_norm": 0.1578289932884262, + "learning_rate": 4.722447746666008e-05, + "loss": 0.3836, + "step": 737 + }, + { + "epoch": 2.4918930301116022, + "grad_norm": 0.1696600549846316, + "learning_rate": 4.7131666846319036e-05, + "loss": 0.3825, + "step": 738 + }, + { + "epoch": 2.495262160454833, + "grad_norm": 0.13151686953984587, + "learning_rate": 4.7038816554156484e-05, + "loss": 0.3879, + "step": 739 + }, + { + "epoch": 2.4986312907980626, + "grad_norm": 0.19638702203051203, + "learning_rate": 4.694592710667723e-05, + "loss": 0.3873, + "step": 740 + }, + { + "epoch": 2.502000421141293, + "grad_norm": 0.18899466534966777, + "learning_rate": 4.6852999020603864e-05, + "loss": 0.3808, + "step": 741 + }, + { + "epoch": 2.505369551484523, + "grad_norm": 0.12219071702355794, + "learning_rate": 4.676003281287397e-05, + "loss": 0.3876, + "step": 742 + }, + { + "epoch": 2.508738681827753, + "grad_norm": 0.18236706911247189, + "learning_rate": 4.6667029000637164e-05, + "loss": 0.3846, + "step": 743 + }, + { + "epoch": 2.5121078121709832, + "grad_norm": 0.1684130303158305, + "learning_rate": 4.657398810125225e-05, + "loss": 0.3888, + "step": 744 + }, + { + "epoch": 2.5154769425142134, + "grad_norm": 0.16891778570455948, + "learning_rate": 4.648091063228435e-05, + "loss": 0.3878, + "step": 745 + }, + { + "epoch": 2.5188460728574436, + "grad_norm": 0.16123369621023537, + "learning_rate": 4.638779711150198e-05, + "loss": 0.3888, + "step": 746 + }, + { + "epoch": 2.5222152032006737, + "grad_norm": 0.13513366343949626, + "learning_rate": 4.629464805687426e-05, + "loss": 0.3826, + "step": 747 + }, + { + "epoch": 2.525584333543904, + "grad_norm": 0.1460461212872677, + "learning_rate": 4.620146398656792e-05, + "loss": 0.3841, + "step": 748 + }, + { + "epoch": 2.528953463887134, + "grad_norm": 0.16497117181141158, + "learning_rate": 4.610824541894452e-05, + "loss": 0.3842, + "step": 749 + }, + { + "epoch": 2.532322594230364, + "grad_norm": 0.16290788207612428, + "learning_rate": 4.601499287255748e-05, + "loss": 0.3885, + "step": 750 + }, + { + "epoch": 2.5356917245735944, + "grad_norm": 0.14489151093892186, + "learning_rate": 4.592170686614926e-05, + "loss": 0.3909, + "step": 751 + }, + { + "epoch": 2.5390608549168245, + "grad_norm": 0.1464122207528577, + "learning_rate": 4.582838791864846e-05, + "loss": 0.3864, + "step": 752 + }, + { + "epoch": 2.5424299852600547, + "grad_norm": 0.1543922436683134, + "learning_rate": 4.5735036549166907e-05, + "loss": 0.3781, + "step": 753 + }, + { + "epoch": 2.545799115603285, + "grad_norm": 0.1511363443793848, + "learning_rate": 4.5641653276996774e-05, + "loss": 0.388, + "step": 754 + }, + { + "epoch": 2.549168245946515, + "grad_norm": 0.14775900613642287, + "learning_rate": 4.5548238621607735e-05, + "loss": 0.3829, + "step": 755 + }, + { + "epoch": 2.552537376289745, + "grad_norm": 0.1609040357156897, + "learning_rate": 4.5454793102644006e-05, + "loss": 0.3913, + "step": 756 + }, + { + "epoch": 2.5559065066329754, + "grad_norm": 0.17452716126040962, + "learning_rate": 4.5361317239921515e-05, + "loss": 0.387, + "step": 757 + }, + { + "epoch": 2.5592756369762055, + "grad_norm": 0.15479208730353294, + "learning_rate": 4.5267811553424945e-05, + "loss": 0.3794, + "step": 758 + }, + { + "epoch": 2.5626447673194357, + "grad_norm": 0.16977092756406884, + "learning_rate": 4.517427656330496e-05, + "loss": 0.3813, + "step": 759 + }, + { + "epoch": 2.566013897662666, + "grad_norm": 0.15943557512689435, + "learning_rate": 4.5080712789875154e-05, + "loss": 0.3886, + "step": 760 + }, + { + "epoch": 2.569383028005896, + "grad_norm": 0.15146661036703893, + "learning_rate": 4.498712075360929e-05, + "loss": 0.3779, + "step": 761 + }, + { + "epoch": 2.572752158349126, + "grad_norm": 0.1583016214192411, + "learning_rate": 4.489350097513829e-05, + "loss": 0.3861, + "step": 762 + }, + { + "epoch": 2.5761212886923563, + "grad_norm": 0.18203713661130738, + "learning_rate": 4.479985397524748e-05, + "loss": 0.3872, + "step": 763 + }, + { + "epoch": 2.5794904190355865, + "grad_norm": 0.1411770309939346, + "learning_rate": 4.470618027487354e-05, + "loss": 0.3833, + "step": 764 + }, + { + "epoch": 2.5828595493788167, + "grad_norm": 0.15778048291503943, + "learning_rate": 4.4612480395101736e-05, + "loss": 0.3835, + "step": 765 + }, + { + "epoch": 2.586228679722047, + "grad_norm": 0.20283325612723238, + "learning_rate": 4.451875485716292e-05, + "loss": 0.3804, + "step": 766 + }, + { + "epoch": 2.589597810065277, + "grad_norm": 0.15957667387644875, + "learning_rate": 4.44250041824307e-05, + "loss": 0.3759, + "step": 767 + }, + { + "epoch": 2.592966940408507, + "grad_norm": 0.15580437360078891, + "learning_rate": 4.4331228892418473e-05, + "loss": 0.3869, + "step": 768 + }, + { + "epoch": 2.5963360707517373, + "grad_norm": 0.16733864762153852, + "learning_rate": 4.4237429508776645e-05, + "loss": 0.3901, + "step": 769 + }, + { + "epoch": 2.5997052010949675, + "grad_norm": 0.16840382892762462, + "learning_rate": 4.414360655328957e-05, + "loss": 0.3887, + "step": 770 + }, + { + "epoch": 2.6030743314381977, + "grad_norm": 0.16500477542253614, + "learning_rate": 4.4049760547872786e-05, + "loss": 0.3821, + "step": 771 + }, + { + "epoch": 2.606443461781428, + "grad_norm": 0.17637661287184536, + "learning_rate": 4.395589201457e-05, + "loss": 0.3901, + "step": 772 + }, + { + "epoch": 2.6098125921246575, + "grad_norm": 0.1426864712324038, + "learning_rate": 4.386200147555027e-05, + "loss": 0.3822, + "step": 773 + }, + { + "epoch": 2.613181722467888, + "grad_norm": 0.1359883054124575, + "learning_rate": 4.376808945310505e-05, + "loss": 0.3907, + "step": 774 + }, + { + "epoch": 2.616550852811118, + "grad_norm": 0.15390613245324686, + "learning_rate": 4.3674156469645335e-05, + "loss": 0.3844, + "step": 775 + }, + { + "epoch": 2.6199199831543485, + "grad_norm": 0.12544051069791048, + "learning_rate": 4.358020304769867e-05, + "loss": 0.3848, + "step": 776 + }, + { + "epoch": 2.623289113497578, + "grad_norm": 0.12982821849005882, + "learning_rate": 4.348622970990634e-05, + "loss": 0.386, + "step": 777 + }, + { + "epoch": 2.626658243840809, + "grad_norm": 0.15120996993879657, + "learning_rate": 4.339223697902037e-05, + "loss": 0.3809, + "step": 778 + }, + { + "epoch": 2.6300273741840385, + "grad_norm": 0.13233029817309008, + "learning_rate": 4.329822537790073e-05, + "loss": 0.3841, + "step": 779 + }, + { + "epoch": 2.633396504527269, + "grad_norm": 0.14136223246926025, + "learning_rate": 4.320419542951228e-05, + "loss": 0.3838, + "step": 780 + }, + { + "epoch": 2.636765634870499, + "grad_norm": 0.1228901057783663, + "learning_rate": 4.3110147656922034e-05, + "loss": 0.3802, + "step": 781 + }, + { + "epoch": 2.6401347652137295, + "grad_norm": 0.13251524939594994, + "learning_rate": 4.3016082583296067e-05, + "loss": 0.378, + "step": 782 + }, + { + "epoch": 2.643503895556959, + "grad_norm": 0.13001677701359055, + "learning_rate": 4.292200073189676e-05, + "loss": 0.3841, + "step": 783 + }, + { + "epoch": 2.6468730259001894, + "grad_norm": 0.15991064871524435, + "learning_rate": 4.2827902626079784e-05, + "loss": 0.3875, + "step": 784 + }, + { + "epoch": 2.6502421562434195, + "grad_norm": 0.12111670308432425, + "learning_rate": 4.2733788789291275e-05, + "loss": 0.3873, + "step": 785 + }, + { + "epoch": 2.6536112865866497, + "grad_norm": 0.1593860904845142, + "learning_rate": 4.263965974506483e-05, + "loss": 0.3864, + "step": 786 + }, + { + "epoch": 2.65698041692988, + "grad_norm": 0.16167383614529757, + "learning_rate": 4.254551601701866e-05, + "loss": 0.3845, + "step": 787 + }, + { + "epoch": 2.66034954727311, + "grad_norm": 0.13801503703615994, + "learning_rate": 4.2451358128852654e-05, + "loss": 0.3876, + "step": 788 + }, + { + "epoch": 2.66371867761634, + "grad_norm": 0.13674433021590243, + "learning_rate": 4.23571866043455e-05, + "loss": 0.3836, + "step": 789 + }, + { + "epoch": 2.6670878079595703, + "grad_norm": 0.1567228984572654, + "learning_rate": 4.22630019673517e-05, + "loss": 0.3819, + "step": 790 + }, + { + "epoch": 2.6704569383028005, + "grad_norm": 0.13292233430502193, + "learning_rate": 4.216880474179871e-05, + "loss": 0.3772, + "step": 791 + }, + { + "epoch": 2.6738260686460307, + "grad_norm": 0.14610126476091434, + "learning_rate": 4.207459545168405e-05, + "loss": 0.391, + "step": 792 + }, + { + "epoch": 2.677195198989261, + "grad_norm": 0.1295036399986597, + "learning_rate": 4.198037462107228e-05, + "loss": 0.39, + "step": 793 + }, + { + "epoch": 2.680564329332491, + "grad_norm": 0.14286486693120076, + "learning_rate": 4.188614277409224e-05, + "loss": 0.3824, + "step": 794 + }, + { + "epoch": 2.683933459675721, + "grad_norm": 0.1395089402065071, + "learning_rate": 4.179190043493397e-05, + "loss": 0.3893, + "step": 795 + }, + { + "epoch": 2.6873025900189513, + "grad_norm": 0.1312675673324047, + "learning_rate": 4.169764812784594e-05, + "loss": 0.3839, + "step": 796 + }, + { + "epoch": 2.6906717203621815, + "grad_norm": 0.15056150493927153, + "learning_rate": 4.1603386377132045e-05, + "loss": 0.3766, + "step": 797 + }, + { + "epoch": 2.6940408507054117, + "grad_norm": 0.15234002339266034, + "learning_rate": 4.1509115707148695e-05, + "loss": 0.3875, + "step": 798 + }, + { + "epoch": 2.697409981048642, + "grad_norm": 0.14172473902716337, + "learning_rate": 4.1414836642301954e-05, + "loss": 0.3835, + "step": 799 + }, + { + "epoch": 2.700779111391872, + "grad_norm": 0.1244063349961557, + "learning_rate": 4.132054970704454e-05, + "loss": 0.384, + "step": 800 + }, + { + "epoch": 2.704148241735102, + "grad_norm": 0.13151454461470574, + "learning_rate": 4.122625542587301e-05, + "loss": 0.3814, + "step": 801 + }, + { + "epoch": 2.7075173720783323, + "grad_norm": 0.13472018853386267, + "learning_rate": 4.1131954323324734e-05, + "loss": 0.3832, + "step": 802 + }, + { + "epoch": 2.7108865024215625, + "grad_norm": 0.14391402812007115, + "learning_rate": 4.103764692397504e-05, + "loss": 0.3907, + "step": 803 + }, + { + "epoch": 2.7142556327647926, + "grad_norm": 0.1204377593661656, + "learning_rate": 4.094333375243428e-05, + "loss": 0.3779, + "step": 804 + }, + { + "epoch": 2.717624763108023, + "grad_norm": 0.1345036853381592, + "learning_rate": 4.084901533334495e-05, + "loss": 0.3837, + "step": 805 + }, + { + "epoch": 2.720993893451253, + "grad_norm": 0.151432229349483, + "learning_rate": 4.075469219137868e-05, + "loss": 0.3867, + "step": 806 + }, + { + "epoch": 2.724363023794483, + "grad_norm": 0.13412559508113278, + "learning_rate": 4.066036485123344e-05, + "loss": 0.3809, + "step": 807 + }, + { + "epoch": 2.7277321541377133, + "grad_norm": 0.1407083620047968, + "learning_rate": 4.056603383763049e-05, + "loss": 0.3893, + "step": 808 + }, + { + "epoch": 2.7311012844809435, + "grad_norm": 0.1304023157361848, + "learning_rate": 4.0471699675311564e-05, + "loss": 0.3873, + "step": 809 + }, + { + "epoch": 2.7344704148241736, + "grad_norm": 0.13069329962842927, + "learning_rate": 4.0377362889035875e-05, + "loss": 0.3845, + "step": 810 + }, + { + "epoch": 2.737839545167404, + "grad_norm": 0.134836479542485, + "learning_rate": 4.0283024003577284e-05, + "loss": 0.3806, + "step": 811 + }, + { + "epoch": 2.741208675510634, + "grad_norm": 0.12753418534583713, + "learning_rate": 4.0188683543721295e-05, + "loss": 0.3797, + "step": 812 + }, + { + "epoch": 2.744577805853864, + "grad_norm": 0.13228859664320883, + "learning_rate": 4.009434203426215e-05, + "loss": 0.3856, + "step": 813 + }, + { + "epoch": 2.747946936197094, + "grad_norm": 0.14892311316819778, + "learning_rate": 4e-05, + "loss": 0.3838, + "step": 814 + }, + { + "epoch": 2.7513160665403245, + "grad_norm": 0.13386473278676905, + "learning_rate": 3.9905657965737854e-05, + "loss": 0.3829, + "step": 815 + }, + { + "epoch": 2.754685196883554, + "grad_norm": 0.14219980607382138, + "learning_rate": 3.981131645627872e-05, + "loss": 0.3819, + "step": 816 + }, + { + "epoch": 2.758054327226785, + "grad_norm": 0.1388449346696737, + "learning_rate": 3.971697599642273e-05, + "loss": 0.3834, + "step": 817 + }, + { + "epoch": 2.7614234575700145, + "grad_norm": 0.12977851410941868, + "learning_rate": 3.9622637110964125e-05, + "loss": 0.3831, + "step": 818 + }, + { + "epoch": 2.764792587913245, + "grad_norm": 0.13978459681010671, + "learning_rate": 3.9528300324688456e-05, + "loss": 0.383, + "step": 819 + }, + { + "epoch": 2.768161718256475, + "grad_norm": 0.13570459222433323, + "learning_rate": 3.943396616236953e-05, + "loss": 0.3851, + "step": 820 + }, + { + "epoch": 2.7715308485997054, + "grad_norm": 0.1347307304770039, + "learning_rate": 3.933963514876657e-05, + "loss": 0.3872, + "step": 821 + }, + { + "epoch": 2.774899978942935, + "grad_norm": 0.14708015270111557, + "learning_rate": 3.9245307808621325e-05, + "loss": 0.385, + "step": 822 + }, + { + "epoch": 2.7782691092861658, + "grad_norm": 0.12260128131766068, + "learning_rate": 3.915098466665506e-05, + "loss": 0.3855, + "step": 823 + }, + { + "epoch": 2.7816382396293955, + "grad_norm": 0.12292579106408079, + "learning_rate": 3.905666624756573e-05, + "loss": 0.3869, + "step": 824 + }, + { + "epoch": 2.7850073699726257, + "grad_norm": 0.14492807851132256, + "learning_rate": 3.8962353076024984e-05, + "loss": 0.3821, + "step": 825 + }, + { + "epoch": 2.788376500315856, + "grad_norm": 0.14449210295060477, + "learning_rate": 3.886804567667528e-05, + "loss": 0.3808, + "step": 826 + }, + { + "epoch": 2.791745630659086, + "grad_norm": 0.12971902156372891, + "learning_rate": 3.8773744574127e-05, + "loss": 0.3878, + "step": 827 + }, + { + "epoch": 2.795114761002316, + "grad_norm": 0.14230416274316593, + "learning_rate": 3.867945029295546e-05, + "loss": 0.3814, + "step": 828 + }, + { + "epoch": 2.7984838913455463, + "grad_norm": 0.1224339186137515, + "learning_rate": 3.858516335769806e-05, + "loss": 0.3819, + "step": 829 + }, + { + "epoch": 2.8018530216887765, + "grad_norm": 0.16733669157218356, + "learning_rate": 3.8490884292851325e-05, + "loss": 0.3825, + "step": 830 + }, + { + "epoch": 2.8052221520320066, + "grad_norm": 0.13398557625334945, + "learning_rate": 3.839661362286797e-05, + "loss": 0.3785, + "step": 831 + }, + { + "epoch": 2.808591282375237, + "grad_norm": 0.14930405489150408, + "learning_rate": 3.830235187215408e-05, + "loss": 0.3806, + "step": 832 + }, + { + "epoch": 2.811960412718467, + "grad_norm": 0.14534442897149916, + "learning_rate": 3.820809956506604e-05, + "loss": 0.3869, + "step": 833 + }, + { + "epoch": 2.815329543061697, + "grad_norm": 0.14294161233646072, + "learning_rate": 3.8113857225907783e-05, + "loss": 0.3834, + "step": 834 + }, + { + "epoch": 2.8186986734049273, + "grad_norm": 0.1304818403113972, + "learning_rate": 3.801962537892773e-05, + "loss": 0.3917, + "step": 835 + }, + { + "epoch": 2.8220678037481575, + "grad_norm": 0.16153213081562928, + "learning_rate": 3.792540454831596e-05, + "loss": 0.3877, + "step": 836 + }, + { + "epoch": 2.8254369340913876, + "grad_norm": 0.12199316427929723, + "learning_rate": 3.7831195258201295e-05, + "loss": 0.3836, + "step": 837 + }, + { + "epoch": 2.828806064434618, + "grad_norm": 0.14527010576989632, + "learning_rate": 3.7736998032648305e-05, + "loss": 0.3827, + "step": 838 + }, + { + "epoch": 2.832175194777848, + "grad_norm": 0.15971096124557288, + "learning_rate": 3.7642813395654504e-05, + "loss": 0.3801, + "step": 839 + }, + { + "epoch": 2.835544325121078, + "grad_norm": 0.12345484787366505, + "learning_rate": 3.754864187114736e-05, + "loss": 0.3855, + "step": 840 + }, + { + "epoch": 2.8389134554643083, + "grad_norm": 0.13837193216510435, + "learning_rate": 3.745448398298135e-05, + "loss": 0.3828, + "step": 841 + }, + { + "epoch": 2.8422825858075385, + "grad_norm": 0.1545419687841436, + "learning_rate": 3.736034025493519e-05, + "loss": 0.3821, + "step": 842 + }, + { + "epoch": 2.8456517161507686, + "grad_norm": 0.12965815907805744, + "learning_rate": 3.726621121070873e-05, + "loss": 0.3885, + "step": 843 + }, + { + "epoch": 2.849020846493999, + "grad_norm": 0.14437205080738458, + "learning_rate": 3.717209737392022e-05, + "loss": 0.3757, + "step": 844 + }, + { + "epoch": 2.852389976837229, + "grad_norm": 0.13760242198629977, + "learning_rate": 3.707799926810326e-05, + "loss": 0.3841, + "step": 845 + }, + { + "epoch": 2.855759107180459, + "grad_norm": 0.16923959033588096, + "learning_rate": 3.698391741670394e-05, + "loss": 0.3837, + "step": 846 + }, + { + "epoch": 2.8591282375236893, + "grad_norm": 0.1483758913428858, + "learning_rate": 3.688985234307798e-05, + "loss": 0.3854, + "step": 847 + }, + { + "epoch": 2.8624973678669194, + "grad_norm": 0.1409446277936609, + "learning_rate": 3.679580457048772e-05, + "loss": 0.3865, + "step": 848 + }, + { + "epoch": 2.8658664982101496, + "grad_norm": 0.13848959127311186, + "learning_rate": 3.6701774622099286e-05, + "loss": 0.3847, + "step": 849 + }, + { + "epoch": 2.8692356285533798, + "grad_norm": 0.13440901679008035, + "learning_rate": 3.660776302097965e-05, + "loss": 0.3809, + "step": 850 + }, + { + "epoch": 2.87260475889661, + "grad_norm": 0.13528288220600784, + "learning_rate": 3.6513770290093674e-05, + "loss": 0.3844, + "step": 851 + }, + { + "epoch": 2.87597388923984, + "grad_norm": 0.11930769920642463, + "learning_rate": 3.641979695230135e-05, + "loss": 0.3853, + "step": 852 + }, + { + "epoch": 2.8793430195830703, + "grad_norm": 0.1302640412084013, + "learning_rate": 3.632584353035467e-05, + "loss": 0.3834, + "step": 853 + }, + { + "epoch": 2.8827121499263004, + "grad_norm": 0.12093299855424389, + "learning_rate": 3.6231910546894956e-05, + "loss": 0.3851, + "step": 854 + }, + { + "epoch": 2.8860812802695306, + "grad_norm": 0.1342477899550942, + "learning_rate": 3.613799852444975e-05, + "loss": 0.3883, + "step": 855 + }, + { + "epoch": 2.8894504106127608, + "grad_norm": 0.11778883888529185, + "learning_rate": 3.6044107985430015e-05, + "loss": 0.3823, + "step": 856 + }, + { + "epoch": 2.8928195409559905, + "grad_norm": 0.12271043639462616, + "learning_rate": 3.595023945212723e-05, + "loss": 0.3816, + "step": 857 + }, + { + "epoch": 2.896188671299221, + "grad_norm": 0.12188701757865371, + "learning_rate": 3.585639344671043e-05, + "loss": 0.3863, + "step": 858 + }, + { + "epoch": 2.899557801642451, + "grad_norm": 0.12511895990769892, + "learning_rate": 3.576257049122336e-05, + "loss": 0.3829, + "step": 859 + }, + { + "epoch": 2.9029269319856814, + "grad_norm": 0.12002503720509249, + "learning_rate": 3.5668771107581526e-05, + "loss": 0.377, + "step": 860 + }, + { + "epoch": 2.906296062328911, + "grad_norm": 0.12993074211163566, + "learning_rate": 3.5574995817569317e-05, + "loss": 0.3755, + "step": 861 + }, + { + "epoch": 2.9096651926721417, + "grad_norm": 0.10532634808065627, + "learning_rate": 3.5481245142837095e-05, + "loss": 0.3869, + "step": 862 + }, + { + "epoch": 2.9130343230153715, + "grad_norm": 0.1296191433786778, + "learning_rate": 3.5387519604898264e-05, + "loss": 0.382, + "step": 863 + }, + { + "epoch": 2.916403453358602, + "grad_norm": 0.10734185230078218, + "learning_rate": 3.5293819725126464e-05, + "loss": 0.3849, + "step": 864 + }, + { + "epoch": 2.919772583701832, + "grad_norm": 0.1077939586524133, + "learning_rate": 3.520014602475252e-05, + "loss": 0.3828, + "step": 865 + }, + { + "epoch": 2.923141714045062, + "grad_norm": 0.12191898052299041, + "learning_rate": 3.5106499024861715e-05, + "loss": 0.3809, + "step": 866 + }, + { + "epoch": 2.926510844388292, + "grad_norm": 0.12081068176606237, + "learning_rate": 3.501287924639074e-05, + "loss": 0.3892, + "step": 867 + }, + { + "epoch": 2.9298799747315223, + "grad_norm": 0.13361270574401832, + "learning_rate": 3.491928721012485e-05, + "loss": 0.3818, + "step": 868 + }, + { + "epoch": 2.9332491050747524, + "grad_norm": 0.12126810590661805, + "learning_rate": 3.482572343669506e-05, + "loss": 0.3834, + "step": 869 + }, + { + "epoch": 2.9366182354179826, + "grad_norm": 0.1258581729968798, + "learning_rate": 3.4732188446575055e-05, + "loss": 0.3822, + "step": 870 + }, + { + "epoch": 2.939987365761213, + "grad_norm": 0.11858345315742196, + "learning_rate": 3.4638682760078505e-05, + "loss": 0.3922, + "step": 871 + }, + { + "epoch": 2.943356496104443, + "grad_norm": 0.11372309799338015, + "learning_rate": 3.454520689735602e-05, + "loss": 0.3824, + "step": 872 + }, + { + "epoch": 2.946725626447673, + "grad_norm": 0.14113850726940133, + "learning_rate": 3.445176137839227e-05, + "loss": 0.3796, + "step": 873 + }, + { + "epoch": 2.9500947567909033, + "grad_norm": 0.11612037625898579, + "learning_rate": 3.435834672300324e-05, + "loss": 0.3873, + "step": 874 + }, + { + "epoch": 2.9534638871341334, + "grad_norm": 0.12263857158882245, + "learning_rate": 3.426496345083309e-05, + "loss": 0.3807, + "step": 875 + }, + { + "epoch": 2.9568330174773636, + "grad_norm": 0.13787793243918434, + "learning_rate": 3.417161208135155e-05, + "loss": 0.3865, + "step": 876 + }, + { + "epoch": 2.9602021478205938, + "grad_norm": 0.12537808395950803, + "learning_rate": 3.407829313385075e-05, + "loss": 0.3887, + "step": 877 + }, + { + "epoch": 2.963571278163824, + "grad_norm": 0.1233586121783003, + "learning_rate": 3.398500712744254e-05, + "loss": 0.3831, + "step": 878 + }, + { + "epoch": 2.966940408507054, + "grad_norm": 0.127510517027595, + "learning_rate": 3.38917545810555e-05, + "loss": 0.3855, + "step": 879 + }, + { + "epoch": 2.9703095388502843, + "grad_norm": 0.12958054002462321, + "learning_rate": 3.379853601343209e-05, + "loss": 0.3867, + "step": 880 + }, + { + "epoch": 2.9736786691935144, + "grad_norm": 0.11339310625974686, + "learning_rate": 3.3705351943125755e-05, + "loss": 0.381, + "step": 881 + }, + { + "epoch": 2.9770477995367446, + "grad_norm": 0.1441132631100554, + "learning_rate": 3.361220288849804e-05, + "loss": 0.3853, + "step": 882 + }, + { + "epoch": 2.9804169298799748, + "grad_norm": 0.12590761879480403, + "learning_rate": 3.351908936771566e-05, + "loss": 0.3821, + "step": 883 + }, + { + "epoch": 2.983786060223205, + "grad_norm": 0.12580062137496578, + "learning_rate": 3.342601189874777e-05, + "loss": 0.3912, + "step": 884 + }, + { + "epoch": 2.987155190566435, + "grad_norm": 0.1375861040816144, + "learning_rate": 3.3332970999362836e-05, + "loss": 0.3843, + "step": 885 + }, + { + "epoch": 2.9905243209096652, + "grad_norm": 0.11745115999108842, + "learning_rate": 3.323996718712605e-05, + "loss": 0.3793, + "step": 886 + }, + { + "epoch": 2.9938934512528954, + "grad_norm": 0.1154957553487754, + "learning_rate": 3.3147000979396156e-05, + "loss": 0.386, + "step": 887 + }, + { + "epoch": 2.9972625815961256, + "grad_norm": 0.14419491852541183, + "learning_rate": 3.305407289332279e-05, + "loss": 0.387, + "step": 888 + }, + { + "epoch": 3.00336913034323, + "grad_norm": 0.17453356323499444, + "learning_rate": 3.296118344584352e-05, + "loss": 0.3658, + "step": 889 + }, + { + "epoch": 3.0067382606864603, + "grad_norm": 0.19958889229278365, + "learning_rate": 3.2868333153680964e-05, + "loss": 0.3563, + "step": 890 + }, + { + "epoch": 3.0101073910296905, + "grad_norm": 0.14823110731719627, + "learning_rate": 3.277552253333993e-05, + "loss": 0.3592, + "step": 891 + }, + { + "epoch": 3.0134765213729207, + "grad_norm": 0.15078557386759514, + "learning_rate": 3.2682752101104536e-05, + "loss": 0.3648, + "step": 892 + }, + { + "epoch": 3.016845651716151, + "grad_norm": 0.15261085897213972, + "learning_rate": 3.259002237303535e-05, + "loss": 0.365, + "step": 893 + }, + { + "epoch": 3.020214782059381, + "grad_norm": 0.12773087702299238, + "learning_rate": 3.249733386496653e-05, + "loss": 0.359, + "step": 894 + }, + { + "epoch": 3.023583912402611, + "grad_norm": 0.13787164527794113, + "learning_rate": 3.2404687092502865e-05, + "loss": 0.361, + "step": 895 + }, + { + "epoch": 3.0269530427458413, + "grad_norm": 0.15396809320630023, + "learning_rate": 3.231208257101709e-05, + "loss": 0.3639, + "step": 896 + }, + { + "epoch": 3.0303221730890715, + "grad_norm": 0.13565056548247828, + "learning_rate": 3.221952081564682e-05, + "loss": 0.3632, + "step": 897 + }, + { + "epoch": 3.0336913034323016, + "grad_norm": 0.16070873287428322, + "learning_rate": 3.212700234129179e-05, + "loss": 0.3594, + "step": 898 + }, + { + "epoch": 3.037060433775532, + "grad_norm": 0.14022297658804933, + "learning_rate": 3.2034527662611e-05, + "loss": 0.363, + "step": 899 + }, + { + "epoch": 3.040429564118762, + "grad_norm": 0.14407779140042834, + "learning_rate": 3.194209729401979e-05, + "loss": 0.3612, + "step": 900 + }, + { + "epoch": 3.043798694461992, + "grad_norm": 0.13752049086764745, + "learning_rate": 3.184971174968705e-05, + "loss": 0.3645, + "step": 901 + }, + { + "epoch": 3.0471678248052223, + "grad_norm": 0.13823706755645496, + "learning_rate": 3.175737154353231e-05, + "loss": 0.3626, + "step": 902 + }, + { + "epoch": 3.0505369551484525, + "grad_norm": 0.16264110826907188, + "learning_rate": 3.166507718922285e-05, + "loss": 0.3566, + "step": 903 + }, + { + "epoch": 3.0539060854916826, + "grad_norm": 0.15511577954565434, + "learning_rate": 3.157282920017096e-05, + "loss": 0.361, + "step": 904 + }, + { + "epoch": 3.057275215834913, + "grad_norm": 0.15232517037403773, + "learning_rate": 3.1480628089530943e-05, + "loss": 0.3662, + "step": 905 + }, + { + "epoch": 3.060644346178143, + "grad_norm": 0.17112367414740937, + "learning_rate": 3.1388474370196395e-05, + "loss": 0.3638, + "step": 906 + }, + { + "epoch": 3.064013476521373, + "grad_norm": 0.12748688705449465, + "learning_rate": 3.129636855479723e-05, + "loss": 0.3579, + "step": 907 + }, + { + "epoch": 3.0673826068646033, + "grad_norm": 0.14714355107055627, + "learning_rate": 3.12043111556969e-05, + "loss": 0.3582, + "step": 908 + }, + { + "epoch": 3.070751737207833, + "grad_norm": 0.13462631797401237, + "learning_rate": 3.111230268498954e-05, + "loss": 0.367, + "step": 909 + }, + { + "epoch": 3.074120867551063, + "grad_norm": 0.1372418048121636, + "learning_rate": 3.1020343654497096e-05, + "loss": 0.3588, + "step": 910 + }, + { + "epoch": 3.0774899978942933, + "grad_norm": 0.13072048530956415, + "learning_rate": 3.0928434575766505e-05, + "loss": 0.361, + "step": 911 + }, + { + "epoch": 3.0808591282375235, + "grad_norm": 0.12852995212281998, + "learning_rate": 3.083657596006681e-05, + "loss": 0.3543, + "step": 912 + }, + { + "epoch": 3.0842282585807537, + "grad_norm": 0.12589969103284174, + "learning_rate": 3.0744768318386346e-05, + "loss": 0.3573, + "step": 913 + }, + { + "epoch": 3.087597388923984, + "grad_norm": 0.1042227599830766, + "learning_rate": 3.065301216142991e-05, + "loss": 0.3571, + "step": 914 + }, + { + "epoch": 3.090966519267214, + "grad_norm": 0.12641784465437736, + "learning_rate": 3.056130799961587e-05, + "loss": 0.361, + "step": 915 + }, + { + "epoch": 3.094335649610444, + "grad_norm": 0.1189011090318916, + "learning_rate": 3.046965634307341e-05, + "loss": 0.3653, + "step": 916 + }, + { + "epoch": 3.0977047799536743, + "grad_norm": 0.11559017960748716, + "learning_rate": 3.0378057701639575e-05, + "loss": 0.371, + "step": 917 + }, + { + "epoch": 3.1010739102969045, + "grad_norm": 0.1198695027252497, + "learning_rate": 3.028651258485652e-05, + "loss": 0.3667, + "step": 918 + }, + { + "epoch": 3.1044430406401347, + "grad_norm": 0.11196979369755074, + "learning_rate": 3.019502150196869e-05, + "loss": 0.3575, + "step": 919 + }, + { + "epoch": 3.107812170983365, + "grad_norm": 0.12042692289106809, + "learning_rate": 3.010358496191991e-05, + "loss": 0.3618, + "step": 920 + }, + { + "epoch": 3.111181301326595, + "grad_norm": 0.1238521643735063, + "learning_rate": 3.0012203473350616e-05, + "loss": 0.3672, + "step": 921 + }, + { + "epoch": 3.114550431669825, + "grad_norm": 0.11597672612469004, + "learning_rate": 2.9920877544595002e-05, + "loss": 0.3577, + "step": 922 + }, + { + "epoch": 3.1179195620130553, + "grad_norm": 0.11363631100554263, + "learning_rate": 2.982960768367818e-05, + "loss": 0.3637, + "step": 923 + }, + { + "epoch": 3.1212886923562855, + "grad_norm": 0.12223781700368476, + "learning_rate": 2.9738394398313405e-05, + "loss": 0.3575, + "step": 924 + }, + { + "epoch": 3.1246578226995156, + "grad_norm": 0.11310391813366659, + "learning_rate": 2.9647238195899168e-05, + "loss": 0.3666, + "step": 925 + }, + { + "epoch": 3.128026953042746, + "grad_norm": 0.12749001851980382, + "learning_rate": 2.955613958351647e-05, + "loss": 0.3577, + "step": 926 + }, + { + "epoch": 3.131396083385976, + "grad_norm": 0.11106465012495607, + "learning_rate": 2.946509906792593e-05, + "loss": 0.3661, + "step": 927 + }, + { + "epoch": 3.134765213729206, + "grad_norm": 0.13265615613597764, + "learning_rate": 2.9374117155564957e-05, + "loss": 0.3613, + "step": 928 + }, + { + "epoch": 3.1381343440724363, + "grad_norm": 0.1062334645184232, + "learning_rate": 2.928319435254501e-05, + "loss": 0.3601, + "step": 929 + }, + { + "epoch": 3.1415034744156665, + "grad_norm": 0.13654759521524176, + "learning_rate": 2.919233116464872e-05, + "loss": 0.357, + "step": 930 + }, + { + "epoch": 3.1448726047588966, + "grad_norm": 0.12274484555896063, + "learning_rate": 2.9101528097327093e-05, + "loss": 0.3659, + "step": 931 + }, + { + "epoch": 3.148241735102127, + "grad_norm": 0.11432950773248603, + "learning_rate": 2.9010785655696698e-05, + "loss": 0.3638, + "step": 932 + }, + { + "epoch": 3.151610865445357, + "grad_norm": 0.11354842248203202, + "learning_rate": 2.892010434453684e-05, + "loss": 0.36, + "step": 933 + }, + { + "epoch": 3.154979995788587, + "grad_norm": 0.12098639250864718, + "learning_rate": 2.88294846682868e-05, + "loss": 0.3591, + "step": 934 + }, + { + "epoch": 3.1583491261318173, + "grad_norm": 0.11027079481756498, + "learning_rate": 2.873892713104298e-05, + "loss": 0.3595, + "step": 935 + }, + { + "epoch": 3.1617182564750475, + "grad_norm": 0.12568594872253705, + "learning_rate": 2.864843223655613e-05, + "loss": 0.3678, + "step": 936 + }, + { + "epoch": 3.1650873868182776, + "grad_norm": 0.11667961571614835, + "learning_rate": 2.855800048822852e-05, + "loss": 0.3608, + "step": 937 + }, + { + "epoch": 3.168456517161508, + "grad_norm": 0.11058294572640527, + "learning_rate": 2.8467632389111126e-05, + "loss": 0.3683, + "step": 938 + }, + { + "epoch": 3.171825647504738, + "grad_norm": 0.1187950796415824, + "learning_rate": 2.837732844190094e-05, + "loss": 0.3644, + "step": 939 + }, + { + "epoch": 3.175194777847968, + "grad_norm": 0.10656085663558766, + "learning_rate": 2.828708914893799e-05, + "loss": 0.3671, + "step": 940 + }, + { + "epoch": 3.1785639081911983, + "grad_norm": 0.10817099139196962, + "learning_rate": 2.8196915012202728e-05, + "loss": 0.3672, + "step": 941 + }, + { + "epoch": 3.1819330385344284, + "grad_norm": 0.10075876050195509, + "learning_rate": 2.8106806533313106e-05, + "loss": 0.3631, + "step": 942 + }, + { + "epoch": 3.1853021688776586, + "grad_norm": 0.11551691063136907, + "learning_rate": 2.8016764213521875e-05, + "loss": 0.3608, + "step": 943 + }, + { + "epoch": 3.1886712992208888, + "grad_norm": 0.10092150997385874, + "learning_rate": 2.7926788553713734e-05, + "loss": 0.3652, + "step": 944 + }, + { + "epoch": 3.192040429564119, + "grad_norm": 0.11020311291162539, + "learning_rate": 2.783688005440256e-05, + "loss": 0.3656, + "step": 945 + }, + { + "epoch": 3.195409559907349, + "grad_norm": 0.10850184905841719, + "learning_rate": 2.7747039215728667e-05, + "loss": 0.3648, + "step": 946 + }, + { + "epoch": 3.1987786902505793, + "grad_norm": 0.10954311066114457, + "learning_rate": 2.7657266537455938e-05, + "loss": 0.3651, + "step": 947 + }, + { + "epoch": 3.2021478205938094, + "grad_norm": 0.10365234676829252, + "learning_rate": 2.7567562518969155e-05, + "loss": 0.3533, + "step": 948 + }, + { + "epoch": 3.2055169509370396, + "grad_norm": 0.10204242463146666, + "learning_rate": 2.7477927659271117e-05, + "loss": 0.3622, + "step": 949 + }, + { + "epoch": 3.2088860812802693, + "grad_norm": 0.10799341793471445, + "learning_rate": 2.7388362456979906e-05, + "loss": 0.3625, + "step": 950 + }, + { + "epoch": 3.2122552116235, + "grad_norm": 0.11115544373524708, + "learning_rate": 2.7298867410326155e-05, + "loss": 0.3629, + "step": 951 + }, + { + "epoch": 3.2156243419667296, + "grad_norm": 0.10949003369065348, + "learning_rate": 2.7209443017150193e-05, + "loss": 0.3635, + "step": 952 + }, + { + "epoch": 3.21899347230996, + "grad_norm": 0.10963161775177817, + "learning_rate": 2.712008977489936e-05, + "loss": 0.3594, + "step": 953 + }, + { + "epoch": 3.22236260265319, + "grad_norm": 0.11805544027584379, + "learning_rate": 2.703080818062517e-05, + "loss": 0.3635, + "step": 954 + }, + { + "epoch": 3.22573173299642, + "grad_norm": 0.10196046146217858, + "learning_rate": 2.694159873098058e-05, + "loss": 0.3626, + "step": 955 + }, + { + "epoch": 3.2291008633396503, + "grad_norm": 0.1120026689331707, + "learning_rate": 2.6852461922217253e-05, + "loss": 0.3649, + "step": 956 + }, + { + "epoch": 3.2324699936828805, + "grad_norm": 0.10926346301227147, + "learning_rate": 2.6763398250182714e-05, + "loss": 0.3579, + "step": 957 + }, + { + "epoch": 3.2358391240261106, + "grad_norm": 0.10913175373351278, + "learning_rate": 2.66744082103177e-05, + "loss": 0.3639, + "step": 958 + }, + { + "epoch": 3.239208254369341, + "grad_norm": 0.10485736112258066, + "learning_rate": 2.658549229765332e-05, + "loss": 0.3592, + "step": 959 + }, + { + "epoch": 3.242577384712571, + "grad_norm": 0.12101416878728995, + "learning_rate": 2.6496651006808308e-05, + "loss": 0.3574, + "step": 960 + }, + { + "epoch": 3.245946515055801, + "grad_norm": 0.1071236277697119, + "learning_rate": 2.6407884831986367e-05, + "loss": 0.3627, + "step": 961 + }, + { + "epoch": 3.2493156453990313, + "grad_norm": 0.11778875174805165, + "learning_rate": 2.6319194266973256e-05, + "loss": 0.365, + "step": 962 + }, + { + "epoch": 3.2526847757422614, + "grad_norm": 0.12437906053481307, + "learning_rate": 2.6230579805134203e-05, + "loss": 0.3582, + "step": 963 + }, + { + "epoch": 3.2560539060854916, + "grad_norm": 0.11016391828701566, + "learning_rate": 2.614204193941107e-05, + "loss": 0.3628, + "step": 964 + }, + { + "epoch": 3.2594230364287218, + "grad_norm": 0.131288542140626, + "learning_rate": 2.6053581162319606e-05, + "loss": 0.3634, + "step": 965 + }, + { + "epoch": 3.262792166771952, + "grad_norm": 0.10515921544500577, + "learning_rate": 2.5965197965946783e-05, + "loss": 0.3649, + "step": 966 + }, + { + "epoch": 3.266161297115182, + "grad_norm": 0.12739731098762894, + "learning_rate": 2.587689284194797e-05, + "loss": 0.3703, + "step": 967 + }, + { + "epoch": 3.2695304274584123, + "grad_norm": 0.10406377203116793, + "learning_rate": 2.5788666281544258e-05, + "loss": 0.3657, + "step": 968 + }, + { + "epoch": 3.2728995578016424, + "grad_norm": 0.11191689402983139, + "learning_rate": 2.5700518775519702e-05, + "loss": 0.359, + "step": 969 + }, + { + "epoch": 3.2762686881448726, + "grad_norm": 0.10680144927044027, + "learning_rate": 2.561245081421857e-05, + "loss": 0.3604, + "step": 970 + }, + { + "epoch": 3.2796378184881028, + "grad_norm": 0.11505057898142523, + "learning_rate": 2.5524462887542703e-05, + "loss": 0.3599, + "step": 971 + }, + { + "epoch": 3.283006948831333, + "grad_norm": 0.10674300454641518, + "learning_rate": 2.5436555484948643e-05, + "loss": 0.3625, + "step": 972 + }, + { + "epoch": 3.286376079174563, + "grad_norm": 0.10772282874724956, + "learning_rate": 2.534872909544509e-05, + "loss": 0.3586, + "step": 973 + }, + { + "epoch": 3.2897452095177933, + "grad_norm": 0.11061913724144044, + "learning_rate": 2.5260984207590015e-05, + "loss": 0.3695, + "step": 974 + }, + { + "epoch": 3.2931143398610234, + "grad_norm": 0.11314868048581533, + "learning_rate": 2.517332130948802e-05, + "loss": 0.3597, + "step": 975 + }, + { + "epoch": 3.2964834702042536, + "grad_norm": 0.10483488263899578, + "learning_rate": 2.5085740888787662e-05, + "loss": 0.3583, + "step": 976 + }, + { + "epoch": 3.2998526005474837, + "grad_norm": 0.10912778564330813, + "learning_rate": 2.4998243432678644e-05, + "loss": 0.3601, + "step": 977 + }, + { + "epoch": 3.303221730890714, + "grad_norm": 0.11466754101476578, + "learning_rate": 2.4910829427889205e-05, + "loss": 0.3643, + "step": 978 + }, + { + "epoch": 3.306590861233944, + "grad_norm": 0.10733537636590312, + "learning_rate": 2.4823499360683333e-05, + "loss": 0.3651, + "step": 979 + }, + { + "epoch": 3.3099599915771742, + "grad_norm": 0.1161393261879057, + "learning_rate": 2.473625371685806e-05, + "loss": 0.3599, + "step": 980 + }, + { + "epoch": 3.3133291219204044, + "grad_norm": 0.0982571093572832, + "learning_rate": 2.464909298174088e-05, + "loss": 0.3526, + "step": 981 + }, + { + "epoch": 3.3166982522636346, + "grad_norm": 0.1100159657444912, + "learning_rate": 2.4562017640186847e-05, + "loss": 0.3626, + "step": 982 + }, + { + "epoch": 3.3200673826068647, + "grad_norm": 0.09926349760672294, + "learning_rate": 2.4475028176576102e-05, + "loss": 0.3677, + "step": 983 + }, + { + "epoch": 3.323436512950095, + "grad_norm": 0.12050759797842048, + "learning_rate": 2.4388125074810986e-05, + "loss": 0.359, + "step": 984 + }, + { + "epoch": 3.326805643293325, + "grad_norm": 0.09987805749588798, + "learning_rate": 2.430130881831345e-05, + "loss": 0.3618, + "step": 985 + }, + { + "epoch": 3.3301747736365552, + "grad_norm": 0.1091783241310202, + "learning_rate": 2.4214579890022373e-05, + "loss": 0.3696, + "step": 986 + }, + { + "epoch": 3.3335439039797854, + "grad_norm": 0.10898707191962656, + "learning_rate": 2.41279387723908e-05, + "loss": 0.3638, + "step": 987 + }, + { + "epoch": 3.3369130343230156, + "grad_norm": 0.10558034784682291, + "learning_rate": 2.404138594738335e-05, + "loss": 0.357, + "step": 988 + }, + { + "epoch": 3.3402821646662453, + "grad_norm": 0.10689449489731055, + "learning_rate": 2.395492189647347e-05, + "loss": 0.3594, + "step": 989 + }, + { + "epoch": 3.343651295009476, + "grad_norm": 0.11118497131539316, + "learning_rate": 2.386854710064075e-05, + "loss": 0.3542, + "step": 990 + }, + { + "epoch": 3.3470204253527056, + "grad_norm": 0.10782085280238568, + "learning_rate": 2.3782262040368344e-05, + "loss": 0.3608, + "step": 991 + }, + { + "epoch": 3.350389555695936, + "grad_norm": 0.10697566924440428, + "learning_rate": 2.369606719564015e-05, + "loss": 0.3551, + "step": 992 + }, + { + "epoch": 3.353758686039166, + "grad_norm": 0.09605638199170409, + "learning_rate": 2.3609963045938288e-05, + "loss": 0.3618, + "step": 993 + }, + { + "epoch": 3.357127816382396, + "grad_norm": 0.10827169360976367, + "learning_rate": 2.35239500702403e-05, + "loss": 0.3565, + "step": 994 + }, + { + "epoch": 3.3604969467256263, + "grad_norm": 0.10198375263244171, + "learning_rate": 2.3438028747016586e-05, + "loss": 0.3626, + "step": 995 + }, + { + "epoch": 3.3638660770688564, + "grad_norm": 0.1159958447674676, + "learning_rate": 2.3352199554227698e-05, + "loss": 0.3629, + "step": 996 + }, + { + "epoch": 3.3672352074120866, + "grad_norm": 0.10457139377595129, + "learning_rate": 2.326646296932168e-05, + "loss": 0.3638, + "step": 997 + }, + { + "epoch": 3.3706043377553168, + "grad_norm": 0.10333006497152411, + "learning_rate": 2.318081946923144e-05, + "loss": 0.3612, + "step": 998 + }, + { + "epoch": 3.373973468098547, + "grad_norm": 0.10461115888151253, + "learning_rate": 2.3095269530372032e-05, + "loss": 0.362, + "step": 999 + }, + { + "epoch": 3.377342598441777, + "grad_norm": 0.10087292499347122, + "learning_rate": 2.3009813628638085e-05, + "loss": 0.3603, + "step": 1000 + }, + { + "epoch": 3.3807117287850073, + "grad_norm": 0.09894098741998586, + "learning_rate": 2.2924452239401153e-05, + "loss": 0.3635, + "step": 1001 + }, + { + "epoch": 3.3840808591282374, + "grad_norm": 0.10636129988239897, + "learning_rate": 2.283918583750695e-05, + "loss": 0.3589, + "step": 1002 + }, + { + "epoch": 3.3874499894714676, + "grad_norm": 0.1087735124770059, + "learning_rate": 2.2754014897272868e-05, + "loss": 0.3603, + "step": 1003 + }, + { + "epoch": 3.3908191198146977, + "grad_norm": 0.1045786633320159, + "learning_rate": 2.266893989248527e-05, + "loss": 0.3634, + "step": 1004 + }, + { + "epoch": 3.394188250157928, + "grad_norm": 0.10630134016191294, + "learning_rate": 2.258396129639679e-05, + "loss": 0.3626, + "step": 1005 + }, + { + "epoch": 3.397557380501158, + "grad_norm": 0.10814614823364664, + "learning_rate": 2.2499079581723846e-05, + "loss": 0.3682, + "step": 1006 + }, + { + "epoch": 3.4009265108443882, + "grad_norm": 0.10249665362012134, + "learning_rate": 2.2414295220643822e-05, + "loss": 0.361, + "step": 1007 + }, + { + "epoch": 3.4042956411876184, + "grad_norm": 0.10378027402659071, + "learning_rate": 2.2329608684792676e-05, + "loss": 0.3606, + "step": 1008 + }, + { + "epoch": 3.4076647715308486, + "grad_norm": 0.10027376191210695, + "learning_rate": 2.22450204452621e-05, + "loss": 0.3608, + "step": 1009 + }, + { + "epoch": 3.4110339018740787, + "grad_norm": 0.10689722485945972, + "learning_rate": 2.216053097259697e-05, + "loss": 0.3706, + "step": 1010 + }, + { + "epoch": 3.414403032217309, + "grad_norm": 0.10357939152860053, + "learning_rate": 2.2076140736792805e-05, + "loss": 0.3623, + "step": 1011 + }, + { + "epoch": 3.417772162560539, + "grad_norm": 0.0902315706129379, + "learning_rate": 2.1991850207293064e-05, + "loss": 0.3596, + "step": 1012 + }, + { + "epoch": 3.4211412929037692, + "grad_norm": 0.10842563552035595, + "learning_rate": 2.1907659852986588e-05, + "loss": 0.3637, + "step": 1013 + }, + { + "epoch": 3.4245104232469994, + "grad_norm": 0.09666903812158173, + "learning_rate": 2.1823570142204902e-05, + "loss": 0.3624, + "step": 1014 + }, + { + "epoch": 3.4278795535902296, + "grad_norm": 0.100083090000888, + "learning_rate": 2.1739581542719748e-05, + "loss": 0.3624, + "step": 1015 + }, + { + "epoch": 3.4312486839334597, + "grad_norm": 0.10755809720758686, + "learning_rate": 2.1655694521740376e-05, + "loss": 0.3624, + "step": 1016 + }, + { + "epoch": 3.43461781427669, + "grad_norm": 0.1024231010803628, + "learning_rate": 2.1571909545910953e-05, + "loss": 0.3621, + "step": 1017 + }, + { + "epoch": 3.43798694461992, + "grad_norm": 0.10562299735859218, + "learning_rate": 2.1488227081308054e-05, + "loss": 0.3626, + "step": 1018 + }, + { + "epoch": 3.44135607496315, + "grad_norm": 0.0993759886031881, + "learning_rate": 2.140464759343794e-05, + "loss": 0.3654, + "step": 1019 + }, + { + "epoch": 3.4447252053063804, + "grad_norm": 0.09933521966725083, + "learning_rate": 2.132117154723408e-05, + "loss": 0.356, + "step": 1020 + }, + { + "epoch": 3.4480943356496105, + "grad_norm": 0.09953034686873165, + "learning_rate": 2.123779940705453e-05, + "loss": 0.366, + "step": 1021 + }, + { + "epoch": 3.4514634659928407, + "grad_norm": 0.10175075266526791, + "learning_rate": 2.115453163667929e-05, + "loss": 0.3583, + "step": 1022 + }, + { + "epoch": 3.454832596336071, + "grad_norm": 0.09594990983302608, + "learning_rate": 2.1071368699307818e-05, + "loss": 0.3584, + "step": 1023 + }, + { + "epoch": 3.458201726679301, + "grad_norm": 0.10219150476255269, + "learning_rate": 2.0988311057556397e-05, + "loss": 0.3597, + "step": 1024 + }, + { + "epoch": 3.461570857022531, + "grad_norm": 0.09691112693809913, + "learning_rate": 2.0905359173455593e-05, + "loss": 0.3621, + "step": 1025 + }, + { + "epoch": 3.4649399873657614, + "grad_norm": 0.09661009238935536, + "learning_rate": 2.0822513508447608e-05, + "loss": 0.3567, + "step": 1026 + }, + { + "epoch": 3.4683091177089915, + "grad_norm": 0.09590582546596066, + "learning_rate": 2.073977452338384e-05, + "loss": 0.3646, + "step": 1027 + }, + { + "epoch": 3.4716782480522217, + "grad_norm": 0.09606588648905236, + "learning_rate": 2.065714267852223e-05, + "loss": 0.3641, + "step": 1028 + }, + { + "epoch": 3.475047378395452, + "grad_norm": 0.10295819559523817, + "learning_rate": 2.057461843352469e-05, + "loss": 0.3557, + "step": 1029 + }, + { + "epoch": 3.4784165087386816, + "grad_norm": 0.09150758299366415, + "learning_rate": 2.049220224745463e-05, + "loss": 0.3636, + "step": 1030 + }, + { + "epoch": 3.481785639081912, + "grad_norm": 0.10198222794968945, + "learning_rate": 2.0409894578774302e-05, + "loss": 0.3642, + "step": 1031 + }, + { + "epoch": 3.485154769425142, + "grad_norm": 0.09986839616807734, + "learning_rate": 2.032769588534233e-05, + "loss": 0.3673, + "step": 1032 + }, + { + "epoch": 3.4885238997683725, + "grad_norm": 0.09939024454656914, + "learning_rate": 2.0245606624411165e-05, + "loss": 0.3591, + "step": 1033 + }, + { + "epoch": 3.4918930301116022, + "grad_norm": 0.09144469769761462, + "learning_rate": 2.0163627252624427e-05, + "loss": 0.3683, + "step": 1034 + }, + { + "epoch": 3.495262160454833, + "grad_norm": 0.09038126728850328, + "learning_rate": 2.0081758226014516e-05, + "loss": 0.3585, + "step": 1035 + }, + { + "epoch": 3.4986312907980626, + "grad_norm": 0.09791848188862595, + "learning_rate": 2.0000000000000012e-05, + "loss": 0.3633, + "step": 1036 + }, + { + "epoch": 3.502000421141293, + "grad_norm": 0.09451335114654308, + "learning_rate": 1.9918353029383065e-05, + "loss": 0.3563, + "step": 1037 + }, + { + "epoch": 3.505369551484523, + "grad_norm": 0.09766694825632033, + "learning_rate": 1.9836817768347015e-05, + "loss": 0.3634, + "step": 1038 + }, + { + "epoch": 3.508738681827753, + "grad_norm": 0.09523718569743331, + "learning_rate": 1.9755394670453745e-05, + "loss": 0.364, + "step": 1039 + }, + { + "epoch": 3.5121078121709832, + "grad_norm": 0.09716008155147098, + "learning_rate": 1.9674084188641235e-05, + "loss": 0.3614, + "step": 1040 + }, + { + "epoch": 3.5154769425142134, + "grad_norm": 0.09307186314834033, + "learning_rate": 1.9592886775220957e-05, + "loss": 0.3663, + "step": 1041 + }, + { + "epoch": 3.5188460728574436, + "grad_norm": 0.0966569916505279, + "learning_rate": 1.9511802881875438e-05, + "loss": 0.3628, + "step": 1042 + }, + { + "epoch": 3.5222152032006737, + "grad_norm": 0.09953663178152124, + "learning_rate": 1.943083295965572e-05, + "loss": 0.3653, + "step": 1043 + }, + { + "epoch": 3.525584333543904, + "grad_norm": 0.09000177069317349, + "learning_rate": 1.9349977458978846e-05, + "loss": 0.357, + "step": 1044 + }, + { + "epoch": 3.528953463887134, + "grad_norm": 0.09693735378111683, + "learning_rate": 1.9269236829625387e-05, + "loss": 0.3623, + "step": 1045 + }, + { + "epoch": 3.532322594230364, + "grad_norm": 0.1010678964013295, + "learning_rate": 1.9188611520736846e-05, + "loss": 0.3631, + "step": 1046 + }, + { + "epoch": 3.5356917245735944, + "grad_norm": 0.08709082546898574, + "learning_rate": 1.9108101980813277e-05, + "loss": 0.3559, + "step": 1047 + }, + { + "epoch": 3.5390608549168245, + "grad_norm": 0.09973595422763583, + "learning_rate": 1.902770865771074e-05, + "loss": 0.3572, + "step": 1048 + }, + { + "epoch": 3.5424299852600547, + "grad_norm": 0.0932062947908472, + "learning_rate": 1.8947431998638762e-05, + "loss": 0.3703, + "step": 1049 + }, + { + "epoch": 3.545799115603285, + "grad_norm": 0.0927283626151012, + "learning_rate": 1.886727245015794e-05, + "loss": 0.3604, + "step": 1050 + }, + { + "epoch": 3.549168245946515, + "grad_norm": 0.0899928028286008, + "learning_rate": 1.8787230458177408e-05, + "loss": 0.3596, + "step": 1051 + }, + { + "epoch": 3.552537376289745, + "grad_norm": 0.09291563797483152, + "learning_rate": 1.8707306467952323e-05, + "loss": 0.3602, + "step": 1052 + }, + { + "epoch": 3.5559065066329754, + "grad_norm": 0.0873067226862915, + "learning_rate": 1.862750092408147e-05, + "loss": 0.3632, + "step": 1053 + }, + { + "epoch": 3.5592756369762055, + "grad_norm": 0.09201291502685034, + "learning_rate": 1.8547814270504705e-05, + "loss": 0.3665, + "step": 1054 + }, + { + "epoch": 3.5626447673194357, + "grad_norm": 0.08672862490756593, + "learning_rate": 1.8468246950500556e-05, + "loss": 0.3595, + "step": 1055 + }, + { + "epoch": 3.566013897662666, + "grad_norm": 0.08745897822977576, + "learning_rate": 1.838879940668373e-05, + "loss": 0.3605, + "step": 1056 + }, + { + "epoch": 3.569383028005896, + "grad_norm": 0.09233748547600358, + "learning_rate": 1.83094720810026e-05, + "loss": 0.36, + "step": 1057 + }, + { + "epoch": 3.572752158349126, + "grad_norm": 0.09364157413857832, + "learning_rate": 1.823026541473684e-05, + "loss": 0.3642, + "step": 1058 + }, + { + "epoch": 3.5761212886923563, + "grad_norm": 0.0919710962105762, + "learning_rate": 1.8151179848494905e-05, + "loss": 0.3629, + "step": 1059 + }, + { + "epoch": 3.5794904190355865, + "grad_norm": 0.09562060747735483, + "learning_rate": 1.8072215822211613e-05, + "loss": 0.3623, + "step": 1060 + }, + { + "epoch": 3.5828595493788167, + "grad_norm": 0.08937202924753714, + "learning_rate": 1.7993373775145663e-05, + "loss": 0.3608, + "step": 1061 + }, + { + "epoch": 3.586228679722047, + "grad_norm": 0.09294261879238143, + "learning_rate": 1.7914654145877187e-05, + "loss": 0.3605, + "step": 1062 + }, + { + "epoch": 3.589597810065277, + "grad_norm": 0.08234694139654683, + "learning_rate": 1.7836057372305423e-05, + "loss": 0.3628, + "step": 1063 + }, + { + "epoch": 3.592966940408507, + "grad_norm": 0.09244388782945556, + "learning_rate": 1.77575838916461e-05, + "loss": 0.3584, + "step": 1064 + }, + { + "epoch": 3.5963360707517373, + "grad_norm": 0.0993638664940156, + "learning_rate": 1.767923414042915e-05, + "loss": 0.3614, + "step": 1065 + }, + { + "epoch": 3.5997052010949675, + "grad_norm": 0.08789579064875053, + "learning_rate": 1.760100855449619e-05, + "loss": 0.3603, + "step": 1066 + }, + { + "epoch": 3.6030743314381977, + "grad_norm": 0.10411154322718334, + "learning_rate": 1.752290756899816e-05, + "loss": 0.3624, + "step": 1067 + }, + { + "epoch": 3.606443461781428, + "grad_norm": 0.08047024019249209, + "learning_rate": 1.7444931618392894e-05, + "loss": 0.3585, + "step": 1068 + }, + { + "epoch": 3.6098125921246575, + "grad_norm": 0.10614805370086325, + "learning_rate": 1.736708113644262e-05, + "loss": 0.363, + "step": 1069 + }, + { + "epoch": 3.613181722467888, + "grad_norm": 0.08611033060017922, + "learning_rate": 1.7289356556211687e-05, + "loss": 0.3637, + "step": 1070 + }, + { + "epoch": 3.616550852811118, + "grad_norm": 0.0873884543774505, + "learning_rate": 1.7211758310064042e-05, + "loss": 0.3578, + "step": 1071 + }, + { + "epoch": 3.6199199831543485, + "grad_norm": 0.09185990558523179, + "learning_rate": 1.7134286829660855e-05, + "loss": 0.3677, + "step": 1072 + }, + { + "epoch": 3.623289113497578, + "grad_norm": 0.08742862884585491, + "learning_rate": 1.7056942545958167e-05, + "loss": 0.3657, + "step": 1073 + }, + { + "epoch": 3.626658243840809, + "grad_norm": 0.08793012887671753, + "learning_rate": 1.697972588920439e-05, + "loss": 0.3655, + "step": 1074 + }, + { + "epoch": 3.6300273741840385, + "grad_norm": 0.09014218865360733, + "learning_rate": 1.6902637288938074e-05, + "loss": 0.364, + "step": 1075 + }, + { + "epoch": 3.633396504527269, + "grad_norm": 0.08892601725042051, + "learning_rate": 1.6825677173985332e-05, + "loss": 0.3665, + "step": 1076 + }, + { + "epoch": 3.636765634870499, + "grad_norm": 0.0878924041737089, + "learning_rate": 1.6748845972457562e-05, + "loss": 0.3563, + "step": 1077 + }, + { + "epoch": 3.6401347652137295, + "grad_norm": 0.09417513459021953, + "learning_rate": 1.6672144111749066e-05, + "loss": 0.3657, + "step": 1078 + }, + { + "epoch": 3.643503895556959, + "grad_norm": 0.09041822306873473, + "learning_rate": 1.659557201853465e-05, + "loss": 0.3687, + "step": 1079 + }, + { + "epoch": 3.6468730259001894, + "grad_norm": 0.08690354592106783, + "learning_rate": 1.6519130118767258e-05, + "loss": 0.3601, + "step": 1080 + }, + { + "epoch": 3.6502421562434195, + "grad_norm": 0.08875815871506505, + "learning_rate": 1.6442818837675578e-05, + "loss": 0.3602, + "step": 1081 + }, + { + "epoch": 3.6536112865866497, + "grad_norm": 0.08649489906143072, + "learning_rate": 1.6366638599761676e-05, + "loss": 0.362, + "step": 1082 + }, + { + "epoch": 3.65698041692988, + "grad_norm": 0.08572937873824316, + "learning_rate": 1.6290589828798736e-05, + "loss": 0.3614, + "step": 1083 + }, + { + "epoch": 3.66034954727311, + "grad_norm": 0.09798247509524252, + "learning_rate": 1.621467294782854e-05, + "loss": 0.3608, + "step": 1084 + }, + { + "epoch": 3.66371867761634, + "grad_norm": 0.08586977784228111, + "learning_rate": 1.6138888379159238e-05, + "loss": 0.3602, + "step": 1085 + }, + { + "epoch": 3.6670878079595703, + "grad_norm": 0.09119393564711122, + "learning_rate": 1.606323654436293e-05, + "loss": 0.3641, + "step": 1086 + }, + { + "epoch": 3.6704569383028005, + "grad_norm": 0.09035766592284558, + "learning_rate": 1.5987717864273377e-05, + "loss": 0.366, + "step": 1087 + }, + { + "epoch": 3.6738260686460307, + "grad_norm": 0.08837128914166983, + "learning_rate": 1.591233275898363e-05, + "loss": 0.3621, + "step": 1088 + }, + { + "epoch": 3.677195198989261, + "grad_norm": 0.09501671591225473, + "learning_rate": 1.5837081647843652e-05, + "loss": 0.3655, + "step": 1089 + }, + { + "epoch": 3.680564329332491, + "grad_norm": 0.08827549191913663, + "learning_rate": 1.5761964949458076e-05, + "loss": 0.3664, + "step": 1090 + }, + { + "epoch": 3.683933459675721, + "grad_norm": 0.08653211178416792, + "learning_rate": 1.5686983081683816e-05, + "loss": 0.3613, + "step": 1091 + }, + { + "epoch": 3.6873025900189513, + "grad_norm": 0.09007377723059057, + "learning_rate": 1.5612136461627726e-05, + "loss": 0.3605, + "step": 1092 + }, + { + "epoch": 3.6906717203621815, + "grad_norm": 0.08875691955803114, + "learning_rate": 1.5537425505644358e-05, + "loss": 0.3692, + "step": 1093 + }, + { + "epoch": 3.6940408507054117, + "grad_norm": 0.0865487137501953, + "learning_rate": 1.546285062933352e-05, + "loss": 0.3637, + "step": 1094 + }, + { + "epoch": 3.697409981048642, + "grad_norm": 0.08441316995604657, + "learning_rate": 1.5388412247538148e-05, + "loss": 0.3566, + "step": 1095 + }, + { + "epoch": 3.700779111391872, + "grad_norm": 0.08795109300895403, + "learning_rate": 1.5314110774341803e-05, + "loss": 0.3649, + "step": 1096 + }, + { + "epoch": 3.704148241735102, + "grad_norm": 0.08660466649366423, + "learning_rate": 1.5239946623066466e-05, + "loss": 0.3656, + "step": 1097 + }, + { + "epoch": 3.7075173720783323, + "grad_norm": 0.08962594968540999, + "learning_rate": 1.5165920206270257e-05, + "loss": 0.3578, + "step": 1098 + }, + { + "epoch": 3.7108865024215625, + "grad_norm": 0.08885368968596062, + "learning_rate": 1.5092031935745102e-05, + "loss": 0.362, + "step": 1099 + }, + { + "epoch": 3.7142556327647926, + "grad_norm": 0.0928847110975983, + "learning_rate": 1.5018282222514451e-05, + "loss": 0.3673, + "step": 1100 + }, + { + "epoch": 3.717624763108023, + "grad_norm": 0.08764355173587007, + "learning_rate": 1.4944671476830967e-05, + "loss": 0.3559, + "step": 1101 + }, + { + "epoch": 3.720993893451253, + "grad_norm": 0.08858832328866184, + "learning_rate": 1.4871200108174306e-05, + "loss": 0.3621, + "step": 1102 + }, + { + "epoch": 3.724363023794483, + "grad_norm": 0.08763675374855749, + "learning_rate": 1.479786852524879e-05, + "loss": 0.3588, + "step": 1103 + }, + { + "epoch": 3.7277321541377133, + "grad_norm": 0.08909210540103846, + "learning_rate": 1.4724677135981118e-05, + "loss": 0.3625, + "step": 1104 + }, + { + "epoch": 3.7311012844809435, + "grad_norm": 0.08178768871920535, + "learning_rate": 1.4651626347518169e-05, + "loss": 0.3621, + "step": 1105 + }, + { + "epoch": 3.7344704148241736, + "grad_norm": 0.09156080753526148, + "learning_rate": 1.457871656622463e-05, + "loss": 0.359, + "step": 1106 + }, + { + "epoch": 3.737839545167404, + "grad_norm": 0.08325843071720376, + "learning_rate": 1.4505948197680892e-05, + "loss": 0.3607, + "step": 1107 + }, + { + "epoch": 3.741208675510634, + "grad_norm": 0.08448426244553969, + "learning_rate": 1.4433321646680614e-05, + "loss": 0.3648, + "step": 1108 + }, + { + "epoch": 3.744577805853864, + "grad_norm": 0.08086940191673836, + "learning_rate": 1.4360837317228571e-05, + "loss": 0.3588, + "step": 1109 + }, + { + "epoch": 3.747946936197094, + "grad_norm": 0.08083156500148386, + "learning_rate": 1.4288495612538427e-05, + "loss": 0.3571, + "step": 1110 + }, + { + "epoch": 3.7513160665403245, + "grad_norm": 0.08098967397738577, + "learning_rate": 1.4216296935030433e-05, + "loss": 0.3661, + "step": 1111 + }, + { + "epoch": 3.754685196883554, + "grad_norm": 0.08420676698532144, + "learning_rate": 1.4144241686329236e-05, + "loss": 0.3667, + "step": 1112 + }, + { + "epoch": 3.758054327226785, + "grad_norm": 0.08205757559067017, + "learning_rate": 1.4072330267261585e-05, + "loss": 0.3538, + "step": 1113 + }, + { + "epoch": 3.7614234575700145, + "grad_norm": 0.08130986735808181, + "learning_rate": 1.400056307785413e-05, + "loss": 0.358, + "step": 1114 + }, + { + "epoch": 3.764792587913245, + "grad_norm": 0.08513069994957134, + "learning_rate": 1.3928940517331282e-05, + "loss": 0.363, + "step": 1115 + }, + { + "epoch": 3.768161718256475, + "grad_norm": 0.08636038178242018, + "learning_rate": 1.3857462984112831e-05, + "loss": 0.3625, + "step": 1116 + }, + { + "epoch": 3.7715308485997054, + "grad_norm": 0.08228222146136721, + "learning_rate": 1.3786130875811864e-05, + "loss": 0.3643, + "step": 1117 + }, + { + "epoch": 3.774899978942935, + "grad_norm": 0.08245367651432615, + "learning_rate": 1.371494458923246e-05, + "loss": 0.3611, + "step": 1118 + }, + { + "epoch": 3.7782691092861658, + "grad_norm": 0.08984002063694464, + "learning_rate": 1.3643904520367568e-05, + "loss": 0.3665, + "step": 1119 + }, + { + "epoch": 3.7816382396293955, + "grad_norm": 0.08004513922265995, + "learning_rate": 1.3573011064396751e-05, + "loss": 0.3626, + "step": 1120 + }, + { + "epoch": 3.7850073699726257, + "grad_norm": 0.08501294133045856, + "learning_rate": 1.3502264615683966e-05, + "loss": 0.3584, + "step": 1121 + }, + { + "epoch": 3.788376500315856, + "grad_norm": 0.08838080335200914, + "learning_rate": 1.3431665567775439e-05, + "loss": 0.3584, + "step": 1122 + }, + { + "epoch": 3.791745630659086, + "grad_norm": 0.08330553315133392, + "learning_rate": 1.3361214313397444e-05, + "loss": 0.36, + "step": 1123 + }, + { + "epoch": 3.795114761002316, + "grad_norm": 0.08670146765162016, + "learning_rate": 1.3290911244454066e-05, + "loss": 0.3661, + "step": 1124 + }, + { + "epoch": 3.7984838913455463, + "grad_norm": 0.0841408453670069, + "learning_rate": 1.3220756752025126e-05, + "loss": 0.363, + "step": 1125 + }, + { + "epoch": 3.8018530216887765, + "grad_norm": 0.08384047682221397, + "learning_rate": 1.3150751226363886e-05, + "loss": 0.3622, + "step": 1126 + }, + { + "epoch": 3.8052221520320066, + "grad_norm": 0.08347244270329462, + "learning_rate": 1.3080895056895022e-05, + "loss": 0.3618, + "step": 1127 + }, + { + "epoch": 3.808591282375237, + "grad_norm": 0.0851964538331852, + "learning_rate": 1.3011188632212307e-05, + "loss": 0.3639, + "step": 1128 + }, + { + "epoch": 3.811960412718467, + "grad_norm": 0.08389988414632749, + "learning_rate": 1.2941632340076531e-05, + "loss": 0.3656, + "step": 1129 + }, + { + "epoch": 3.815329543061697, + "grad_norm": 0.0818943745196087, + "learning_rate": 1.2872226567413346e-05, + "loss": 0.3595, + "step": 1130 + }, + { + "epoch": 3.8186986734049273, + "grad_norm": 0.07744154226291297, + "learning_rate": 1.2802971700311103e-05, + "loss": 0.3595, + "step": 1131 + }, + { + "epoch": 3.8220678037481575, + "grad_norm": 0.08550107649728135, + "learning_rate": 1.2733868124018694e-05, + "loss": 0.3614, + "step": 1132 + }, + { + "epoch": 3.8254369340913876, + "grad_norm": 0.07860069500089853, + "learning_rate": 1.2664916222943392e-05, + "loss": 0.3552, + "step": 1133 + }, + { + "epoch": 3.828806064434618, + "grad_norm": 0.08126878361185912, + "learning_rate": 1.2596116380648761e-05, + "loss": 0.3622, + "step": 1134 + }, + { + "epoch": 3.832175194777848, + "grad_norm": 0.08610190267035886, + "learning_rate": 1.2527468979852513e-05, + "loss": 0.3645, + "step": 1135 + }, + { + "epoch": 3.835544325121078, + "grad_norm": 0.0815351952289208, + "learning_rate": 1.2458974402424312e-05, + "loss": 0.36, + "step": 1136 + }, + { + "epoch": 3.8389134554643083, + "grad_norm": 0.08556542224799225, + "learning_rate": 1.239063302938376e-05, + "loss": 0.3581, + "step": 1137 + }, + { + "epoch": 3.8422825858075385, + "grad_norm": 0.0863667479775735, + "learning_rate": 1.2322445240898158e-05, + "loss": 0.3592, + "step": 1138 + }, + { + "epoch": 3.8456517161507686, + "grad_norm": 0.09012131069922455, + "learning_rate": 1.2254411416280494e-05, + "loss": 0.3608, + "step": 1139 + }, + { + "epoch": 3.849020846493999, + "grad_norm": 0.0813110318115451, + "learning_rate": 1.2186531933987294e-05, + "loss": 0.3617, + "step": 1140 + }, + { + "epoch": 3.852389976837229, + "grad_norm": 0.08621918893656133, + "learning_rate": 1.2118807171616469e-05, + "loss": 0.3632, + "step": 1141 + }, + { + "epoch": 3.855759107180459, + "grad_norm": 0.08876046697132542, + "learning_rate": 1.2051237505905302e-05, + "loss": 0.363, + "step": 1142 + }, + { + "epoch": 3.8591282375236893, + "grad_norm": 0.08486782812443205, + "learning_rate": 1.1983823312728306e-05, + "loss": 0.3681, + "step": 1143 + }, + { + "epoch": 3.8624973678669194, + "grad_norm": 0.08182983032740812, + "learning_rate": 1.19165649670951e-05, + "loss": 0.3635, + "step": 1144 + }, + { + "epoch": 3.8658664982101496, + "grad_norm": 0.07583529894267067, + "learning_rate": 1.1849462843148398e-05, + "loss": 0.3633, + "step": 1145 + }, + { + "epoch": 3.8692356285533798, + "grad_norm": 0.09126795810440728, + "learning_rate": 1.1782517314161872e-05, + "loss": 0.3584, + "step": 1146 + }, + { + "epoch": 3.87260475889661, + "grad_norm": 0.0811651957931282, + "learning_rate": 1.1715728752538103e-05, + "loss": 0.3617, + "step": 1147 + }, + { + "epoch": 3.87597388923984, + "grad_norm": 0.0843774615335534, + "learning_rate": 1.164909752980648e-05, + "loss": 0.3644, + "step": 1148 + }, + { + "epoch": 3.8793430195830703, + "grad_norm": 0.08099134098178276, + "learning_rate": 1.1582624016621154e-05, + "loss": 0.3595, + "step": 1149 + }, + { + "epoch": 3.8827121499263004, + "grad_norm": 0.07907946240714192, + "learning_rate": 1.1516308582758983e-05, + "loss": 0.3614, + "step": 1150 + }, + { + "epoch": 3.8860812802695306, + "grad_norm": 0.08423868561227663, + "learning_rate": 1.1450151597117479e-05, + "loss": 0.3613, + "step": 1151 + }, + { + "epoch": 3.8894504106127608, + "grad_norm": 0.08033686574245383, + "learning_rate": 1.1384153427712729e-05, + "loss": 0.3642, + "step": 1152 + }, + { + "epoch": 3.8928195409559905, + "grad_norm": 0.07677407670189697, + "learning_rate": 1.1318314441677348e-05, + "loss": 0.3569, + "step": 1153 + }, + { + "epoch": 3.896188671299221, + "grad_norm": 0.07906769729135289, + "learning_rate": 1.1252635005258466e-05, + "loss": 0.3595, + "step": 1154 + }, + { + "epoch": 3.899557801642451, + "grad_norm": 0.08225694582677316, + "learning_rate": 1.1187115483815693e-05, + "loss": 0.3644, + "step": 1155 + }, + { + "epoch": 3.9029269319856814, + "grad_norm": 0.08435086211540141, + "learning_rate": 1.1121756241819023e-05, + "loss": 0.3629, + "step": 1156 + }, + { + "epoch": 3.906296062328911, + "grad_norm": 0.0779208137414844, + "learning_rate": 1.105655764284689e-05, + "loss": 0.3594, + "step": 1157 + }, + { + "epoch": 3.9096651926721417, + "grad_norm": 0.07917404294021134, + "learning_rate": 1.0991520049584112e-05, + "loss": 0.3649, + "step": 1158 + }, + { + "epoch": 3.9130343230153715, + "grad_norm": 0.07819778959894405, + "learning_rate": 1.0926643823819827e-05, + "loss": 0.3643, + "step": 1159 + }, + { + "epoch": 3.916403453358602, + "grad_norm": 0.0822711836526933, + "learning_rate": 1.0861929326445572e-05, + "loss": 0.3627, + "step": 1160 + }, + { + "epoch": 3.919772583701832, + "grad_norm": 0.07971405947853387, + "learning_rate": 1.0797376917453187e-05, + "loss": 0.3599, + "step": 1161 + }, + { + "epoch": 3.923141714045062, + "grad_norm": 0.08341374870417605, + "learning_rate": 1.0732986955932869e-05, + "loss": 0.3555, + "step": 1162 + }, + { + "epoch": 3.926510844388292, + "grad_norm": 0.07752760209485876, + "learning_rate": 1.0668759800071174e-05, + "loss": 0.3591, + "step": 1163 + }, + { + "epoch": 3.9298799747315223, + "grad_norm": 0.0795643390039002, + "learning_rate": 1.0604695807148971e-05, + "loss": 0.3568, + "step": 1164 + }, + { + "epoch": 3.9332491050747524, + "grad_norm": 0.07806803915038091, + "learning_rate": 1.0540795333539515e-05, + "loss": 0.3629, + "step": 1165 + }, + { + "epoch": 3.9366182354179826, + "grad_norm": 0.07851161347206628, + "learning_rate": 1.0477058734706436e-05, + "loss": 0.3611, + "step": 1166 + }, + { + "epoch": 3.939987365761213, + "grad_norm": 0.07762641296833782, + "learning_rate": 1.0413486365201785e-05, + "loss": 0.3613, + "step": 1167 + }, + { + "epoch": 3.943356496104443, + "grad_norm": 0.08535005147447429, + "learning_rate": 1.0350078578664005e-05, + "loss": 0.3591, + "step": 1168 + }, + { + "epoch": 3.946725626447673, + "grad_norm": 0.07824237520005016, + "learning_rate": 1.0286835727816001e-05, + "loss": 0.363, + "step": 1169 + }, + { + "epoch": 3.9500947567909033, + "grad_norm": 0.0725027774844816, + "learning_rate": 1.0223758164463246e-05, + "loss": 0.361, + "step": 1170 + }, + { + "epoch": 3.9534638871341334, + "grad_norm": 0.08250211916215387, + "learning_rate": 1.0160846239491673e-05, + "loss": 0.3706, + "step": 1171 + }, + { + "epoch": 3.9568330174773636, + "grad_norm": 0.07768057857668437, + "learning_rate": 1.0098100302865865e-05, + "loss": 0.358, + "step": 1172 + }, + { + "epoch": 3.9602021478205938, + "grad_norm": 0.0743357334386284, + "learning_rate": 1.003552070362701e-05, + "loss": 0.3588, + "step": 1173 + }, + { + "epoch": 3.963571278163824, + "grad_norm": 0.08538154828312804, + "learning_rate": 9.973107789891024e-06, + "loss": 0.3687, + "step": 1174 + }, + { + "epoch": 3.966940408507054, + "grad_norm": 0.08474253190258095, + "learning_rate": 9.910861908846598e-06, + "loss": 0.36, + "step": 1175 + }, + { + "epoch": 3.9703095388502843, + "grad_norm": 0.07698260800417392, + "learning_rate": 9.848783406753224e-06, + "loss": 0.3655, + "step": 1176 + }, + { + "epoch": 3.9736786691935144, + "grad_norm": 0.07875068992732076, + "learning_rate": 9.786872628939329e-06, + "loss": 0.3605, + "step": 1177 + }, + { + "epoch": 3.9770477995367446, + "grad_norm": 0.08337836249305365, + "learning_rate": 9.725129919800339e-06, + "loss": 0.3653, + "step": 1178 + }, + { + "epoch": 3.9804169298799748, + "grad_norm": 0.0799444611097984, + "learning_rate": 9.66355562279671e-06, + "loss": 0.3604, + "step": 1179 + }, + { + "epoch": 3.983786060223205, + "grad_norm": 0.08618283586928363, + "learning_rate": 9.60215008045211e-06, + "loss": 0.3637, + "step": 1180 + }, + { + "epoch": 3.987155190566435, + "grad_norm": 0.08302579845358256, + "learning_rate": 9.540913634351408e-06, + "loss": 0.3602, + "step": 1181 + }, + { + "epoch": 3.9905243209096652, + "grad_norm": 0.07735294324245658, + "learning_rate": 9.479846625138909e-06, + "loss": 0.3596, + "step": 1182 + }, + { + "epoch": 3.9938934512528954, + "grad_norm": 0.07471734423709958, + "learning_rate": 9.418949392516307e-06, + "loss": 0.3611, + "step": 1183 + }, + { + "epoch": 3.9972625815961256, + "grad_norm": 0.08214012704171592, + "learning_rate": 9.358222275240884e-06, + "loss": 0.3648, + "step": 1184 + }, + { + "epoch": 4.00336913034323, + "grad_norm": 0.11292758122904588, + "learning_rate": 9.297665611123628e-06, + "loss": 0.3527, + "step": 1185 + }, + { + "epoch": 4.00673826068646, + "grad_norm": 0.0941098295127884, + "learning_rate": 9.237279737027326e-06, + "loss": 0.3472, + "step": 1186 + }, + { + "epoch": 4.01010739102969, + "grad_norm": 0.09639154458998347, + "learning_rate": 9.177064988864712e-06, + "loss": 0.3425, + "step": 1187 + }, + { + "epoch": 4.013476521372921, + "grad_norm": 0.09835304863889502, + "learning_rate": 9.117021701596567e-06, + "loss": 0.3446, + "step": 1188 + }, + { + "epoch": 4.01684565171615, + "grad_norm": 0.08987244503280054, + "learning_rate": 9.057150209229845e-06, + "loss": 0.3513, + "step": 1189 + }, + { + "epoch": 4.020214782059381, + "grad_norm": 0.10031177854257561, + "learning_rate": 8.99745084481594e-06, + "loss": 0.3516, + "step": 1190 + }, + { + "epoch": 4.023583912402611, + "grad_norm": 0.10651297976200229, + "learning_rate": 8.937923940448634e-06, + "loss": 0.3489, + "step": 1191 + }, + { + "epoch": 4.026953042745841, + "grad_norm": 0.08656835316363745, + "learning_rate": 8.87856982726243e-06, + "loss": 0.3402, + "step": 1192 + }, + { + "epoch": 4.030322173089071, + "grad_norm": 0.0977560831877126, + "learning_rate": 8.819388835430569e-06, + "loss": 0.348, + "step": 1193 + }, + { + "epoch": 4.033691303432302, + "grad_norm": 0.09746909055035731, + "learning_rate": 8.7603812941633e-06, + "loss": 0.3492, + "step": 1194 + }, + { + "epoch": 4.037060433775531, + "grad_norm": 0.08395050874481182, + "learning_rate": 8.701547531706018e-06, + "loss": 0.3482, + "step": 1195 + }, + { + "epoch": 4.040429564118762, + "grad_norm": 0.09139581639425662, + "learning_rate": 8.642887875337376e-06, + "loss": 0.3509, + "step": 1196 + }, + { + "epoch": 4.043798694461992, + "grad_norm": 0.09015094643326858, + "learning_rate": 8.584402651367556e-06, + "loss": 0.3445, + "step": 1197 + }, + { + "epoch": 4.047167824805222, + "grad_norm": 0.08067803096785321, + "learning_rate": 8.526092185136394e-06, + "loss": 0.345, + "step": 1198 + }, + { + "epoch": 4.050536955148452, + "grad_norm": 0.08630631888609785, + "learning_rate": 8.467956801011618e-06, + "loss": 0.338, + "step": 1199 + }, + { + "epoch": 4.053906085491683, + "grad_norm": 0.08433690244909006, + "learning_rate": 8.409996822386972e-06, + "loss": 0.343, + "step": 1200 + }, + { + "epoch": 4.057275215834912, + "grad_norm": 0.07920044123514752, + "learning_rate": 8.352212571680458e-06, + "loss": 0.3473, + "step": 1201 + }, + { + "epoch": 4.060644346178143, + "grad_norm": 0.07927154455223241, + "learning_rate": 8.294604370332613e-06, + "loss": 0.3482, + "step": 1202 + }, + { + "epoch": 4.064013476521373, + "grad_norm": 0.08109057542606768, + "learning_rate": 8.23717253880457e-06, + "loss": 0.3428, + "step": 1203 + }, + { + "epoch": 4.067382606864603, + "grad_norm": 0.08569342844895425, + "learning_rate": 8.17991739657641e-06, + "loss": 0.3474, + "step": 1204 + }, + { + "epoch": 4.070751737207833, + "grad_norm": 0.08637139957757115, + "learning_rate": 8.122839262145294e-06, + "loss": 0.3467, + "step": 1205 + }, + { + "epoch": 4.074120867551064, + "grad_norm": 0.07781808041765698, + "learning_rate": 8.06593845302376e-06, + "loss": 0.3395, + "step": 1206 + }, + { + "epoch": 4.077489997894293, + "grad_norm": 0.08111376806052889, + "learning_rate": 8.00921528573793e-06, + "loss": 0.3389, + "step": 1207 + }, + { + "epoch": 4.080859128237524, + "grad_norm": 0.08619767447901233, + "learning_rate": 7.952670075825702e-06, + "loss": 0.348, + "step": 1208 + }, + { + "epoch": 4.084228258580754, + "grad_norm": 0.07737321565650793, + "learning_rate": 7.896303137835084e-06, + "loss": 0.3373, + "step": 1209 + }, + { + "epoch": 4.087597388923984, + "grad_norm": 0.07775405743530504, + "learning_rate": 7.840114785322384e-06, + "loss": 0.3443, + "step": 1210 + }, + { + "epoch": 4.090966519267214, + "grad_norm": 0.07816418598625743, + "learning_rate": 7.78410533085046e-06, + "loss": 0.345, + "step": 1211 + }, + { + "epoch": 4.094335649610445, + "grad_norm": 0.08021420493935687, + "learning_rate": 7.728275085987041e-06, + "loss": 0.3445, + "step": 1212 + }, + { + "epoch": 4.097704779953674, + "grad_norm": 0.07501876010838501, + "learning_rate": 7.672624361302894e-06, + "loss": 0.345, + "step": 1213 + }, + { + "epoch": 4.101073910296905, + "grad_norm": 0.07616193917641446, + "learning_rate": 7.6171534663702416e-06, + "loss": 0.3451, + "step": 1214 + }, + { + "epoch": 4.104443040640135, + "grad_norm": 0.08197274858236898, + "learning_rate": 7.5618627097608835e-06, + "loss": 0.3481, + "step": 1215 + }, + { + "epoch": 4.107812170983365, + "grad_norm": 0.07483017111226394, + "learning_rate": 7.50675239904457e-06, + "loss": 0.3454, + "step": 1216 + }, + { + "epoch": 4.111181301326595, + "grad_norm": 0.07441931083866478, + "learning_rate": 7.451822840787279e-06, + "loss": 0.3469, + "step": 1217 + }, + { + "epoch": 4.114550431669826, + "grad_norm": 0.08142190767207858, + "learning_rate": 7.397074340549508e-06, + "loss": 0.3431, + "step": 1218 + }, + { + "epoch": 4.117919562013055, + "grad_norm": 0.07876869644542178, + "learning_rate": 7.342507202884577e-06, + "loss": 0.3462, + "step": 1219 + }, + { + "epoch": 4.121288692356286, + "grad_norm": 0.07845687277699909, + "learning_rate": 7.288121731336901e-06, + "loss": 0.3456, + "step": 1220 + }, + { + "epoch": 4.124657822699516, + "grad_norm": 0.07817574483354851, + "learning_rate": 7.233918228440324e-06, + "loss": 0.3436, + "step": 1221 + }, + { + "epoch": 4.128026953042746, + "grad_norm": 0.07876507958828823, + "learning_rate": 7.1798969957165025e-06, + "loss": 0.3493, + "step": 1222 + }, + { + "epoch": 4.131396083385976, + "grad_norm": 0.07707210638891601, + "learning_rate": 7.126058333673094e-06, + "loss": 0.3402, + "step": 1223 + }, + { + "epoch": 4.134765213729207, + "grad_norm": 0.07947117463971737, + "learning_rate": 7.072402541802197e-06, + "loss": 0.3478, + "step": 1224 + }, + { + "epoch": 4.138134344072436, + "grad_norm": 0.07708906857469865, + "learning_rate": 7.018929918578621e-06, + "loss": 0.3457, + "step": 1225 + }, + { + "epoch": 4.141503474415666, + "grad_norm": 0.08008450821251828, + "learning_rate": 6.965640761458274e-06, + "loss": 0.3414, + "step": 1226 + }, + { + "epoch": 4.144872604758897, + "grad_norm": 0.07732322409168987, + "learning_rate": 6.912535366876483e-06, + "loss": 0.3427, + "step": 1227 + }, + { + "epoch": 4.148241735102126, + "grad_norm": 0.07450575669616548, + "learning_rate": 6.859614030246318e-06, + "loss": 0.3477, + "step": 1228 + }, + { + "epoch": 4.151610865445357, + "grad_norm": 0.08433118593640568, + "learning_rate": 6.806877045957003e-06, + "loss": 0.3425, + "step": 1229 + }, + { + "epoch": 4.154979995788587, + "grad_norm": 0.07513389398253724, + "learning_rate": 6.754324707372264e-06, + "loss": 0.3443, + "step": 1230 + }, + { + "epoch": 4.158349126131817, + "grad_norm": 0.07536890804885507, + "learning_rate": 6.701957306828637e-06, + "loss": 0.3438, + "step": 1231 + }, + { + "epoch": 4.161718256475047, + "grad_norm": 0.07685668754719273, + "learning_rate": 6.649775135633944e-06, + "loss": 0.3401, + "step": 1232 + }, + { + "epoch": 4.165087386818278, + "grad_norm": 0.07956673529792976, + "learning_rate": 6.597778484065571e-06, + "loss": 0.3503, + "step": 1233 + }, + { + "epoch": 4.168456517161507, + "grad_norm": 0.07209527381971025, + "learning_rate": 6.545967641368958e-06, + "loss": 0.3434, + "step": 1234 + }, + { + "epoch": 4.171825647504738, + "grad_norm": 0.07458918014634688, + "learning_rate": 6.494342895755879e-06, + "loss": 0.343, + "step": 1235 + }, + { + "epoch": 4.175194777847968, + "grad_norm": 0.08077306421411162, + "learning_rate": 6.4429045344029136e-06, + "loss": 0.3513, + "step": 1236 + }, + { + "epoch": 4.178563908191198, + "grad_norm": 0.08065308092284855, + "learning_rate": 6.391652843449829e-06, + "loss": 0.3434, + "step": 1237 + }, + { + "epoch": 4.181933038534428, + "grad_norm": 0.0731775502872814, + "learning_rate": 6.340588107997994e-06, + "loss": 0.3443, + "step": 1238 + }, + { + "epoch": 4.185302168877659, + "grad_norm": 0.07546567416391478, + "learning_rate": 6.289710612108786e-06, + "loss": 0.3434, + "step": 1239 + }, + { + "epoch": 4.188671299220888, + "grad_norm": 0.07650397977406549, + "learning_rate": 6.239020638801987e-06, + "loss": 0.3452, + "step": 1240 + }, + { + "epoch": 4.192040429564119, + "grad_norm": 0.07431679145535366, + "learning_rate": 6.18851847005427e-06, + "loss": 0.3484, + "step": 1241 + }, + { + "epoch": 4.195409559907349, + "grad_norm": 0.07416827387620398, + "learning_rate": 6.1382043867975836e-06, + "loss": 0.3452, + "step": 1242 + }, + { + "epoch": 4.198778690250579, + "grad_norm": 0.07754320392922942, + "learning_rate": 6.088078668917572e-06, + "loss": 0.3491, + "step": 1243 + }, + { + "epoch": 4.202147820593809, + "grad_norm": 0.07827458851806732, + "learning_rate": 6.038141595252094e-06, + "loss": 0.3406, + "step": 1244 + }, + { + "epoch": 4.20551695093704, + "grad_norm": 0.0725724426162921, + "learning_rate": 5.9883934435895774e-06, + "loss": 0.3496, + "step": 1245 + }, + { + "epoch": 4.208886081280269, + "grad_norm": 0.0719909369345341, + "learning_rate": 5.9388344906675485e-06, + "loss": 0.3526, + "step": 1246 + }, + { + "epoch": 4.2122552116235, + "grad_norm": 0.07567213228800986, + "learning_rate": 5.889465012171069e-06, + "loss": 0.3468, + "step": 1247 + }, + { + "epoch": 4.21562434196673, + "grad_norm": 0.07098076354440293, + "learning_rate": 5.840285282731173e-06, + "loss": 0.3466, + "step": 1248 + }, + { + "epoch": 4.21899347230996, + "grad_norm": 0.07019771893928237, + "learning_rate": 5.791295575923382e-06, + "loss": 0.3448, + "step": 1249 + }, + { + "epoch": 4.22236260265319, + "grad_norm": 0.07471579252214251, + "learning_rate": 5.742496164266174e-06, + "loss": 0.3491, + "step": 1250 + }, + { + "epoch": 4.225731732996421, + "grad_norm": 0.07236549423445121, + "learning_rate": 5.693887319219422e-06, + "loss": 0.3499, + "step": 1251 + }, + { + "epoch": 4.22910086333965, + "grad_norm": 0.07134479537520134, + "learning_rate": 5.645469311182958e-06, + "loss": 0.3459, + "step": 1252 + }, + { + "epoch": 4.232469993682881, + "grad_norm": 0.07072016749147457, + "learning_rate": 5.597242409495018e-06, + "loss": 0.3438, + "step": 1253 + }, + { + "epoch": 4.235839124026111, + "grad_norm": 0.07179051070856982, + "learning_rate": 5.549206882430773e-06, + "loss": 0.3419, + "step": 1254 + }, + { + "epoch": 4.239208254369341, + "grad_norm": 0.07302770625869862, + "learning_rate": 5.501362997200787e-06, + "loss": 0.3487, + "step": 1255 + }, + { + "epoch": 4.242577384712571, + "grad_norm": 0.06976392401988353, + "learning_rate": 5.453711019949581e-06, + "loss": 0.344, + "step": 1256 + }, + { + "epoch": 4.245946515055802, + "grad_norm": 0.07078499285712887, + "learning_rate": 5.406251215754146e-06, + "loss": 0.3465, + "step": 1257 + }, + { + "epoch": 4.249315645399031, + "grad_norm": 0.07118826571789505, + "learning_rate": 5.358983848622452e-06, + "loss": 0.3504, + "step": 1258 + }, + { + "epoch": 4.252684775742262, + "grad_norm": 0.0686563097499576, + "learning_rate": 5.311909181491994e-06, + "loss": 0.3433, + "step": 1259 + }, + { + "epoch": 4.256053906085492, + "grad_norm": 0.06836729686980945, + "learning_rate": 5.265027476228297e-06, + "loss": 0.3428, + "step": 1260 + }, + { + "epoch": 4.259423036428722, + "grad_norm": 0.07026205200909408, + "learning_rate": 5.218338993623499e-06, + "loss": 0.3475, + "step": 1261 + }, + { + "epoch": 4.262792166771952, + "grad_norm": 0.07032323091306557, + "learning_rate": 5.171843993394903e-06, + "loss": 0.3431, + "step": 1262 + }, + { + "epoch": 4.2661612971151825, + "grad_norm": 0.07423746533959613, + "learning_rate": 5.125542734183473e-06, + "loss": 0.3445, + "step": 1263 + }, + { + "epoch": 4.269530427458412, + "grad_norm": 0.07841448579779874, + "learning_rate": 5.079435473552474e-06, + "loss": 0.3481, + "step": 1264 + }, + { + "epoch": 4.272899557801642, + "grad_norm": 0.07040437269579536, + "learning_rate": 5.033522467985985e-06, + "loss": 0.3422, + "step": 1265 + }, + { + "epoch": 4.276268688144873, + "grad_norm": 0.07271729651198641, + "learning_rate": 4.987803972887482e-06, + "loss": 0.3433, + "step": 1266 + }, + { + "epoch": 4.279637818488103, + "grad_norm": 0.07717082685197238, + "learning_rate": 4.9422802425784475e-06, + "loss": 0.3459, + "step": 1267 + }, + { + "epoch": 4.283006948831333, + "grad_norm": 0.07646859752104176, + "learning_rate": 4.896951530296896e-06, + "loss": 0.3487, + "step": 1268 + }, + { + "epoch": 4.286376079174563, + "grad_norm": 0.07196146666335995, + "learning_rate": 4.851818088196041e-06, + "loss": 0.3451, + "step": 1269 + }, + { + "epoch": 4.289745209517793, + "grad_norm": 0.07601088345941356, + "learning_rate": 4.806880167342831e-06, + "loss": 0.346, + "step": 1270 + }, + { + "epoch": 4.293114339861023, + "grad_norm": 0.0730390084111676, + "learning_rate": 4.762138017716571e-06, + "loss": 0.3451, + "step": 1271 + }, + { + "epoch": 4.296483470204254, + "grad_norm": 0.08370554873202815, + "learning_rate": 4.7175918882075465e-06, + "loss": 0.3413, + "step": 1272 + }, + { + "epoch": 4.299852600547483, + "grad_norm": 0.07165140458981821, + "learning_rate": 4.673242026615627e-06, + "loss": 0.3413, + "step": 1273 + }, + { + "epoch": 4.303221730890714, + "grad_norm": 0.07124644667052794, + "learning_rate": 4.6290886796488946e-06, + "loss": 0.3474, + "step": 1274 + }, + { + "epoch": 4.306590861233944, + "grad_norm": 0.07331931362741691, + "learning_rate": 4.58513209292224e-06, + "loss": 0.3445, + "step": 1275 + }, + { + "epoch": 4.309959991577174, + "grad_norm": 0.07237727500497035, + "learning_rate": 4.54137251095605e-06, + "loss": 0.3511, + "step": 1276 + }, + { + "epoch": 4.313329121920404, + "grad_norm": 0.07038492284926416, + "learning_rate": 4.4978101771748195e-06, + "loss": 0.3429, + "step": 1277 + }, + { + "epoch": 4.316698252263635, + "grad_norm": 0.07186746493744087, + "learning_rate": 4.454445333905768e-06, + "loss": 0.3423, + "step": 1278 + }, + { + "epoch": 4.320067382606864, + "grad_norm": 0.07185532233373727, + "learning_rate": 4.411278222377551e-06, + "loss": 0.3416, + "step": 1279 + }, + { + "epoch": 4.323436512950095, + "grad_norm": 0.0702075072689657, + "learning_rate": 4.3683090827188666e-06, + "loss": 0.3452, + "step": 1280 + }, + { + "epoch": 4.326805643293325, + "grad_norm": 0.0752614715082349, + "learning_rate": 4.325538153957158e-06, + "loss": 0.3475, + "step": 1281 + }, + { + "epoch": 4.330174773636555, + "grad_norm": 0.07050331941427515, + "learning_rate": 4.282965674017265e-06, + "loss": 0.3477, + "step": 1282 + }, + { + "epoch": 4.333543903979785, + "grad_norm": 0.07219368807869528, + "learning_rate": 4.240591879720084e-06, + "loss": 0.3497, + "step": 1283 + }, + { + "epoch": 4.336913034323016, + "grad_norm": 0.06956963675751204, + "learning_rate": 4.198417006781283e-06, + "loss": 0.3474, + "step": 1284 + }, + { + "epoch": 4.340282164666245, + "grad_norm": 0.06960098578843016, + "learning_rate": 4.156441289809983e-06, + "loss": 0.3445, + "step": 1285 + }, + { + "epoch": 4.343651295009476, + "grad_norm": 0.07648526368534525, + "learning_rate": 4.114664962307439e-06, + "loss": 0.3479, + "step": 1286 + }, + { + "epoch": 4.347020425352706, + "grad_norm": 0.07088809269875901, + "learning_rate": 4.073088256665742e-06, + "loss": 0.3421, + "step": 1287 + }, + { + "epoch": 4.350389555695936, + "grad_norm": 0.07273421779811111, + "learning_rate": 4.031711404166525e-06, + "loss": 0.344, + "step": 1288 + }, + { + "epoch": 4.353758686039166, + "grad_norm": 0.07174713114445853, + "learning_rate": 3.9905346349797234e-06, + "loss": 0.3441, + "step": 1289 + }, + { + "epoch": 4.3571278163823965, + "grad_norm": 0.07290897068132188, + "learning_rate": 3.949558178162209e-06, + "loss": 0.3462, + "step": 1290 + }, + { + "epoch": 4.360496946725626, + "grad_norm": 0.07194649852054723, + "learning_rate": 3.9087822616565984e-06, + "loss": 0.3478, + "step": 1291 + }, + { + "epoch": 4.363866077068857, + "grad_norm": 0.07337266992394913, + "learning_rate": 3.86820711228991e-06, + "loss": 0.3447, + "step": 1292 + }, + { + "epoch": 4.367235207412087, + "grad_norm": 0.07030690021581439, + "learning_rate": 3.827832955772372e-06, + "loss": 0.3456, + "step": 1293 + }, + { + "epoch": 4.370604337755317, + "grad_norm": 0.07201158711941352, + "learning_rate": 3.7876600166961353e-06, + "loss": 0.3465, + "step": 1294 + }, + { + "epoch": 4.373973468098547, + "grad_norm": 0.07511999851456955, + "learning_rate": 3.747688518534003e-06, + "loss": 0.3509, + "step": 1295 + }, + { + "epoch": 4.3773425984417775, + "grad_norm": 0.07172350904328591, + "learning_rate": 3.707918683638223e-06, + "loss": 0.345, + "step": 1296 + }, + { + "epoch": 4.380711728785007, + "grad_norm": 0.0693885503387989, + "learning_rate": 3.6683507332392476e-06, + "loss": 0.3453, + "step": 1297 + }, + { + "epoch": 4.384080859128238, + "grad_norm": 0.07019744686285931, + "learning_rate": 3.628984887444462e-06, + "loss": 0.3432, + "step": 1298 + }, + { + "epoch": 4.387449989471468, + "grad_norm": 0.06892399615992918, + "learning_rate": 3.589821365237023e-06, + "loss": 0.3422, + "step": 1299 + }, + { + "epoch": 4.390819119814698, + "grad_norm": 0.0711323225202878, + "learning_rate": 3.550860384474568e-06, + "loss": 0.3468, + "step": 1300 + }, + { + "epoch": 4.394188250157928, + "grad_norm": 0.07222951484982641, + "learning_rate": 3.5121021618881e-06, + "loss": 0.3444, + "step": 1301 + }, + { + "epoch": 4.3975573805011585, + "grad_norm": 0.07011816955357002, + "learning_rate": 3.473546913080674e-06, + "loss": 0.3417, + "step": 1302 + }, + { + "epoch": 4.400926510844388, + "grad_norm": 0.06918135608237871, + "learning_rate": 3.4351948525262625e-06, + "loss": 0.3431, + "step": 1303 + }, + { + "epoch": 4.404295641187619, + "grad_norm": 0.07183448949638974, + "learning_rate": 3.397046193568558e-06, + "loss": 0.3454, + "step": 1304 + }, + { + "epoch": 4.407664771530849, + "grad_norm": 0.06841029875272973, + "learning_rate": 3.3591011484197744e-06, + "loss": 0.3471, + "step": 1305 + }, + { + "epoch": 4.411033901874079, + "grad_norm": 0.07008578728288764, + "learning_rate": 3.3213599281594688e-06, + "loss": 0.3469, + "step": 1306 + }, + { + "epoch": 4.414403032217309, + "grad_norm": 0.06784411674661273, + "learning_rate": 3.28382274273336e-06, + "loss": 0.3452, + "step": 1307 + }, + { + "epoch": 4.417772162560539, + "grad_norm": 0.06727601165426443, + "learning_rate": 3.246489800952155e-06, + "loss": 0.3513, + "step": 1308 + }, + { + "epoch": 4.421141292903769, + "grad_norm": 0.06930299868926686, + "learning_rate": 3.209361310490451e-06, + "loss": 0.344, + "step": 1309 + }, + { + "epoch": 4.424510423247, + "grad_norm": 0.06983137546711997, + "learning_rate": 3.172437477885475e-06, + "loss": 0.3432, + "step": 1310 + }, + { + "epoch": 4.4278795535902296, + "grad_norm": 0.06738405898147315, + "learning_rate": 3.1357185085360233e-06, + "loss": 0.3412, + "step": 1311 + }, + { + "epoch": 4.431248683933459, + "grad_norm": 0.069114436702608, + "learning_rate": 3.099204606701256e-06, + "loss": 0.3438, + "step": 1312 + }, + { + "epoch": 4.43461781427669, + "grad_norm": 0.07063250147224803, + "learning_rate": 3.062895975499616e-06, + "loss": 0.3449, + "step": 1313 + }, + { + "epoch": 4.43798694461992, + "grad_norm": 0.06869203050534661, + "learning_rate": 3.026792816907671e-06, + "loss": 0.347, + "step": 1314 + }, + { + "epoch": 4.44135607496315, + "grad_norm": 0.06790795340800003, + "learning_rate": 2.9908953317589675e-06, + "loss": 0.3511, + "step": 1315 + }, + { + "epoch": 4.44472520530638, + "grad_norm": 0.06801706888897209, + "learning_rate": 2.955203719742965e-06, + "loss": 0.3499, + "step": 1316 + }, + { + "epoch": 4.4480943356496105, + "grad_norm": 0.06703090567229934, + "learning_rate": 2.9197181794038896e-06, + "loss": 0.3409, + "step": 1317 + }, + { + "epoch": 4.45146346599284, + "grad_norm": 0.06845785402581211, + "learning_rate": 2.884438908139626e-06, + "loss": 0.3451, + "step": 1318 + }, + { + "epoch": 4.454832596336071, + "grad_norm": 0.06809288242514337, + "learning_rate": 2.8493661022006615e-06, + "loss": 0.349, + "step": 1319 + }, + { + "epoch": 4.458201726679301, + "grad_norm": 0.06993068933675987, + "learning_rate": 2.814499956688912e-06, + "loss": 0.3457, + "step": 1320 + }, + { + "epoch": 4.461570857022531, + "grad_norm": 0.06709969061038806, + "learning_rate": 2.7798406655567565e-06, + "loss": 0.3512, + "step": 1321 + }, + { + "epoch": 4.464939987365761, + "grad_norm": 0.06978980053246452, + "learning_rate": 2.7453884216058368e-06, + "loss": 0.3452, + "step": 1322 + }, + { + "epoch": 4.4683091177089915, + "grad_norm": 0.06806425403838408, + "learning_rate": 2.7111434164860573e-06, + "loss": 0.3489, + "step": 1323 + }, + { + "epoch": 4.471678248052221, + "grad_norm": 0.07023315792460011, + "learning_rate": 2.677105840694507e-06, + "loss": 0.3484, + "step": 1324 + }, + { + "epoch": 4.475047378395452, + "grad_norm": 0.0671632913864402, + "learning_rate": 2.6432758835743854e-06, + "loss": 0.3475, + "step": 1325 + }, + { + "epoch": 4.478416508738682, + "grad_norm": 0.0668737342617598, + "learning_rate": 2.6096537333139616e-06, + "loss": 0.3402, + "step": 1326 + }, + { + "epoch": 4.481785639081912, + "grad_norm": 0.06731813732301019, + "learning_rate": 2.5762395769455183e-06, + "loss": 0.3472, + "step": 1327 + }, + { + "epoch": 4.485154769425142, + "grad_norm": 0.06962894223132757, + "learning_rate": 2.5430336003443045e-06, + "loss": 0.3411, + "step": 1328 + }, + { + "epoch": 4.4885238997683725, + "grad_norm": 0.06651868659879541, + "learning_rate": 2.5100359882275526e-06, + "loss": 0.3463, + "step": 1329 + }, + { + "epoch": 4.491893030111602, + "grad_norm": 0.06589574436809537, + "learning_rate": 2.4772469241533648e-06, + "loss": 0.3449, + "step": 1330 + }, + { + "epoch": 4.495262160454833, + "grad_norm": 0.06851573366912253, + "learning_rate": 2.444666590519775e-06, + "loss": 0.3478, + "step": 1331 + }, + { + "epoch": 4.498631290798063, + "grad_norm": 0.06812334086330306, + "learning_rate": 2.4122951685636674e-06, + "loss": 0.3493, + "step": 1332 + }, + { + "epoch": 4.502000421141293, + "grad_norm": 0.06783544762672909, + "learning_rate": 2.380132838359819e-06, + "loss": 0.3458, + "step": 1333 + }, + { + "epoch": 4.505369551484523, + "grad_norm": 0.06645851016955091, + "learning_rate": 2.3481797788198745e-06, + "loss": 0.3487, + "step": 1334 + }, + { + "epoch": 4.5087386818277535, + "grad_norm": 0.06691716361429041, + "learning_rate": 2.3164361676913406e-06, + "loss": 0.3461, + "step": 1335 + }, + { + "epoch": 4.512107812170983, + "grad_norm": 0.066561132769546, + "learning_rate": 2.284902181556632e-06, + "loss": 0.3451, + "step": 1336 + }, + { + "epoch": 4.515476942514214, + "grad_norm": 0.06972464277014613, + "learning_rate": 2.2535779958320614e-06, + "loss": 0.3363, + "step": 1337 + }, + { + "epoch": 4.5188460728574436, + "grad_norm": 0.06662582951723346, + "learning_rate": 2.2224637847668484e-06, + "loss": 0.3462, + "step": 1338 + }, + { + "epoch": 4.522215203200674, + "grad_norm": 0.06683364588110276, + "learning_rate": 2.1915597214422048e-06, + "loss": 0.345, + "step": 1339 + }, + { + "epoch": 4.525584333543904, + "grad_norm": 0.06973071855720024, + "learning_rate": 2.1608659777703033e-06, + "loss": 0.3486, + "step": 1340 + }, + { + "epoch": 4.5289534638871345, + "grad_norm": 0.06547912030107868, + "learning_rate": 2.130382724493405e-06, + "loss": 0.3481, + "step": 1341 + }, + { + "epoch": 4.532322594230364, + "grad_norm": 0.06796161455803124, + "learning_rate": 2.100110131182813e-06, + "loss": 0.3488, + "step": 1342 + }, + { + "epoch": 4.535691724573595, + "grad_norm": 0.06643717641974535, + "learning_rate": 2.070048366238e-06, + "loss": 0.3453, + "step": 1343 + }, + { + "epoch": 4.5390608549168245, + "grad_norm": 0.0657312313993076, + "learning_rate": 2.0401975968856514e-06, + "loss": 0.3364, + "step": 1344 + }, + { + "epoch": 4.542429985260055, + "grad_norm": 0.0662991056630753, + "learning_rate": 2.010557989178725e-06, + "loss": 0.3456, + "step": 1345 + }, + { + "epoch": 4.545799115603285, + "grad_norm": 0.06723548381525182, + "learning_rate": 1.981129707995542e-06, + "loss": 0.3428, + "step": 1346 + }, + { + "epoch": 4.549168245946515, + "grad_norm": 0.06854275132803765, + "learning_rate": 1.9519129170388496e-06, + "loss": 0.3519, + "step": 1347 + }, + { + "epoch": 4.552537376289745, + "grad_norm": 0.0687917997485082, + "learning_rate": 1.9229077788349393e-06, + "loss": 0.342, + "step": 1348 + }, + { + "epoch": 4.555906506632976, + "grad_norm": 0.06728042882661939, + "learning_rate": 1.8941144547327228e-06, + "loss": 0.3513, + "step": 1349 + }, + { + "epoch": 4.5592756369762055, + "grad_norm": 0.06733086071107253, + "learning_rate": 1.865533104902828e-06, + "loss": 0.3432, + "step": 1350 + }, + { + "epoch": 4.562644767319435, + "grad_norm": 0.06653132755662035, + "learning_rate": 1.8371638883367371e-06, + "loss": 0.3455, + "step": 1351 + }, + { + "epoch": 4.566013897662666, + "grad_norm": 0.07062467690102314, + "learning_rate": 1.8090069628458583e-06, + "loss": 0.3513, + "step": 1352 + }, + { + "epoch": 4.5693830280058965, + "grad_norm": 0.06749739958232552, + "learning_rate": 1.7810624850607007e-06, + "loss": 0.3422, + "step": 1353 + }, + { + "epoch": 4.572752158349126, + "grad_norm": 0.06715174264716953, + "learning_rate": 1.7533306104299663e-06, + "loss": 0.3427, + "step": 1354 + }, + { + "epoch": 4.576121288692356, + "grad_norm": 0.06825607468688703, + "learning_rate": 1.7258114932196824e-06, + "loss": 0.3484, + "step": 1355 + }, + { + "epoch": 4.5794904190355865, + "grad_norm": 0.0662384762896948, + "learning_rate": 1.6985052865123641e-06, + "loss": 0.344, + "step": 1356 + }, + { + "epoch": 4.582859549378816, + "grad_norm": 0.06749795339121123, + "learning_rate": 1.6714121422061636e-06, + "loss": 0.348, + "step": 1357 + }, + { + "epoch": 4.586228679722047, + "grad_norm": 0.06937799589584792, + "learning_rate": 1.6445322110140116e-06, + "loss": 0.3473, + "step": 1358 + }, + { + "epoch": 4.589597810065277, + "grad_norm": 0.06748221547140407, + "learning_rate": 1.617865642462766e-06, + "loss": 0.3414, + "step": 1359 + }, + { + "epoch": 4.592966940408507, + "grad_norm": 0.06814928775630703, + "learning_rate": 1.59141258489242e-06, + "loss": 0.345, + "step": 1360 + }, + { + "epoch": 4.596336070751737, + "grad_norm": 0.07057379791962957, + "learning_rate": 1.5651731854552466e-06, + "loss": 0.3432, + "step": 1361 + }, + { + "epoch": 4.5997052010949675, + "grad_norm": 0.06665029276024906, + "learning_rate": 1.53914759011498e-06, + "loss": 0.3524, + "step": 1362 + }, + { + "epoch": 4.603074331438197, + "grad_norm": 0.06906650342043347, + "learning_rate": 1.513335943646026e-06, + "loss": 0.3457, + "step": 1363 + }, + { + "epoch": 4.606443461781428, + "grad_norm": 0.06942705785663987, + "learning_rate": 1.4877383896326269e-06, + "loss": 0.3435, + "step": 1364 + }, + { + "epoch": 4.6098125921246575, + "grad_norm": 0.06819335124159634, + "learning_rate": 1.4623550704680889e-06, + "loss": 0.3508, + "step": 1365 + }, + { + "epoch": 4.613181722467888, + "grad_norm": 0.06742489592183823, + "learning_rate": 1.4371861273539778e-06, + "loss": 0.3457, + "step": 1366 + }, + { + "epoch": 4.616550852811118, + "grad_norm": 0.064467972456891, + "learning_rate": 1.4122317002993247e-06, + "loss": 0.3437, + "step": 1367 + }, + { + "epoch": 4.6199199831543485, + "grad_norm": 0.06450585611276803, + "learning_rate": 1.3874919281198662e-06, + "loss": 0.3471, + "step": 1368 + }, + { + "epoch": 4.623289113497578, + "grad_norm": 0.06675137780602221, + "learning_rate": 1.3629669484372722e-06, + "loss": 0.3497, + "step": 1369 + }, + { + "epoch": 4.626658243840809, + "grad_norm": 0.06713388756947067, + "learning_rate": 1.3386568976783453e-06, + "loss": 0.3423, + "step": 1370 + }, + { + "epoch": 4.6300273741840385, + "grad_norm": 0.0647734710561896, + "learning_rate": 1.3145619110743169e-06, + "loss": 0.3451, + "step": 1371 + }, + { + "epoch": 4.633396504527269, + "grad_norm": 0.06580879452568121, + "learning_rate": 1.2906821226600453e-06, + "loss": 0.3429, + "step": 1372 + }, + { + "epoch": 4.636765634870499, + "grad_norm": 0.06578978457756152, + "learning_rate": 1.2670176652733023e-06, + "loss": 0.342, + "step": 1373 + }, + { + "epoch": 4.6401347652137295, + "grad_norm": 0.06786565921397064, + "learning_rate": 1.2435686705540228e-06, + "loss": 0.3458, + "step": 1374 + }, + { + "epoch": 4.643503895556959, + "grad_norm": 0.06730192180307096, + "learning_rate": 1.2203352689435532e-06, + "loss": 0.3505, + "step": 1375 + }, + { + "epoch": 4.64687302590019, + "grad_norm": 0.06442684402191479, + "learning_rate": 1.1973175896839684e-06, + "loss": 0.3417, + "step": 1376 + }, + { + "epoch": 4.6502421562434195, + "grad_norm": 0.06497046470832643, + "learning_rate": 1.1745157608173253e-06, + "loss": 0.3429, + "step": 1377 + }, + { + "epoch": 4.65361128658665, + "grad_norm": 0.0655614246650691, + "learning_rate": 1.1519299091849523e-06, + "loss": 0.3405, + "step": 1378 + }, + { + "epoch": 4.65698041692988, + "grad_norm": 0.06746924444935623, + "learning_rate": 1.1295601604267348e-06, + "loss": 0.347, + "step": 1379 + }, + { + "epoch": 4.6603495472731105, + "grad_norm": 0.06671677812012947, + "learning_rate": 1.1074066389804395e-06, + "loss": 0.348, + "step": 1380 + }, + { + "epoch": 4.66371867761634, + "grad_norm": 0.06798688584484958, + "learning_rate": 1.0854694680810175e-06, + "loss": 0.3468, + "step": 1381 + }, + { + "epoch": 4.667087807959571, + "grad_norm": 0.06373690906496436, + "learning_rate": 1.0637487697598937e-06, + "loss": 0.3391, + "step": 1382 + }, + { + "epoch": 4.6704569383028005, + "grad_norm": 0.06902986516002681, + "learning_rate": 1.0422446648443142e-06, + "loss": 0.3449, + "step": 1383 + }, + { + "epoch": 4.673826068646031, + "grad_norm": 0.06783886040134948, + "learning_rate": 1.0209572729566708e-06, + "loss": 0.3469, + "step": 1384 + }, + { + "epoch": 4.677195198989261, + "grad_norm": 0.06789415607732335, + "learning_rate": 9.998867125138223e-07, + "loss": 0.3483, + "step": 1385 + }, + { + "epoch": 4.680564329332491, + "grad_norm": 0.06478682570392917, + "learning_rate": 9.790331007264543e-07, + "loss": 0.3465, + "step": 1386 + }, + { + "epoch": 4.683933459675721, + "grad_norm": 0.06659198241596209, + "learning_rate": 9.583965535983997e-07, + "loss": 0.3377, + "step": 1387 + }, + { + "epoch": 4.687302590018952, + "grad_norm": 0.06679774424195298, + "learning_rate": 9.379771859260267e-07, + "loss": 0.3474, + "step": 1388 + }, + { + "epoch": 4.6906717203621815, + "grad_norm": 0.06562337466888649, + "learning_rate": 9.177751112975853e-07, + "loss": 0.3378, + "step": 1389 + }, + { + "epoch": 4.694040850705411, + "grad_norm": 0.0643058634496552, + "learning_rate": 8.977904420925543e-07, + "loss": 0.3401, + "step": 1390 + }, + { + "epoch": 4.697409981048642, + "grad_norm": 0.06520681777558435, + "learning_rate": 8.780232894810558e-07, + "loss": 0.3476, + "step": 1391 + }, + { + "epoch": 4.700779111391872, + "grad_norm": 0.06652677782803126, + "learning_rate": 8.584737634232154e-07, + "loss": 0.3445, + "step": 1392 + }, + { + "epoch": 4.704148241735102, + "grad_norm": 0.06513347952901734, + "learning_rate": 8.391419726685446e-07, + "loss": 0.3486, + "step": 1393 + }, + { + "epoch": 4.707517372078332, + "grad_norm": 0.06577657248355921, + "learning_rate": 8.200280247553461e-07, + "loss": 0.3461, + "step": 1394 + }, + { + "epoch": 4.7108865024215625, + "grad_norm": 0.06369190711960318, + "learning_rate": 8.011320260101052e-07, + "loss": 0.3478, + "step": 1395 + }, + { + "epoch": 4.714255632764792, + "grad_norm": 0.06569207225402134, + "learning_rate": 7.824540815469306e-07, + "loss": 0.3496, + "step": 1396 + }, + { + "epoch": 4.717624763108023, + "grad_norm": 0.0636558204987421, + "learning_rate": 7.639942952669232e-07, + "loss": 0.3462, + "step": 1397 + }, + { + "epoch": 4.7209938934512525, + "grad_norm": 0.06451389941556673, + "learning_rate": 7.457527698576217e-07, + "loss": 0.3454, + "step": 1398 + }, + { + "epoch": 4.724363023794483, + "grad_norm": 0.06490245056573639, + "learning_rate": 7.277296067924377e-07, + "loss": 0.345, + "step": 1399 + }, + { + "epoch": 4.727732154137713, + "grad_norm": 0.06421211046867673, + "learning_rate": 7.099249063300751e-07, + "loss": 0.3509, + "step": 1400 + }, + { + "epoch": 4.7311012844809435, + "grad_norm": 0.06376468633122387, + "learning_rate": 6.923387675139958e-07, + "loss": 0.3449, + "step": 1401 + }, + { + "epoch": 4.734470414824173, + "grad_norm": 0.06306595288457956, + "learning_rate": 6.749712881718306e-07, + "loss": 0.3438, + "step": 1402 + }, + { + "epoch": 4.737839545167404, + "grad_norm": 0.064531257088043, + "learning_rate": 6.578225649148806e-07, + "loss": 0.3459, + "step": 1403 + }, + { + "epoch": 4.7412086755106335, + "grad_norm": 0.06475033645731526, + "learning_rate": 6.408926931375403e-07, + "loss": 0.3489, + "step": 1404 + }, + { + "epoch": 4.744577805853864, + "grad_norm": 0.06725279891073008, + "learning_rate": 6.241817670167961e-07, + "loss": 0.3517, + "step": 1405 + }, + { + "epoch": 4.747946936197094, + "grad_norm": 0.06576628806576036, + "learning_rate": 6.076898795116792e-07, + "loss": 0.3476, + "step": 1406 + }, + { + "epoch": 4.7513160665403245, + "grad_norm": 0.06636084321383787, + "learning_rate": 5.914171223627652e-07, + "loss": 0.3431, + "step": 1407 + }, + { + "epoch": 4.754685196883554, + "grad_norm": 0.06307439592979396, + "learning_rate": 5.753635860916617e-07, + "loss": 0.344, + "step": 1408 + }, + { + "epoch": 4.758054327226785, + "grad_norm": 0.06354853186497929, + "learning_rate": 5.595293600004948e-07, + "loss": 0.3452, + "step": 1409 + }, + { + "epoch": 4.7614234575700145, + "grad_norm": 0.06640861850363539, + "learning_rate": 5.43914532171419e-07, + "loss": 0.3498, + "step": 1410 + }, + { + "epoch": 4.764792587913245, + "grad_norm": 0.06432227056221736, + "learning_rate": 5.285191894661257e-07, + "loss": 0.3448, + "step": 1411 + }, + { + "epoch": 4.768161718256475, + "grad_norm": 0.0650298496723325, + "learning_rate": 5.133434175253715e-07, + "loss": 0.348, + "step": 1412 + }, + { + "epoch": 4.771530848599705, + "grad_norm": 0.0642338741687956, + "learning_rate": 4.983873007684769e-07, + "loss": 0.3504, + "step": 1413 + }, + { + "epoch": 4.774899978942935, + "grad_norm": 0.06597221985673193, + "learning_rate": 4.83650922392882e-07, + "loss": 0.3443, + "step": 1414 + }, + { + "epoch": 4.778269109286166, + "grad_norm": 0.06414310328903884, + "learning_rate": 4.691343643736579e-07, + "loss": 0.3498, + "step": 1415 + }, + { + "epoch": 4.7816382396293955, + "grad_norm": 0.06423727553913079, + "learning_rate": 4.5483770746309383e-07, + "loss": 0.3462, + "step": 1416 + }, + { + "epoch": 4.785007369972626, + "grad_norm": 0.06712703203955196, + "learning_rate": 4.4076103119018666e-07, + "loss": 0.344, + "step": 1417 + }, + { + "epoch": 4.788376500315856, + "grad_norm": 0.06406676946222813, + "learning_rate": 4.269044138602585e-07, + "loss": 0.3424, + "step": 1418 + }, + { + "epoch": 4.791745630659086, + "grad_norm": 0.0650048525731774, + "learning_rate": 4.132679325544775e-07, + "loss": 0.3434, + "step": 1419 + }, + { + "epoch": 4.795114761002316, + "grad_norm": 0.06381393163242242, + "learning_rate": 3.998516631294491e-07, + "loss": 0.3464, + "step": 1420 + }, + { + "epoch": 4.798483891345547, + "grad_norm": 0.062168147457412865, + "learning_rate": 3.866556802167942e-07, + "loss": 0.3447, + "step": 1421 + }, + { + "epoch": 4.8018530216887765, + "grad_norm": 0.06359774281703022, + "learning_rate": 3.736800572227317e-07, + "loss": 0.3452, + "step": 1422 + }, + { + "epoch": 4.805222152032007, + "grad_norm": 0.06777082256384792, + "learning_rate": 3.6092486632766543e-07, + "loss": 0.3405, + "step": 1423 + }, + { + "epoch": 4.808591282375237, + "grad_norm": 0.06518391137080269, + "learning_rate": 3.483901784857846e-07, + "loss": 0.3499, + "step": 1424 + }, + { + "epoch": 4.811960412718467, + "grad_norm": 0.06360491257484012, + "learning_rate": 3.3607606342467293e-07, + "loss": 0.3464, + "step": 1425 + }, + { + "epoch": 4.815329543061697, + "grad_norm": 0.0630016058736709, + "learning_rate": 3.239825896449267e-07, + "loss": 0.3493, + "step": 1426 + }, + { + "epoch": 4.818698673404928, + "grad_norm": 0.06424370898677036, + "learning_rate": 3.1210982441974623e-07, + "loss": 0.3424, + "step": 1427 + }, + { + "epoch": 4.8220678037481575, + "grad_norm": 0.06333420103209184, + "learning_rate": 3.004578337945985e-07, + "loss": 0.3444, + "step": 1428 + }, + { + "epoch": 4.825436934091387, + "grad_norm": 0.06413449663730773, + "learning_rate": 2.8902668258683043e-07, + "loss": 0.3465, + "step": 1429 + }, + { + "epoch": 4.828806064434618, + "grad_norm": 0.06372049815441223, + "learning_rate": 2.778164343852918e-07, + "loss": 0.3478, + "step": 1430 + }, + { + "epoch": 4.832175194777848, + "grad_norm": 0.06414269762017184, + "learning_rate": 2.668271515500287e-07, + "loss": 0.3502, + "step": 1431 + }, + { + "epoch": 4.835544325121078, + "grad_norm": 0.06533137367117652, + "learning_rate": 2.5605889521188364e-07, + "loss": 0.3491, + "step": 1432 + }, + { + "epoch": 4.838913455464308, + "grad_norm": 0.06350312986484183, + "learning_rate": 2.455117252721895e-07, + "loss": 0.3453, + "step": 1433 + }, + { + "epoch": 4.8422825858075385, + "grad_norm": 0.06475788404284327, + "learning_rate": 2.351857004024316e-07, + "loss": 0.3503, + "step": 1434 + }, + { + "epoch": 4.845651716150769, + "grad_norm": 0.0631781774805789, + "learning_rate": 2.2508087804390178e-07, + "loss": 0.3446, + "step": 1435 + }, + { + "epoch": 4.849020846493999, + "grad_norm": 0.06379282423784381, + "learning_rate": 2.1519731440740487e-07, + "loss": 0.3474, + "step": 1436 + }, + { + "epoch": 4.8523899768372285, + "grad_norm": 0.06402172658556064, + "learning_rate": 2.055350644729348e-07, + "loss": 0.3511, + "step": 1437 + }, + { + "epoch": 4.855759107180459, + "grad_norm": 0.06513215066751245, + "learning_rate": 1.9609418198935916e-07, + "loss": 0.3471, + "step": 1438 + }, + { + "epoch": 4.859128237523689, + "grad_norm": 0.06283559414952865, + "learning_rate": 1.8687471947413495e-07, + "loss": 0.3446, + "step": 1439 + }, + { + "epoch": 4.862497367866919, + "grad_norm": 0.06309493725276366, + "learning_rate": 1.778767282130156e-07, + "loss": 0.3431, + "step": 1440 + }, + { + "epoch": 4.865866498210149, + "grad_norm": 0.06560809934411752, + "learning_rate": 1.691002582597534e-07, + "loss": 0.3526, + "step": 1441 + }, + { + "epoch": 4.86923562855338, + "grad_norm": 0.06433417193452762, + "learning_rate": 1.6054535843582854e-07, + "loss": 0.3507, + "step": 1442 + }, + { + "epoch": 4.8726047588966095, + "grad_norm": 0.06442999780392818, + "learning_rate": 1.522120763301782e-07, + "loss": 0.3492, + "step": 1443 + }, + { + "epoch": 4.87597388923984, + "grad_norm": 0.06306148601810407, + "learning_rate": 1.4410045829893915e-07, + "loss": 0.3434, + "step": 1444 + }, + { + "epoch": 4.87934301958307, + "grad_norm": 0.06308220046755993, + "learning_rate": 1.3621054946517666e-07, + "loss": 0.3445, + "step": 1445 + }, + { + "epoch": 4.8827121499263, + "grad_norm": 0.06305097370353915, + "learning_rate": 1.2854239371863142e-07, + "loss": 0.3431, + "step": 1446 + }, + { + "epoch": 4.88608128026953, + "grad_norm": 0.06293090962933129, + "learning_rate": 1.2109603371548873e-07, + "loss": 0.3397, + "step": 1447 + }, + { + "epoch": 4.889450410612761, + "grad_norm": 0.06368330582611549, + "learning_rate": 1.1387151087814297e-07, + "loss": 0.3468, + "step": 1448 + }, + { + "epoch": 4.8928195409559905, + "grad_norm": 0.0642396525858067, + "learning_rate": 1.06868865394949e-07, + "loss": 0.3419, + "step": 1449 + }, + { + "epoch": 4.896188671299221, + "grad_norm": 0.06286580837917152, + "learning_rate": 1.0008813622001345e-07, + "loss": 0.3465, + "step": 1450 + }, + { + "epoch": 4.899557801642451, + "grad_norm": 0.0646704999704258, + "learning_rate": 9.352936107296817e-08, + "loss": 0.3515, + "step": 1451 + }, + { + "epoch": 4.902926931985681, + "grad_norm": 0.06254527612862122, + "learning_rate": 8.719257643877044e-08, + "loss": 0.3418, + "step": 1452 + }, + { + "epoch": 4.906296062328911, + "grad_norm": 0.06265534232163783, + "learning_rate": 8.107781756749866e-08, + "loss": 0.3417, + "step": 1453 + }, + { + "epoch": 4.909665192672142, + "grad_norm": 0.06417368994248919, + "learning_rate": 7.51851184741481e-08, + "loss": 0.3451, + "step": 1454 + }, + { + "epoch": 4.9130343230153715, + "grad_norm": 0.06427635001716354, + "learning_rate": 6.951451193844883e-08, + "loss": 0.3517, + "step": 1455 + }, + { + "epoch": 4.916403453358602, + "grad_norm": 0.06446286415220177, + "learning_rate": 6.40660295046791e-08, + "loss": 0.3499, + "step": 1456 + }, + { + "epoch": 4.919772583701832, + "grad_norm": 0.06325304997383964, + "learning_rate": 5.8839701481487875e-08, + "loss": 0.3437, + "step": 1457 + }, + { + "epoch": 4.923141714045062, + "grad_norm": 0.06376968784671593, + "learning_rate": 5.3835556941743695e-08, + "loss": 0.3423, + "step": 1458 + }, + { + "epoch": 4.926510844388292, + "grad_norm": 0.06529781285688359, + "learning_rate": 4.905362372234379e-08, + "loss": 0.3492, + "step": 1459 + }, + { + "epoch": 4.929879974731523, + "grad_norm": 0.06414078488995091, + "learning_rate": 4.449392842408529e-08, + "loss": 0.3479, + "step": 1460 + }, + { + "epoch": 4.9332491050747524, + "grad_norm": 0.06362859239383568, + "learning_rate": 4.015649641150976e-08, + "loss": 0.3492, + "step": 1461 + }, + { + "epoch": 4.936618235417983, + "grad_norm": 0.06341769294492185, + "learning_rate": 3.6041351812743374e-08, + "loss": 0.351, + "step": 1462 + }, + { + "epoch": 4.939987365761213, + "grad_norm": 0.06486183719402762, + "learning_rate": 3.21485175193903e-08, + "loss": 0.3511, + "step": 1463 + }, + { + "epoch": 4.943356496104443, + "grad_norm": 0.06360741943701602, + "learning_rate": 2.8478015186399477e-08, + "loss": 0.3471, + "step": 1464 + }, + { + "epoch": 4.946725626447673, + "grad_norm": 0.06343696624954866, + "learning_rate": 2.5029865231922524e-08, + "loss": 0.3448, + "step": 1465 + }, + { + "epoch": 4.950094756790904, + "grad_norm": 0.06343915127065658, + "learning_rate": 2.1804086837229344e-08, + "loss": 0.3416, + "step": 1466 + }, + { + "epoch": 4.953463887134133, + "grad_norm": 0.06487303485827695, + "learning_rate": 1.880069794657935e-08, + "loss": 0.3444, + "step": 1467 + }, + { + "epoch": 4.956833017477363, + "grad_norm": 0.062408603956769386, + "learning_rate": 1.601971526713708e-08, + "loss": 0.341, + "step": 1468 + }, + { + "epoch": 4.960202147820594, + "grad_norm": 0.06255760369115392, + "learning_rate": 1.3461154268865628e-08, + "loss": 0.3445, + "step": 1469 + }, + { + "epoch": 4.963571278163824, + "grad_norm": 0.062112638608570706, + "learning_rate": 1.112502918445113e-08, + "loss": 0.3391, + "step": 1470 + }, + { + "epoch": 4.966940408507054, + "grad_norm": 0.06398681422452646, + "learning_rate": 9.011353009222846e-09, + "loss": 0.3455, + "step": 1471 + }, + { + "epoch": 4.970309538850284, + "grad_norm": 0.0637738300165632, + "learning_rate": 7.12013750107321e-09, + "loss": 0.3438, + "step": 1472 + }, + { + "epoch": 4.973678669193514, + "grad_norm": 0.06456086790149927, + "learning_rate": 5.451393180400111e-09, + "loss": 0.3486, + "step": 1473 + }, + { + "epoch": 4.977047799536745, + "grad_norm": 0.06334490636848067, + "learning_rate": 4.00512933004471e-09, + "loss": 0.3456, + "step": 1474 + }, + { + "epoch": 4.980416929879975, + "grad_norm": 0.06295292438577572, + "learning_rate": 2.7813539952381563e-09, + "loss": 0.3445, + "step": 1475 + }, + { + "epoch": 4.9837860602232045, + "grad_norm": 0.0633108315280129, + "learning_rate": 1.7800739835616143e-09, + "loss": 0.3451, + "step": 1476 + }, + { + "epoch": 4.987155190566435, + "grad_norm": 0.0630218856533905, + "learning_rate": 1.0012948649018584e-09, + "loss": 0.3497, + "step": 1477 + }, + { + "epoch": 4.990524320909666, + "grad_norm": 0.06351551674205162, + "learning_rate": 4.450209714379483e-10, + "loss": 0.3382, + "step": 1478 + }, + { + "epoch": 4.993893451252895, + "grad_norm": 0.06362931363383374, + "learning_rate": 1.1125539757905756e-10, + "loss": 0.3436, + "step": 1479 + }, + { + "epoch": 4.997262581596125, + "grad_norm": 0.0635855435860357, + "learning_rate": 0.0, + "loss": 0.3456, + "step": 1480 + }, + { + "epoch": 4.997262581596125, + "step": 1480, + "total_flos": 3.94117975967185e+19, + "train_loss": 0.06913654437741718, + "train_runtime": 69116.03, + "train_samples_per_second": 10.993, + "train_steps_per_second": 0.021 + } + ], + "logging_steps": 1, + "max_steps": 1480, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.94117975967185e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}