{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 782,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025575447570332483,
      "grad_norm": 23.002245937737467,
      "learning_rate": 9.999959651660741e-06,
      "loss": 0.7701,
      "step": 1
    },
    {
      "epoch": 0.005115089514066497,
      "grad_norm": 19.79462448378255,
      "learning_rate": 9.999838607294157e-06,
      "loss": 0.6713,
      "step": 2
    },
    {
      "epoch": 0.0076726342710997444,
      "grad_norm": 55.54404299524967,
      "learning_rate": 9.999636868853824e-06,
      "loss": 0.4213,
      "step": 3
    },
    {
      "epoch": 0.010230179028132993,
      "grad_norm": 26.73299552059074,
      "learning_rate": 9.999354439595668e-06,
      "loss": 0.448,
      "step": 4
    },
    {
      "epoch": 0.01278772378516624,
      "grad_norm": 50.03710815295853,
      "learning_rate": 9.998991324077906e-06,
      "loss": 0.3936,
      "step": 5
    },
    {
      "epoch": 0.015345268542199489,
      "grad_norm": 5.992119789491709,
      "learning_rate": 9.998547528160987e-06,
      "loss": 0.4206,
      "step": 6
    },
    {
      "epoch": 0.017902813299232736,
      "grad_norm": 4.816601147153798,
      "learning_rate": 9.998023059007477e-06,
      "loss": 0.3167,
      "step": 7
    },
    {
      "epoch": 0.020460358056265986,
      "grad_norm": 5.01117985790271,
      "learning_rate": 9.997417925081963e-06,
      "loss": 0.3277,
      "step": 8
    },
    {
      "epoch": 0.023017902813299233,
      "grad_norm": 4.645354592299978,
      "learning_rate": 9.996732136150902e-06,
      "loss": 0.3238,
      "step": 9
    },
    {
      "epoch": 0.02557544757033248,
      "grad_norm": 3.8492784258791737,
      "learning_rate": 9.995965703282472e-06,
      "loss": 0.2923,
      "step": 10
    },
    {
      "epoch": 0.028132992327365727,
      "grad_norm": 5.08776547587877,
      "learning_rate": 9.995118638846394e-06,
      "loss": 0.3566,
      "step": 11
    },
    {
      "epoch": 0.030690537084398978,
      "grad_norm": 5.890303471165605,
      "learning_rate": 9.99419095651372e-06,
      "loss": 0.3514,
      "step": 12
    },
    {
      "epoch": 0.03324808184143223,
      "grad_norm": 4.820498331793851,
      "learning_rate": 9.993182671256633e-06,
      "loss": 0.3659,
      "step": 13
    },
    {
      "epoch": 0.03580562659846547,
      "grad_norm": 5.443835157723716,
      "learning_rate": 9.992093799348182e-06,
      "loss": 0.3363,
      "step": 14
    },
    {
      "epoch": 0.03836317135549872,
      "grad_norm": 31.850268661359863,
      "learning_rate": 9.990924358362037e-06,
      "loss": 0.2992,
      "step": 15
    },
    {
      "epoch": 0.04092071611253197,
      "grad_norm": 4.020803137085757,
      "learning_rate": 9.9896743671722e-06,
      "loss": 0.3472,
      "step": 16
    },
    {
      "epoch": 0.043478260869565216,
      "grad_norm": 4.704323903966358,
      "learning_rate": 9.988343845952697e-06,
      "loss": 0.3549,
      "step": 17
    },
    {
      "epoch": 0.04603580562659847,
      "grad_norm": 4.016905077737534,
      "learning_rate": 9.986932816177258e-06,
      "loss": 0.2994,
      "step": 18
    },
    {
      "epoch": 0.04859335038363171,
      "grad_norm": 5.055643436808016,
      "learning_rate": 9.985441300618966e-06,
      "loss": 0.3179,
      "step": 19
    },
    {
      "epoch": 0.05115089514066496,
      "grad_norm": 4.296200137910896,
      "learning_rate": 9.98386932334989e-06,
      "loss": 0.3163,
      "step": 20
    },
    {
      "epoch": 0.05370843989769821,
      "grad_norm": 3.6283871609346217,
      "learning_rate": 9.982216909740703e-06,
      "loss": 0.2571,
      "step": 21
    },
    {
      "epoch": 0.056265984654731455,
      "grad_norm": 3.6813787133878213,
      "learning_rate": 9.980484086460258e-06,
      "loss": 0.3126,
      "step": 22
    },
    {
      "epoch": 0.058823529411764705,
      "grad_norm": 4.358055860374409,
      "learning_rate": 9.978670881475173e-06,
      "loss": 0.2937,
      "step": 23
    },
    {
      "epoch": 0.061381074168797956,
      "grad_norm": 4.477520167271448,
      "learning_rate": 9.976777324049374e-06,
      "loss": 0.3215,
      "step": 24
    },
    {
      "epoch": 0.0639386189258312,
      "grad_norm": 3.7668725752663277,
      "learning_rate": 9.974803444743617e-06,
      "loss": 0.2998,
      "step": 25
    },
    {
      "epoch": 0.06649616368286446,
      "grad_norm": 4.0809484824780435,
      "learning_rate": 9.972749275415005e-06,
      "loss": 0.3044,
      "step": 26
    },
    {
      "epoch": 0.06905370843989769,
      "grad_norm": 5.413718732497824,
      "learning_rate": 9.970614849216465e-06,
      "loss": 0.2632,
      "step": 27
    },
    {
      "epoch": 0.07161125319693094,
      "grad_norm": 3.730536919122114,
      "learning_rate": 9.96840020059622e-06,
      "loss": 0.3246,
      "step": 28
    },
    {
      "epoch": 0.0741687979539642,
      "grad_norm": 3.731019179851398,
      "learning_rate": 9.966105365297226e-06,
      "loss": 0.276,
      "step": 29
    },
    {
      "epoch": 0.07672634271099744,
      "grad_norm": 4.234204618475807,
      "learning_rate": 9.963730380356599e-06,
      "loss": 0.2954,
      "step": 30
    },
    {
      "epoch": 0.0792838874680307,
      "grad_norm": 4.462805286038281,
      "learning_rate": 9.96127528410502e-06,
      "loss": 0.3379,
      "step": 31
    },
    {
      "epoch": 0.08184143222506395,
      "grad_norm": 3.9662958792274665,
      "learning_rate": 9.958740116166113e-06,
      "loss": 0.2945,
      "step": 32
    },
    {
      "epoch": 0.08439897698209718,
      "grad_norm": 3.571374464706402,
      "learning_rate": 9.9561249174558e-06,
      "loss": 0.261,
      "step": 33
    },
    {
      "epoch": 0.08695652173913043,
      "grad_norm": 3.981988792102028,
      "learning_rate": 9.953429730181653e-06,
      "loss": 0.3081,
      "step": 34
    },
    {
      "epoch": 0.08951406649616368,
      "grad_norm": 4.03802779192331,
      "learning_rate": 9.950654597842209e-06,
      "loss": 0.285,
      "step": 35
    },
    {
      "epoch": 0.09207161125319693,
      "grad_norm": 3.5198570607375625,
      "learning_rate": 9.947799565226253e-06,
      "loss": 0.2842,
      "step": 36
    },
    {
      "epoch": 0.09462915601023018,
      "grad_norm": 4.510788445189442,
      "learning_rate": 9.944864678412118e-06,
      "loss": 0.2977,
      "step": 37
    },
    {
      "epoch": 0.09718670076726342,
      "grad_norm": 3.459042036170383,
      "learning_rate": 9.94184998476693e-06,
      "loss": 0.2482,
      "step": 38
    },
    {
      "epoch": 0.09974424552429667,
      "grad_norm": 3.6104760454517026,
      "learning_rate": 9.938755532945838e-06,
      "loss": 0.2911,
      "step": 39
    },
    {
      "epoch": 0.10230179028132992,
      "grad_norm": 3.2264924647067885,
      "learning_rate": 9.93558137289124e-06,
      "loss": 0.2823,
      "step": 40
    },
    {
      "epoch": 0.10485933503836317,
      "grad_norm": 3.050087024580229,
      "learning_rate": 9.932327555831972e-06,
      "loss": 0.1963,
      "step": 41
    },
    {
      "epoch": 0.10741687979539642,
      "grad_norm": 3.0298519282035836,
      "learning_rate": 9.928994134282477e-06,
      "loss": 0.2329,
      "step": 42
    },
    {
      "epoch": 0.10997442455242967,
      "grad_norm": 3.489339177075647,
      "learning_rate": 9.925581162041967e-06,
      "loss": 0.2361,
      "step": 43
    },
    {
      "epoch": 0.11253196930946291,
      "grad_norm": 3.173939549803925,
      "learning_rate": 9.922088694193546e-06,
      "loss": 0.2317,
      "step": 44
    },
    {
      "epoch": 0.11508951406649616,
      "grad_norm": 2.7796494028422525,
      "learning_rate": 9.918516787103322e-06,
      "loss": 0.2345,
      "step": 45
    },
    {
      "epoch": 0.11764705882352941,
      "grad_norm": 3.528817318108321,
      "learning_rate": 9.91486549841951e-06,
      "loss": 0.2543,
      "step": 46
    },
    {
      "epoch": 0.12020460358056266,
      "grad_norm": 3.535704208599901,
      "learning_rate": 9.911134887071477e-06,
      "loss": 0.2612,
      "step": 47
    },
    {
      "epoch": 0.12276214833759591,
      "grad_norm": 3.4658842882423158,
      "learning_rate": 9.907325013268816e-06,
      "loss": 0.2748,
      "step": 48
    },
    {
      "epoch": 0.12531969309462915,
      "grad_norm": 2.630457541489405,
      "learning_rate": 9.903435938500356e-06,
      "loss": 0.1958,
      "step": 49
    },
    {
      "epoch": 0.1278772378516624,
      "grad_norm": 2.915891246747101,
      "learning_rate": 9.899467725533181e-06,
      "loss": 0.2338,
      "step": 50
    },
    {
      "epoch": 0.13043478260869565,
      "grad_norm": 3.40288133819665,
      "learning_rate": 9.895420438411616e-06,
      "loss": 0.2501,
      "step": 51
    },
    {
      "epoch": 0.1329923273657289,
      "grad_norm": 3.2304039819864148,
      "learning_rate": 9.89129414245618e-06,
      "loss": 0.222,
      "step": 52
    },
    {
      "epoch": 0.13554987212276215,
      "grad_norm": 2.911477652143155,
      "learning_rate": 9.887088904262557e-06,
      "loss": 0.2285,
      "step": 53
    },
    {
      "epoch": 0.13810741687979539,
      "grad_norm": 3.042507664568314,
      "learning_rate": 9.882804791700488e-06,
      "loss": 0.2582,
      "step": 54
    },
    {
      "epoch": 0.14066496163682865,
      "grad_norm": 2.980463931027759,
      "learning_rate": 9.878441873912712e-06,
      "loss": 0.2438,
      "step": 55
    },
    {
      "epoch": 0.1432225063938619,
      "grad_norm": 3.8619552465100853,
      "learning_rate": 9.87400022131382e-06,
      "loss": 0.2897,
      "step": 56
    },
    {
      "epoch": 0.14578005115089515,
      "grad_norm": 2.69396158196004,
      "learning_rate": 9.869479905589136e-06,
      "loss": 0.2023,
      "step": 57
    },
    {
      "epoch": 0.1483375959079284,
      "grad_norm": 2.935987135845637,
      "learning_rate": 9.864880999693551e-06,
      "loss": 0.1895,
      "step": 58
    },
    {
      "epoch": 0.15089514066496162,
      "grad_norm": 3.2452196053990705,
      "learning_rate": 9.860203577850353e-06,
      "loss": 0.2361,
      "step": 59
    },
    {
      "epoch": 0.1534526854219949,
      "grad_norm": 2.87111398999677,
      "learning_rate": 9.855447715550024e-06,
      "loss": 0.2274,
      "step": 60
    },
    {
      "epoch": 0.15601023017902813,
      "grad_norm": 2.4291351635075635,
      "learning_rate": 9.850613489549018e-06,
      "loss": 0.1846,
      "step": 61
    },
    {
      "epoch": 0.1585677749360614,
      "grad_norm": 3.488870929695702,
      "learning_rate": 9.845700977868536e-06,
      "loss": 0.2709,
      "step": 62
    },
    {
      "epoch": 0.16112531969309463,
      "grad_norm": 2.85759392571804,
      "learning_rate": 9.840710259793251e-06,
      "loss": 0.199,
      "step": 63
    },
    {
      "epoch": 0.1636828644501279,
      "grad_norm": 3.7624802921070772,
      "learning_rate": 9.835641415870038e-06,
      "loss": 0.2974,
      "step": 64
    },
    {
      "epoch": 0.16624040920716113,
      "grad_norm": 3.1043894363694924,
      "learning_rate": 9.830494527906671e-06,
      "loss": 0.1943,
      "step": 65
    },
    {
      "epoch": 0.16879795396419436,
      "grad_norm": 3.0241718749527813,
      "learning_rate": 9.825269678970502e-06,
      "loss": 0.2257,
      "step": 66
    },
    {
      "epoch": 0.17135549872122763,
      "grad_norm": 3.043220225169984,
      "learning_rate": 9.819966953387122e-06,
      "loss": 0.2404,
      "step": 67
    },
    {
      "epoch": 0.17391304347826086,
      "grad_norm": 3.326803312632149,
      "learning_rate": 9.814586436738998e-06,
      "loss": 0.2373,
      "step": 68
    },
    {
      "epoch": 0.17647058823529413,
      "grad_norm": 3.3830308282373807,
      "learning_rate": 9.809128215864096e-06,
      "loss": 0.2878,
      "step": 69
    },
    {
      "epoch": 0.17902813299232737,
      "grad_norm": 2.6361004206605534,
      "learning_rate": 9.803592378854476e-06,
      "loss": 0.2244,
      "step": 70
    },
    {
      "epoch": 0.1815856777493606,
      "grad_norm": 2.813271069940707,
      "learning_rate": 9.797979015054868e-06,
      "loss": 0.2411,
      "step": 71
    },
    {
      "epoch": 0.18414322250639387,
      "grad_norm": 3.135685284217803,
      "learning_rate": 9.792288215061237e-06,
      "loss": 0.2383,
      "step": 72
    },
    {
      "epoch": 0.1867007672634271,
      "grad_norm": 3.6498597735648017,
      "learning_rate": 9.786520070719313e-06,
      "loss": 0.2021,
      "step": 73
    },
    {
      "epoch": 0.18925831202046037,
      "grad_norm": 2.96750507093361,
      "learning_rate": 9.780674675123113e-06,
      "loss": 0.2093,
      "step": 74
    },
    {
      "epoch": 0.1918158567774936,
      "grad_norm": 3.2483917783877003,
      "learning_rate": 9.77475212261344e-06,
      "loss": 0.2647,
      "step": 75
    },
    {
      "epoch": 0.19437340153452684,
      "grad_norm": 3.2378425660345065,
      "learning_rate": 9.768752508776358e-06,
      "loss": 0.1871,
      "step": 76
    },
    {
      "epoch": 0.1969309462915601,
      "grad_norm": 2.755630840313636,
      "learning_rate": 9.762675930441647e-06,
      "loss": 0.2015,
      "step": 77
    },
    {
      "epoch": 0.19948849104859334,
      "grad_norm": 2.8181426454401493,
      "learning_rate": 9.756522485681247e-06,
      "loss": 0.2403,
      "step": 78
    },
    {
      "epoch": 0.2020460358056266,
      "grad_norm": 3.632918000696892,
      "learning_rate": 9.750292273807666e-06,
      "loss": 0.2978,
      "step": 79
    },
    {
      "epoch": 0.20460358056265984,
      "grad_norm": 3.893813012751235,
      "learning_rate": 9.743985395372387e-06,
      "loss": 0.2615,
      "step": 80
    },
    {
      "epoch": 0.2071611253196931,
      "grad_norm": 3.7796636445556815,
      "learning_rate": 9.737601952164238e-06,
      "loss": 0.2785,
      "step": 81
    },
    {
      "epoch": 0.20971867007672634,
      "grad_norm": 3.1590261591104243,
      "learning_rate": 9.73114204720775e-06,
      "loss": 0.2465,
      "step": 82
    },
    {
      "epoch": 0.21227621483375958,
      "grad_norm": 2.593757039460282,
      "learning_rate": 9.724605784761501e-06,
      "loss": 0.2303,
      "step": 83
    },
    {
      "epoch": 0.21483375959079284,
      "grad_norm": 2.9633378595540463,
      "learning_rate": 9.717993270316421e-06,
      "loss": 0.1924,
      "step": 84
    },
    {
      "epoch": 0.21739130434782608,
      "grad_norm": 2.834487770952517,
      "learning_rate": 9.711304610594104e-06,
      "loss": 0.2601,
      "step": 85
    },
    {
      "epoch": 0.21994884910485935,
      "grad_norm": 2.393543544620746,
      "learning_rate": 9.704539913545073e-06,
      "loss": 0.2051,
      "step": 86
    },
    {
      "epoch": 0.22250639386189258,
      "grad_norm": 2.900171504184875,
      "learning_rate": 9.697699288347043e-06,
      "loss": 0.1965,
      "step": 87
    },
    {
      "epoch": 0.22506393861892582,
      "grad_norm": 2.384624513559865,
      "learning_rate": 9.690782845403164e-06,
      "loss": 0.1861,
      "step": 88
    },
    {
      "epoch": 0.22762148337595908,
      "grad_norm": 2.8687734099612183,
      "learning_rate": 9.683790696340229e-06,
      "loss": 0.2277,
      "step": 89
    },
    {
      "epoch": 0.23017902813299232,
      "grad_norm": 3.1994663958961325,
      "learning_rate": 9.676722954006878e-06,
      "loss": 0.2639,
      "step": 90
    },
    {
      "epoch": 0.23273657289002558,
      "grad_norm": 3.5946497721000803,
      "learning_rate": 9.669579732471779e-06,
      "loss": 0.3063,
      "step": 91
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 3.2126140147347058,
      "learning_rate": 9.66236114702178e-06,
      "loss": 0.2311,
      "step": 92
    },
    {
      "epoch": 0.23785166240409208,
      "grad_norm": 2.898350923062013,
      "learning_rate": 9.655067314160058e-06,
      "loss": 0.2296,
      "step": 93
    },
    {
      "epoch": 0.24040920716112532,
      "grad_norm": 3.377946545735314,
      "learning_rate": 9.647698351604227e-06,
      "loss": 0.3008,
      "step": 94
    },
    {
      "epoch": 0.24296675191815856,
      "grad_norm": 3.299627018131192,
      "learning_rate": 9.640254378284447e-06,
      "loss": 0.2697,
      "step": 95
    },
    {
      "epoch": 0.24552429667519182,
      "grad_norm": 3.11855005777831,
      "learning_rate": 9.632735514341508e-06,
      "loss": 0.267,
      "step": 96
    },
    {
      "epoch": 0.24808184143222506,
      "grad_norm": 3.6830270996696823,
      "learning_rate": 9.625141881124874e-06,
      "loss": 0.3026,
      "step": 97
    },
    {
      "epoch": 0.2506393861892583,
      "grad_norm": 3.908203185558239,
      "learning_rate": 9.617473601190743e-06,
      "loss": 0.206,
      "step": 98
    },
    {
      "epoch": 0.2531969309462916,
      "grad_norm": 2.743300056729194,
      "learning_rate": 9.609730798300056e-06,
      "loss": 0.1799,
      "step": 99
    },
    {
      "epoch": 0.2557544757033248,
      "grad_norm": 4.814431714040076,
      "learning_rate": 9.601913597416513e-06,
      "loss": 0.211,
      "step": 100
    },
    {
      "epoch": 0.25831202046035806,
      "grad_norm": 4.268877383945361,
      "learning_rate": 9.594022124704541e-06,
      "loss": 0.345,
      "step": 101
    },
    {
      "epoch": 0.2608695652173913,
      "grad_norm": 3.383521599612797,
      "learning_rate": 9.586056507527266e-06,
      "loss": 0.2137,
      "step": 102
    },
    {
      "epoch": 0.26342710997442453,
      "grad_norm": 3.4109070658867338,
      "learning_rate": 9.578016874444459e-06,
      "loss": 0.2096,
      "step": 103
    },
    {
      "epoch": 0.2659846547314578,
      "grad_norm": 4.44984138610315,
      "learning_rate": 9.569903355210457e-06,
      "loss": 0.181,
      "step": 104
    },
    {
      "epoch": 0.26854219948849106,
      "grad_norm": 3.14837415888387,
      "learning_rate": 9.561716080772072e-06,
      "loss": 0.2379,
      "step": 105
    },
    {
      "epoch": 0.2710997442455243,
      "grad_norm": 3.538274906585293,
      "learning_rate": 9.55345518326647e-06,
      "loss": 0.2866,
      "step": 106
    },
    {
      "epoch": 0.27365728900255754,
      "grad_norm": 4.12006215133929,
      "learning_rate": 9.545120796019056e-06,
      "loss": 0.2089,
      "step": 107
    },
    {
      "epoch": 0.27621483375959077,
      "grad_norm": 3.7367349991712984,
      "learning_rate": 9.5367130535413e-06,
      "loss": 0.2824,
      "step": 108
    },
    {
      "epoch": 0.27877237851662406,
      "grad_norm": 24.188026787197614,
      "learning_rate": 9.528232091528578e-06,
      "loss": 0.2343,
      "step": 109
    },
    {
      "epoch": 0.2813299232736573,
      "grad_norm": 5.518174949495067,
      "learning_rate": 9.519678046857987e-06,
      "loss": 0.2359,
      "step": 110
    },
    {
      "epoch": 0.28388746803069054,
      "grad_norm": 3.1735804164552444,
      "learning_rate": 9.511051057586125e-06,
      "loss": 0.2662,
      "step": 111
    },
    {
      "epoch": 0.2864450127877238,
      "grad_norm": 2.6308622708865803,
      "learning_rate": 9.502351262946865e-06,
      "loss": 0.2315,
      "step": 112
    },
    {
      "epoch": 0.289002557544757,
      "grad_norm": 3.545647642738695,
      "learning_rate": 9.493578803349117e-06,
      "loss": 0.1944,
      "step": 113
    },
    {
      "epoch": 0.2915601023017903,
      "grad_norm": 3.5521955511759944,
      "learning_rate": 9.48473382037455e-06,
      "loss": 0.2626,
      "step": 114
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 4.054758472454413,
      "learning_rate": 9.475816456775313e-06,
      "loss": 0.2714,
      "step": 115
    },
    {
      "epoch": 0.2966751918158568,
      "grad_norm": 8.230492979964824,
      "learning_rate": 9.466826856471728e-06,
      "loss": 0.2479,
      "step": 116
    },
    {
      "epoch": 0.29923273657289,
      "grad_norm": 3.6339322967957037,
      "learning_rate": 9.457765164549979e-06,
      "loss": 0.2345,
      "step": 117
    },
    {
      "epoch": 0.30179028132992325,
      "grad_norm": 297.39275307922753,
      "learning_rate": 9.448631527259749e-06,
      "loss": 0.3842,
      "step": 118
    },
    {
      "epoch": 0.30434782608695654,
      "grad_norm": 5.3234836238777,
      "learning_rate": 9.439426092011877e-06,
      "loss": 0.1926,
      "step": 119
    },
    {
      "epoch": 0.3069053708439898,
      "grad_norm": 46.01851594690418,
      "learning_rate": 9.430149007375974e-06,
      "loss": 0.2565,
      "step": 120
    },
    {
      "epoch": 0.309462915601023,
      "grad_norm": 5.302182848361836,
      "learning_rate": 9.42080042307802e-06,
      "loss": 0.2408,
      "step": 121
    },
    {
      "epoch": 0.31202046035805625,
      "grad_norm": 2.578340943631881,
      "learning_rate": 9.411380489997962e-06,
      "loss": 0.1934,
      "step": 122
    },
    {
      "epoch": 0.3145780051150895,
      "grad_norm": 3.9625100814656786,
      "learning_rate": 9.401889360167256e-06,
      "loss": 0.2653,
      "step": 123
    },
    {
      "epoch": 0.3171355498721228,
      "grad_norm": 2.767266370495417,
      "learning_rate": 9.392327186766434e-06,
      "loss": 0.2236,
      "step": 124
    },
    {
      "epoch": 0.319693094629156,
      "grad_norm": 2.366592695821496,
      "learning_rate": 9.382694124122624e-06,
      "loss": 0.2089,
      "step": 125
    },
    {
      "epoch": 0.32225063938618925,
      "grad_norm": 2.366756458486761,
      "learning_rate": 9.372990327707057e-06,
      "loss": 0.184,
      "step": 126
    },
    {
      "epoch": 0.3248081841432225,
      "grad_norm": 2.7874992191290815,
      "learning_rate": 9.36321595413256e-06,
      "loss": 0.2171,
      "step": 127
    },
    {
      "epoch": 0.3273657289002558,
      "grad_norm": 2.8285353939596978,
      "learning_rate": 9.353371161151032e-06,
      "loss": 0.2968,
      "step": 128
    },
    {
      "epoch": 0.329923273657289,
      "grad_norm": 2.614287807796315,
      "learning_rate": 9.34345610765089e-06,
      "loss": 0.1734,
      "step": 129
    },
    {
      "epoch": 0.33248081841432225,
      "grad_norm": 3.3261759665055326,
      "learning_rate": 9.333470953654513e-06,
      "loss": 0.2976,
      "step": 130
    },
    {
      "epoch": 0.3350383631713555,
      "grad_norm": 2.6928328036310862,
      "learning_rate": 9.32341586031565e-06,
      "loss": 0.2781,
      "step": 131
    },
    {
      "epoch": 0.3375959079283887,
      "grad_norm": 3.251358150140968,
      "learning_rate": 9.31329098991683e-06,
      "loss": 0.2767,
      "step": 132
    },
    {
      "epoch": 0.340153452685422,
      "grad_norm": 3.1871135167369418,
      "learning_rate": 9.303096505866734e-06,
      "loss": 0.2442,
      "step": 133
    },
    {
      "epoch": 0.34271099744245526,
      "grad_norm": 2.5681474052955444,
      "learning_rate": 9.292832572697566e-06,
      "loss": 0.2291,
      "step": 134
    },
    {
      "epoch": 0.3452685421994885,
      "grad_norm": 2.3640353419986506,
      "learning_rate": 9.282499356062385e-06,
      "loss": 0.2271,
      "step": 135
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 2.8099850692384134,
      "learning_rate": 9.272097022732444e-06,
      "loss": 0.2093,
      "step": 136
    },
    {
      "epoch": 0.35038363171355497,
      "grad_norm": 3.049978602373718,
      "learning_rate": 9.261625740594494e-06,
      "loss": 0.2372,
      "step": 137
    },
    {
      "epoch": 0.35294117647058826,
      "grad_norm": 3.137496790956829,
      "learning_rate": 9.251085678648072e-06,
      "loss": 0.2506,
      "step": 138
    },
    {
      "epoch": 0.3554987212276215,
      "grad_norm": 2.867267066152733,
      "learning_rate": 9.240477007002777e-06,
      "loss": 0.2147,
      "step": 139
    },
    {
      "epoch": 0.35805626598465473,
      "grad_norm": 3.3365434423871125,
      "learning_rate": 9.22979989687552e-06,
      "loss": 0.3041,
      "step": 140
    },
    {
      "epoch": 0.36061381074168797,
      "grad_norm": 2.9940933133119563,
      "learning_rate": 9.219054520587766e-06,
      "loss": 0.1847,
      "step": 141
    },
    {
      "epoch": 0.3631713554987212,
      "grad_norm": 2.4723991968002976,
      "learning_rate": 9.208241051562753e-06,
      "loss": 0.228,
      "step": 142
    },
    {
      "epoch": 0.3657289002557545,
      "grad_norm": 3.305464271619873,
      "learning_rate": 9.197359664322684e-06,
      "loss": 0.2376,
      "step": 143
    },
    {
      "epoch": 0.36828644501278773,
      "grad_norm": 2.8768687198709935,
      "learning_rate": 9.186410534485924e-06,
      "loss": 0.2564,
      "step": 144
    },
    {
      "epoch": 0.37084398976982097,
      "grad_norm": 2.478708936880423,
      "learning_rate": 9.175393838764153e-06,
      "loss": 0.1982,
      "step": 145
    },
    {
      "epoch": 0.3734015345268542,
      "grad_norm": 2.3943362655534215,
      "learning_rate": 9.164309754959523e-06,
      "loss": 0.2107,
      "step": 146
    },
    {
      "epoch": 0.37595907928388744,
      "grad_norm": 2.6390423103142604,
      "learning_rate": 9.153158461961782e-06,
      "loss": 0.2024,
      "step": 147
    },
    {
      "epoch": 0.37851662404092073,
      "grad_norm": 2.5371647794977874,
      "learning_rate": 9.14194013974539e-06,
      "loss": 0.1916,
      "step": 148
    },
    {
      "epoch": 0.38107416879795397,
      "grad_norm": 2.458057040535335,
      "learning_rate": 9.130654969366619e-06,
      "loss": 0.2228,
      "step": 149
    },
    {
      "epoch": 0.3836317135549872,
      "grad_norm": 2.5637807307196736,
      "learning_rate": 9.11930313296062e-06,
      "loss": 0.212,
      "step": 150
    },
    {
      "epoch": 0.38618925831202044,
      "grad_norm": 3.123041884855774,
      "learning_rate": 9.107884813738492e-06,
      "loss": 0.2739,
      "step": 151
    },
    {
      "epoch": 0.3887468030690537,
      "grad_norm": 2.4409380929261904,
      "learning_rate": 9.096400195984322e-06,
      "loss": 0.1773,
      "step": 152
    },
    {
      "epoch": 0.391304347826087,
      "grad_norm": 3.0511733344744982,
      "learning_rate": 9.08484946505221e-06,
      "loss": 0.2202,
      "step": 153
    },
    {
      "epoch": 0.3938618925831202,
      "grad_norm": 2.881737174801456,
      "learning_rate": 9.073232807363283e-06,
      "loss": 0.2198,
      "step": 154
    },
    {
      "epoch": 0.39641943734015345,
      "grad_norm": 3.5525444573087652,
      "learning_rate": 9.061550410402677e-06,
      "loss": 0.2939,
      "step": 155
    },
    {
      "epoch": 0.3989769820971867,
      "grad_norm": 3.3680781528749604,
      "learning_rate": 9.049802462716521e-06,
      "loss": 0.2434,
      "step": 156
    },
    {
      "epoch": 0.40153452685422,
      "grad_norm": 3.939196043495901,
      "learning_rate": 9.037989153908882e-06,
      "loss": 0.2628,
      "step": 157
    },
    {
      "epoch": 0.4040920716112532,
      "grad_norm": 2.5079359353983155,
      "learning_rate": 9.026110674638722e-06,
      "loss": 0.2171,
      "step": 158
    },
    {
      "epoch": 0.40664961636828645,
      "grad_norm": 2.593975182068212,
      "learning_rate": 9.0141672166168e-06,
      "loss": 0.1617,
      "step": 159
    },
    {
      "epoch": 0.4092071611253197,
      "grad_norm": 3.0574104006787604,
      "learning_rate": 9.002158972602599e-06,
      "loss": 0.3072,
      "step": 160
    },
    {
      "epoch": 0.4117647058823529,
      "grad_norm": 2.948058496064816,
      "learning_rate": 8.990086136401199e-06,
      "loss": 0.2385,
      "step": 161
    },
    {
      "epoch": 0.4143222506393862,
      "grad_norm": 3.702960642790453,
      "learning_rate": 8.977948902860154e-06,
      "loss": 0.2847,
      "step": 162
    },
    {
      "epoch": 0.41687979539641945,
      "grad_norm": 2.669781467285976,
      "learning_rate": 8.965747467866355e-06,
      "loss": 0.1822,
      "step": 163
    },
    {
      "epoch": 0.4194373401534527,
      "grad_norm": 2.6806911897560295,
      "learning_rate": 8.953482028342853e-06,
      "loss": 0.2796,
      "step": 164
    },
    {
      "epoch": 0.4219948849104859,
      "grad_norm": 2.5442421084289255,
      "learning_rate": 8.9411527822457e-06,
      "loss": 0.1958,
      "step": 165
    },
    {
      "epoch": 0.42455242966751916,
      "grad_norm": 2.9096327300999345,
      "learning_rate": 8.92875992856073e-06,
      "loss": 0.2634,
      "step": 166
    },
    {
      "epoch": 0.42710997442455245,
      "grad_norm": 2.285111815505967,
      "learning_rate": 8.916303667300373e-06,
      "loss": 0.1844,
      "step": 167
    },
    {
      "epoch": 0.4296675191815857,
      "grad_norm": 3.143641298577265,
      "learning_rate": 8.903784199500412e-06,
      "loss": 0.22,
      "step": 168
    },
    {
      "epoch": 0.4322250639386189,
      "grad_norm": 3.1734443032455126,
      "learning_rate": 8.89120172721674e-06,
      "loss": 0.2289,
      "step": 169
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 3.236825673549631,
      "learning_rate": 8.8785564535221e-06,
      "loss": 0.2206,
      "step": 170
    },
    {
      "epoch": 0.4373401534526854,
      "grad_norm": 2.993422962262724,
      "learning_rate": 8.86584858250281e-06,
      "loss": 0.2355,
      "step": 171
    },
    {
      "epoch": 0.4398976982097187,
      "grad_norm": 2.8378682738193826,
      "learning_rate": 8.853078319255466e-06,
      "loss": 0.2477,
      "step": 172
    },
    {
      "epoch": 0.4424552429667519,
      "grad_norm": 2.6712684943657403,
      "learning_rate": 8.840245869883635e-06,
      "loss": 0.2457,
      "step": 173
    },
    {
      "epoch": 0.44501278772378516,
      "grad_norm": 2.171829542476822,
      "learning_rate": 8.827351441494525e-06,
      "loss": 0.2035,
      "step": 174
    },
    {
      "epoch": 0.4475703324808184,
      "grad_norm": 3.227921740305447,
      "learning_rate": 8.814395242195642e-06,
      "loss": 0.2979,
      "step": 175
    },
    {
      "epoch": 0.45012787723785164,
      "grad_norm": 2.7461113116814926,
      "learning_rate": 8.80137748109144e-06,
      "loss": 0.2195,
      "step": 176
    },
    {
      "epoch": 0.45268542199488493,
      "grad_norm": 1.9489364216184313,
      "learning_rate": 8.78829836827993e-06,
      "loss": 0.1745,
      "step": 177
    },
    {
      "epoch": 0.45524296675191817,
      "grad_norm": 2.390111931670749,
      "learning_rate": 8.77515811484931e-06,
      "loss": 0.2287,
      "step": 178
    },
    {
      "epoch": 0.4578005115089514,
      "grad_norm": 2.32280460729142,
      "learning_rate": 8.761956932874539e-06,
      "loss": 0.1771,
      "step": 179
    },
    {
      "epoch": 0.46035805626598464,
      "grad_norm": 2.7594731738765237,
      "learning_rate": 8.748695035413925e-06,
      "loss": 0.2272,
      "step": 180
    },
    {
      "epoch": 0.4629156010230179,
      "grad_norm": 2.2666034081466737,
      "learning_rate": 8.735372636505681e-06,
      "loss": 0.2167,
      "step": 181
    },
    {
      "epoch": 0.46547314578005117,
      "grad_norm": 2.427582658847167,
      "learning_rate": 8.72198995116448e-06,
      "loss": 0.24,
      "step": 182
    },
    {
      "epoch": 0.4680306905370844,
      "grad_norm": 2.5568780604163326,
      "learning_rate": 8.708547195377968e-06,
      "loss": 0.3001,
      "step": 183
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 2.8819289669178163,
      "learning_rate": 8.695044586103297e-06,
      "loss": 0.2397,
      "step": 184
    },
    {
      "epoch": 0.4731457800511509,
      "grad_norm": 3.867315677781016,
      "learning_rate": 8.68148234126361e-06,
      "loss": 0.2899,
      "step": 185
    },
    {
      "epoch": 0.47570332480818417,
      "grad_norm": 2.804180003869186,
      "learning_rate": 8.667860679744529e-06,
      "loss": 0.2094,
      "step": 186
    },
    {
      "epoch": 0.4782608695652174,
      "grad_norm": 2.7034209359291315,
      "learning_rate": 8.65417982139062e-06,
      "loss": 0.2324,
      "step": 187
    },
    {
      "epoch": 0.48081841432225064,
      "grad_norm": 2.2812365119673874,
      "learning_rate": 8.640439987001855e-06,
      "loss": 0.2029,
      "step": 188
    },
    {
      "epoch": 0.4833759590792839,
      "grad_norm": 2.6299858026479517,
      "learning_rate": 8.626641398330027e-06,
      "loss": 0.2137,
      "step": 189
    },
    {
      "epoch": 0.4859335038363171,
      "grad_norm": 2.1221936568568465,
      "learning_rate": 8.612784278075195e-06,
      "loss": 0.2144,
      "step": 190
    },
    {
      "epoch": 0.4884910485933504,
      "grad_norm": 2.6431227938147783,
      "learning_rate": 8.598868849882074e-06,
      "loss": 0.2368,
      "step": 191
    },
    {
      "epoch": 0.49104859335038364,
      "grad_norm": 2.4185335606377945,
      "learning_rate": 8.58489533833643e-06,
      "loss": 0.2053,
      "step": 192
    },
    {
      "epoch": 0.4936061381074169,
      "grad_norm": 2.205243127504733,
      "learning_rate": 8.570863968961456e-06,
      "loss": 0.1629,
      "step": 193
    },
    {
      "epoch": 0.4961636828644501,
      "grad_norm": 1.9725752515599342,
      "learning_rate": 8.556774968214134e-06,
      "loss": 0.2101,
      "step": 194
    },
    {
      "epoch": 0.49872122762148335,
      "grad_norm": 2.5299942102683035,
      "learning_rate": 8.542628563481577e-06,
      "loss": 0.2333,
      "step": 195
    },
    {
      "epoch": 0.5012787723785166,
      "grad_norm": 2.41472732044959,
      "learning_rate": 8.52842498307736e-06,
      "loss": 0.2526,
      "step": 196
    },
    {
      "epoch": 0.5038363171355499,
      "grad_norm": 2.8335636174145593,
      "learning_rate": 8.514164456237835e-06,
      "loss": 0.25,
      "step": 197
    },
    {
      "epoch": 0.5063938618925832,
      "grad_norm": 2.793873822554608,
      "learning_rate": 8.499847213118431e-06,
      "loss": 0.2622,
      "step": 198
    },
    {
      "epoch": 0.5089514066496164,
      "grad_norm": 2.7012530682897378,
      "learning_rate": 8.485473484789944e-06,
      "loss": 0.2666,
      "step": 199
    },
    {
      "epoch": 0.5115089514066496,
      "grad_norm": 2.2375978558847387,
      "learning_rate": 8.471043503234796e-06,
      "loss": 0.2344,
      "step": 200
    },
    {
      "epoch": 0.5140664961636828,
      "grad_norm": 3.1799564884853146,
      "learning_rate": 8.45655750134331e-06,
      "loss": 0.2335,
      "step": 201
    },
    {
      "epoch": 0.5166240409207161,
      "grad_norm": 2.0962041202170654,
      "learning_rate": 8.442015712909926e-06,
      "loss": 0.1948,
      "step": 202
    },
    {
      "epoch": 0.5191815856777494,
      "grad_norm": 2.372975255887274,
      "learning_rate": 8.427418372629456e-06,
      "loss": 0.2071,
      "step": 203
    },
    {
      "epoch": 0.5217391304347826,
      "grad_norm": 2.555304358266548,
      "learning_rate": 8.412765716093273e-06,
      "loss": 0.2152,
      "step": 204
    },
    {
      "epoch": 0.5242966751918159,
      "grad_norm": 1.9638554898256244,
      "learning_rate": 8.398057979785515e-06,
      "loss": 0.1731,
      "step": 205
    },
    {
      "epoch": 0.5268542199488491,
      "grad_norm": 2.722593380051442,
      "learning_rate": 8.383295401079284e-06,
      "loss": 0.2152,
      "step": 206
    },
    {
      "epoch": 0.5294117647058824,
      "grad_norm": 2.718713160656891,
      "learning_rate": 8.368478218232787e-06,
      "loss": 0.2458,
      "step": 207
    },
    {
      "epoch": 0.5319693094629157,
      "grad_norm": 2.205180512995315,
      "learning_rate": 8.353606670385514e-06,
      "loss": 0.1922,
      "step": 208
    },
    {
      "epoch": 0.5345268542199488,
      "grad_norm": 2.099261923690752,
      "learning_rate": 8.338680997554372e-06,
      "loss": 0.1998,
      "step": 209
    },
    {
      "epoch": 0.5370843989769821,
      "grad_norm": 2.132007011749811,
      "learning_rate": 8.3237014406298e-06,
      "loss": 0.2043,
      "step": 210
    },
    {
      "epoch": 0.5396419437340153,
      "grad_norm": 2.290560761638665,
      "learning_rate": 8.308668241371897e-06,
      "loss": 0.1652,
      "step": 211
    },
    {
      "epoch": 0.5421994884910486,
      "grad_norm": 2.12644563499644,
      "learning_rate": 8.293581642406517e-06,
      "loss": 0.1843,
      "step": 212
    },
    {
      "epoch": 0.5447570332480819,
      "grad_norm": 3.723287563479075,
      "learning_rate": 8.278441887221338e-06,
      "loss": 0.3019,
      "step": 213
    },
    {
      "epoch": 0.5473145780051151,
      "grad_norm": 2.2912570816048126,
      "learning_rate": 8.263249220161957e-06,
      "loss": 0.1796,
      "step": 214
    },
    {
      "epoch": 0.5498721227621484,
      "grad_norm": 2.260603019533916,
      "learning_rate": 8.248003886427927e-06,
      "loss": 0.1961,
      "step": 215
    },
    {
      "epoch": 0.5524296675191815,
      "grad_norm": 2.257255113410867,
      "learning_rate": 8.232706132068806e-06,
      "loss": 0.1275,
      "step": 216
    },
    {
      "epoch": 0.5549872122762148,
      "grad_norm": 3.0673877778681113,
      "learning_rate": 8.217356203980187e-06,
      "loss": 0.1877,
      "step": 217
    },
    {
      "epoch": 0.5575447570332481,
      "grad_norm": 2.1489605272466634,
      "learning_rate": 8.201954349899712e-06,
      "loss": 0.2175,
      "step": 218
    },
    {
      "epoch": 0.5601023017902813,
      "grad_norm": 2.1580989041266676,
      "learning_rate": 8.186500818403076e-06,
      "loss": 0.1388,
      "step": 219
    },
    {
      "epoch": 0.5626598465473146,
      "grad_norm": 2.943597219795099,
      "learning_rate": 8.17099585890001e-06,
      "loss": 0.2643,
      "step": 220
    },
    {
      "epoch": 0.5652173913043478,
      "grad_norm": 1.9908772203041976,
      "learning_rate": 8.155439721630265e-06,
      "loss": 0.1636,
      "step": 221
    },
    {
      "epoch": 0.5677749360613811,
      "grad_norm": 2.4027039208022916,
      "learning_rate": 8.139832657659557e-06,
      "loss": 0.2044,
      "step": 222
    },
    {
      "epoch": 0.5703324808184144,
      "grad_norm": 2.651295860517237,
      "learning_rate": 8.124174918875532e-06,
      "loss": 0.2978,
      "step": 223
    },
    {
      "epoch": 0.5728900255754475,
      "grad_norm": 2.7727762126746756,
      "learning_rate": 8.108466757983695e-06,
      "loss": 0.2093,
      "step": 224
    },
    {
      "epoch": 0.5754475703324808,
      "grad_norm": 2.2636479281802044,
      "learning_rate": 8.092708428503324e-06,
      "loss": 0.1748,
      "step": 225
    },
    {
      "epoch": 0.578005115089514,
      "grad_norm": 3.511479308858075,
      "learning_rate": 8.076900184763394e-06,
      "loss": 0.2064,
      "step": 226
    },
    {
      "epoch": 0.5805626598465473,
      "grad_norm": 2.3586737252559793,
      "learning_rate": 8.061042281898453e-06,
      "loss": 0.2045,
      "step": 227
    },
    {
      "epoch": 0.5831202046035806,
      "grad_norm": 2.62506739236279,
      "learning_rate": 8.04513497584452e-06,
      "loss": 0.2069,
      "step": 228
    },
    {
      "epoch": 0.5856777493606138,
      "grad_norm": 2.804637992821702,
      "learning_rate": 8.02917852333495e-06,
      "loss": 0.2787,
      "step": 229
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 2.5840154931492387,
      "learning_rate": 8.013173181896283e-06,
      "loss": 0.2566,
      "step": 230
    },
    {
      "epoch": 0.5907928388746803,
      "grad_norm": 2.384157388224994,
      "learning_rate": 7.9971192098441e-06,
      "loss": 0.1297,
      "step": 231
    },
    {
      "epoch": 0.5933503836317136,
      "grad_norm": 3.616384836402104,
      "learning_rate": 7.981016866278843e-06,
      "loss": 0.2089,
      "step": 232
    },
    {
      "epoch": 0.5959079283887468,
      "grad_norm": 2.557601987276289,
      "learning_rate": 7.964866411081645e-06,
      "loss": 0.2178,
      "step": 233
    },
    {
      "epoch": 0.59846547314578,
      "grad_norm": 2.3704655232441154,
      "learning_rate": 7.94866810491012e-06,
      "loss": 0.2102,
      "step": 234
    },
    {
      "epoch": 0.6010230179028133,
      "grad_norm": 2.5523092935514566,
      "learning_rate": 7.93242220919417e-06,
      "loss": 0.2189,
      "step": 235
    },
    {
      "epoch": 0.6035805626598465,
      "grad_norm": 2.1709054016859493,
      "learning_rate": 7.916128986131761e-06,
      "loss": 0.1908,
      "step": 236
    },
    {
      "epoch": 0.6061381074168798,
      "grad_norm": 2.068222190576718,
      "learning_rate": 7.899788698684687e-06,
      "loss": 0.1975,
      "step": 237
    },
    {
      "epoch": 0.6086956521739131,
      "grad_norm": 2.836960021200559,
      "learning_rate": 7.883401610574338e-06,
      "loss": 0.255,
      "step": 238
    },
    {
      "epoch": 0.6112531969309463,
      "grad_norm": 1.8883272553824537,
      "learning_rate": 7.866967986277423e-06,
      "loss": 0.1455,
      "step": 239
    },
    {
      "epoch": 0.6138107416879796,
      "grad_norm": 2.4755576248259192,
      "learning_rate": 7.850488091021726e-06,
      "loss": 0.2442,
      "step": 240
    },
    {
      "epoch": 0.6163682864450127,
      "grad_norm": 2.802725658457541,
      "learning_rate": 7.833962190781809e-06,
      "loss": 0.2283,
      "step": 241
    },
    {
      "epoch": 0.618925831202046,
      "grad_norm": 1.998218824904067,
      "learning_rate": 7.817390552274721e-06,
      "loss": 0.1389,
      "step": 242
    },
    {
      "epoch": 0.6214833759590793,
      "grad_norm": 2.6781876390883768,
      "learning_rate": 7.800773442955703e-06,
      "loss": 0.2238,
      "step": 243
    },
    {
      "epoch": 0.6240409207161125,
      "grad_norm": 2.564654638673255,
      "learning_rate": 7.784111131013858e-06,
      "loss": 0.2071,
      "step": 244
    },
    {
      "epoch": 0.6265984654731458,
      "grad_norm": 2.3286664497026903,
      "learning_rate": 7.767403885367832e-06,
      "loss": 0.1797,
      "step": 245
    },
    {
      "epoch": 0.629156010230179,
      "grad_norm": 2.940961657565362,
      "learning_rate": 7.750651975661471e-06,
      "loss": 0.2773,
      "step": 246
    },
    {
      "epoch": 0.6317135549872123,
      "grad_norm": 2.39717318483795,
      "learning_rate": 7.733855672259472e-06,
      "loss": 0.2242,
      "step": 247
    },
    {
      "epoch": 0.6342710997442456,
      "grad_norm": 2.018378796967046,
      "learning_rate": 7.717015246243012e-06,
      "loss": 0.1585,
      "step": 248
    },
    {
      "epoch": 0.6368286445012787,
      "grad_norm": 3.043878287662867,
      "learning_rate": 7.700130969405377e-06,
      "loss": 0.223,
      "step": 249
    },
    {
      "epoch": 0.639386189258312,
      "grad_norm": 2.8728908638133195,
      "learning_rate": 7.683203114247587e-06,
      "loss": 0.2143,
      "step": 250
    },
    {
      "epoch": 0.6419437340153452,
      "grad_norm": 2.6954559773855706,
      "learning_rate": 7.66623195397397e-06,
      "loss": 0.2254,
      "step": 251
    },
    {
      "epoch": 0.6445012787723785,
      "grad_norm": 2.7134657682436236,
      "learning_rate": 7.649217762487786e-06,
      "loss": 0.2367,
      "step": 252
    },
    {
      "epoch": 0.6470588235294118,
      "grad_norm": 2.3389717040321525,
      "learning_rate": 7.63216081438678e-06,
      "loss": 0.1965,
      "step": 253
    },
    {
      "epoch": 0.649616368286445,
      "grad_norm": 1.9874680601076997,
      "learning_rate": 7.615061384958764e-06,
      "loss": 0.2062,
      "step": 254
    },
    {
      "epoch": 0.6521739130434783,
      "grad_norm": 2.9050691223424474,
      "learning_rate": 7.597919750177168e-06,
      "loss": 0.226,
      "step": 255
    },
    {
      "epoch": 0.6547314578005116,
      "grad_norm": 2.304447009536863,
      "learning_rate": 7.580736186696593e-06,
      "loss": 0.1913,
      "step": 256
    },
    {
      "epoch": 0.6572890025575447,
      "grad_norm": 3.149415595365879,
      "learning_rate": 7.563510971848339e-06,
      "loss": 0.2187,
      "step": 257
    },
    {
      "epoch": 0.659846547314578,
      "grad_norm": 2.3690500597631843,
      "learning_rate": 7.546244383635929e-06,
      "loss": 0.25,
      "step": 258
    },
    {
      "epoch": 0.6624040920716112,
      "grad_norm": 2.430504175852079,
      "learning_rate": 7.528936700730627e-06,
      "loss": 0.2277,
      "step": 259
    },
    {
      "epoch": 0.6649616368286445,
      "grad_norm": 2.270735698643305,
      "learning_rate": 7.5115882024669375e-06,
      "loss": 0.1974,
      "step": 260
    },
    {
      "epoch": 0.6675191815856778,
      "grad_norm": 2.4619767379298296,
      "learning_rate": 7.494199168838099e-06,
      "loss": 0.2642,
      "step": 261
    },
    {
      "epoch": 0.670076726342711,
      "grad_norm": 2.7725591682853543,
      "learning_rate": 7.476769880491561e-06,
      "loss": 0.2511,
      "step": 262
    },
    {
      "epoch": 0.6726342710997443,
      "grad_norm": 2.334392166881636,
      "learning_rate": 7.459300618724462e-06,
      "loss": 0.1837,
      "step": 263
    },
    {
      "epoch": 0.6751918158567775,
      "grad_norm": 3.6092024144297405,
      "learning_rate": 7.44179166547908e-06,
      "loss": 0.2718,
      "step": 264
    },
    {
      "epoch": 0.6777493606138107,
      "grad_norm": 2.5798015356358266,
      "learning_rate": 7.42424330333829e-06,
      "loss": 0.2475,
      "step": 265
    },
    {
      "epoch": 0.680306905370844,
      "grad_norm": 2.33242353641567,
      "learning_rate": 7.406655815520998e-06,
      "loss": 0.2052,
      "step": 266
    },
    {
      "epoch": 0.6828644501278772,
      "grad_norm": 2.8947250508158984,
      "learning_rate": 7.389029485877577e-06,
      "loss": 0.23,
      "step": 267
    },
    {
      "epoch": 0.6854219948849105,
      "grad_norm": 2.0825711672647826,
      "learning_rate": 7.371364598885276e-06,
      "loss": 0.1965,
      "step": 268
    },
    {
      "epoch": 0.6879795396419437,
      "grad_norm": 1.9740063711309666,
      "learning_rate": 7.353661439643638e-06,
      "loss": 0.1678,
      "step": 269
    },
    {
      "epoch": 0.690537084398977,
      "grad_norm": 2.0678614348256974,
      "learning_rate": 7.335920293869891e-06,
      "loss": 0.1817,
      "step": 270
    },
    {
      "epoch": 0.6930946291560103,
      "grad_norm": 1.9715896584301256,
      "learning_rate": 7.318141447894344e-06,
      "loss": 0.1317,
      "step": 271
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 2.260924051051695,
      "learning_rate": 7.300325188655762e-06,
      "loss": 0.2277,
      "step": 272
    },
    {
      "epoch": 0.6982097186700768,
      "grad_norm": 2.6190647544478796,
      "learning_rate": 7.28247180369673e-06,
      "loss": 0.2308,
      "step": 273
    },
    {
      "epoch": 0.7007672634271099,
      "grad_norm": 3.044526709385895,
      "learning_rate": 7.264581581159024e-06,
      "loss": 0.2258,
      "step": 274
    },
    {
      "epoch": 0.7033248081841432,
      "grad_norm": 2.4284330910741954,
      "learning_rate": 7.246654809778951e-06,
      "loss": 0.2381,
      "step": 275
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 2.2299016723949716,
      "learning_rate": 7.2286917788826926e-06,
      "loss": 0.1775,
      "step": 276
    },
    {
      "epoch": 0.7084398976982097,
      "grad_norm": 1.9829661983833986,
      "learning_rate": 7.210692778381634e-06,
      "loss": 0.1973,
      "step": 277
    },
    {
      "epoch": 0.710997442455243,
      "grad_norm": 3.083532290054799,
      "learning_rate": 7.192658098767686e-06,
      "loss": 0.2442,
      "step": 278
    },
    {
      "epoch": 0.7135549872122762,
      "grad_norm": 2.138475355654292,
      "learning_rate": 7.174588031108598e-06,
      "loss": 0.192,
      "step": 279
    },
    {
      "epoch": 0.7161125319693095,
      "grad_norm": 2.5991226258353817,
      "learning_rate": 7.1564828670432595e-06,
      "loss": 0.2268,
      "step": 280
    },
    {
      "epoch": 0.7186700767263428,
      "grad_norm": 2.891848075907018,
      "learning_rate": 7.138342898776989e-06,
      "loss": 0.2115,
      "step": 281
    },
    {
      "epoch": 0.7212276214833759,
      "grad_norm": 2.726057165848505,
      "learning_rate": 7.120168419076825e-06,
      "loss": 0.229,
      "step": 282
    },
    {
      "epoch": 0.7237851662404092,
      "grad_norm": 2.534815026313142,
      "learning_rate": 7.101959721266798e-06,
      "loss": 0.2645,
      "step": 283
    },
    {
      "epoch": 0.7263427109974424,
      "grad_norm": 2.825513673241371,
      "learning_rate": 7.083717099223192e-06,
      "loss": 0.242,
      "step": 284
    },
    {
      "epoch": 0.7289002557544757,
      "grad_norm": 1.9732517778995824,
      "learning_rate": 7.0654408473698084e-06,
      "loss": 0.1711,
      "step": 285
    },
    {
      "epoch": 0.731457800511509,
      "grad_norm": 2.211705086269456,
      "learning_rate": 7.047131260673214e-06,
      "loss": 0.163,
      "step": 286
    },
    {
      "epoch": 0.7340153452685422,
      "grad_norm": 2.697861006533922,
      "learning_rate": 7.0287886346379755e-06,
      "loss": 0.2455,
      "step": 287
    },
    {
      "epoch": 0.7365728900255755,
      "grad_norm": 2.8082724007729123,
      "learning_rate": 7.010413265301888e-06,
      "loss": 0.2302,
      "step": 288
    },
    {
      "epoch": 0.7391304347826086,
      "grad_norm": 2.365367868240213,
      "learning_rate": 6.9920054492312086e-06,
      "loss": 0.2454,
      "step": 289
    },
    {
      "epoch": 0.7416879795396419,
      "grad_norm": 2.523482715187105,
      "learning_rate": 6.97356548351586e-06,
      "loss": 0.2117,
      "step": 290
    },
    {
      "epoch": 0.7442455242966752,
      "grad_norm": 2.432296205013838,
      "learning_rate": 6.9550936657646386e-06,
      "loss": 0.2008,
      "step": 291
    },
    {
      "epoch": 0.7468030690537084,
      "grad_norm": 2.5596157723702575,
      "learning_rate": 6.936590294100414e-06,
      "loss": 0.1696,
      "step": 292
    },
    {
      "epoch": 0.7493606138107417,
      "grad_norm": 2.5875832384332074,
      "learning_rate": 6.918055667155311e-06,
      "loss": 0.1861,
      "step": 293
    },
    {
      "epoch": 0.7519181585677749,
      "grad_norm": 2.3852937224082167,
      "learning_rate": 6.899490084065897e-06,
      "loss": 0.1736,
      "step": 294
    },
    {
      "epoch": 0.7544757033248082,
      "grad_norm": 2.7399772311401422,
      "learning_rate": 6.8808938444683505e-06,
      "loss": 0.1942,
      "step": 295
    },
    {
      "epoch": 0.7570332480818415,
      "grad_norm": 2.7081558169763524,
      "learning_rate": 6.862267248493624e-06,
      "loss": 0.1894,
      "step": 296
    },
    {
      "epoch": 0.7595907928388747,
      "grad_norm": 2.6345675146303975,
      "learning_rate": 6.843610596762606e-06,
      "loss": 0.2092,
      "step": 297
    },
    {
      "epoch": 0.7621483375959079,
      "grad_norm": 2.4002544312203176,
      "learning_rate": 6.824924190381257e-06,
      "loss": 0.1646,
      "step": 298
    },
    {
      "epoch": 0.7647058823529411,
      "grad_norm": 2.903014156552671,
      "learning_rate": 6.806208330935766e-06,
      "loss": 0.2274,
      "step": 299
    },
    {
      "epoch": 0.7672634271099744,
      "grad_norm": 2.3818958805093318,
      "learning_rate": 6.7874633204876705e-06,
      "loss": 0.1511,
      "step": 300
    },
    {
      "epoch": 0.7698209718670077,
      "grad_norm": 2.1217119638619923,
      "learning_rate": 6.768689461568987e-06,
      "loss": 0.1818,
      "step": 301
    },
    {
      "epoch": 0.7723785166240409,
      "grad_norm": 1.9802995962813212,
      "learning_rate": 6.7498870571773275e-06,
      "loss": 0.1871,
      "step": 302
    },
    {
      "epoch": 0.7749360613810742,
      "grad_norm": 2.130497009697558,
      "learning_rate": 6.731056410771008e-06,
      "loss": 0.1929,
      "step": 303
    },
    {
      "epoch": 0.7774936061381074,
      "grad_norm": 2.6390968570982007,
      "learning_rate": 6.712197826264154e-06,
      "loss": 0.209,
      "step": 304
    },
    {
      "epoch": 0.7800511508951407,
      "grad_norm": 2.5486828023341763,
      "learning_rate": 6.69331160802179e-06,
      "loss": 0.1995,
      "step": 305
    },
    {
      "epoch": 0.782608695652174,
      "grad_norm": 2.1146363749572554,
      "learning_rate": 6.674398060854931e-06,
      "loss": 0.1411,
      "step": 306
    },
    {
      "epoch": 0.7851662404092071,
      "grad_norm": 2.327617229567959,
      "learning_rate": 6.655457490015667e-06,
      "loss": 0.2067,
      "step": 307
    },
    {
      "epoch": 0.7877237851662404,
      "grad_norm": 1.9607440916181138,
      "learning_rate": 6.636490201192229e-06,
      "loss": 0.1866,
      "step": 308
    },
    {
      "epoch": 0.7902813299232737,
      "grad_norm": 1.7057345078559258,
      "learning_rate": 6.617496500504056e-06,
      "loss": 0.0958,
      "step": 309
    },
    {
      "epoch": 0.7928388746803069,
      "grad_norm": 2.3104568865848334,
      "learning_rate": 6.5984766944968636e-06,
      "loss": 0.1813,
      "step": 310
    },
    {
      "epoch": 0.7953964194373402,
      "grad_norm": 2.6850823821758647,
      "learning_rate": 6.579431090137681e-06,
      "loss": 0.1997,
      "step": 311
    },
    {
      "epoch": 0.7979539641943734,
      "grad_norm": 2.8109972215894006,
      "learning_rate": 6.560359994809916e-06,
      "loss": 0.3141,
      "step": 312
    },
    {
      "epoch": 0.8005115089514067,
      "grad_norm": 2.450236301729314,
      "learning_rate": 6.541263716308375e-06,
      "loss": 0.2208,
      "step": 313
    },
    {
      "epoch": 0.80306905370844,
      "grad_norm": 3.050425633867066,
      "learning_rate": 6.522142562834307e-06,
      "loss": 0.2554,
      "step": 314
    },
    {
      "epoch": 0.8056265984654731,
      "grad_norm": 2.0059660873591283,
      "learning_rate": 6.502996842990431e-06,
      "loss": 0.1905,
      "step": 315
    },
    {
      "epoch": 0.8081841432225064,
      "grad_norm": 2.6797087837722504,
      "learning_rate": 6.483826865775941e-06,
      "loss": 0.236,
      "step": 316
    },
    {
      "epoch": 0.8107416879795396,
      "grad_norm": 2.282759943678343,
      "learning_rate": 6.46463294058154e-06,
      "loss": 0.1792,
      "step": 317
    },
    {
      "epoch": 0.8132992327365729,
      "grad_norm": 2.46733554686103,
      "learning_rate": 6.445415377184427e-06,
      "loss": 0.2151,
      "step": 318
    },
    {
      "epoch": 0.8158567774936062,
      "grad_norm": 2.3671420545552286,
      "learning_rate": 6.426174485743309e-06,
      "loss": 0.1765,
      "step": 319
    },
    {
      "epoch": 0.8184143222506394,
      "grad_norm": 1.9629970579121538,
      "learning_rate": 6.4069105767933944e-06,
      "loss": 0.1911,
      "step": 320
    },
    {
      "epoch": 0.8209718670076727,
      "grad_norm": 3.6762465445985693,
      "learning_rate": 6.387623961241375e-06,
      "loss": 0.2756,
      "step": 321
    },
    {
      "epoch": 0.8235294117647058,
      "grad_norm": 2.532439314856185,
      "learning_rate": 6.368314950360416e-06,
      "loss": 0.1765,
      "step": 322
    },
    {
      "epoch": 0.8260869565217391,
      "grad_norm": 2.3922839492822003,
      "learning_rate": 6.348983855785122e-06,
      "loss": 0.1455,
      "step": 323
    },
    {
      "epoch": 0.8286445012787724,
      "grad_norm": 2.1577159803546437,
      "learning_rate": 6.3296309895065215e-06,
      "loss": 0.187,
      "step": 324
    },
    {
      "epoch": 0.8312020460358056,
      "grad_norm": 2.5897751675293565,
      "learning_rate": 6.310256663867019e-06,
      "loss": 0.1813,
      "step": 325
    },
    {
      "epoch": 0.8337595907928389,
      "grad_norm": 2.9751298040801397,
      "learning_rate": 6.290861191555359e-06,
      "loss": 0.1861,
      "step": 326
    },
    {
      "epoch": 0.8363171355498721,
      "grad_norm": 3.0389364710917413,
      "learning_rate": 6.271444885601583e-06,
      "loss": 0.2479,
      "step": 327
    },
    {
      "epoch": 0.8388746803069054,
      "grad_norm": 2.6853416543912845,
      "learning_rate": 6.252008059371968e-06,
      "loss": 0.2179,
      "step": 328
    },
    {
      "epoch": 0.8414322250639387,
      "grad_norm": 1.804371167405513,
      "learning_rate": 6.2325510265639785e-06,
      "loss": 0.1489,
      "step": 329
    },
    {
      "epoch": 0.8439897698209718,
      "grad_norm": 2.687996264497428,
      "learning_rate": 6.213074101201202e-06,
      "loss": 0.1877,
      "step": 330
    },
    {
      "epoch": 0.8465473145780051,
      "grad_norm": 2.2595021879894284,
      "learning_rate": 6.193577597628268e-06,
      "loss": 0.1835,
      "step": 331
    },
    {
      "epoch": 0.8491048593350383,
      "grad_norm": 1.9454030240892142,
      "learning_rate": 6.174061830505801e-06,
      "loss": 0.1744,
      "step": 332
    },
    {
      "epoch": 0.8516624040920716,
      "grad_norm": 2.7487602756044653,
      "learning_rate": 6.154527114805312e-06,
      "loss": 0.2756,
      "step": 333
    },
    {
      "epoch": 0.8542199488491049,
      "grad_norm": 2.1891761636099485,
      "learning_rate": 6.1349737658041385e-06,
      "loss": 0.2182,
      "step": 334
    },
    {
      "epoch": 0.8567774936061381,
      "grad_norm": 2.2860716470090625,
      "learning_rate": 6.115402099080345e-06,
      "loss": 0.1623,
      "step": 335
    },
    {
      "epoch": 0.8593350383631714,
      "grad_norm": 2.144126721212396,
      "learning_rate": 6.095812430507627e-06,
      "loss": 0.1309,
      "step": 336
    },
    {
      "epoch": 0.8618925831202046,
      "grad_norm": 2.92551680405092,
      "learning_rate": 6.076205076250227e-06,
      "loss": 0.2107,
      "step": 337
    },
    {
      "epoch": 0.8644501278772379,
      "grad_norm": 2.763974310218809,
      "learning_rate": 6.056580352757813e-06,
      "loss": 0.2358,
      "step": 338
    },
    {
      "epoch": 0.8670076726342711,
      "grad_norm": 2.2522928679519714,
      "learning_rate": 6.036938576760388e-06,
      "loss": 0.1657,
      "step": 339
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 1.8110497901458547,
      "learning_rate": 6.0172800652631706e-06,
      "loss": 0.1458,
      "step": 340
    },
    {
      "epoch": 0.8721227621483376,
      "grad_norm": 2.632061983015455,
      "learning_rate": 5.997605135541472e-06,
      "loss": 0.1803,
      "step": 341
    },
    {
      "epoch": 0.8746803069053708,
      "grad_norm": 2.518578410021301,
      "learning_rate": 5.977914105135594e-06,
      "loss": 0.2311,
      "step": 342
    },
    {
      "epoch": 0.8772378516624041,
      "grad_norm": 1.9196947260190371,
      "learning_rate": 5.9582072918456805e-06,
      "loss": 0.1388,
      "step": 343
    },
    {
      "epoch": 0.8797953964194374,
      "grad_norm": 2.3343184448027303,
      "learning_rate": 5.938485013726612e-06,
      "loss": 0.2101,
      "step": 344
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 2.107250405421706,
      "learning_rate": 5.918747589082853e-06,
      "loss": 0.1753,
      "step": 345
    },
    {
      "epoch": 0.8849104859335039,
      "grad_norm": 1.8161156075732543,
      "learning_rate": 5.898995336463326e-06,
      "loss": 0.1707,
      "step": 346
    },
    {
      "epoch": 0.887468030690537,
      "grad_norm": 2.0187878577288765,
      "learning_rate": 5.879228574656269e-06,
      "loss": 0.1289,
      "step": 347
    },
    {
      "epoch": 0.8900255754475703,
      "grad_norm": 3.2101942717767025,
      "learning_rate": 5.859447622684084e-06,
      "loss": 0.2727,
      "step": 348
    },
    {
      "epoch": 0.8925831202046036,
      "grad_norm": 2.1882518208143673,
      "learning_rate": 5.839652799798197e-06,
      "loss": 0.194,
      "step": 349
    },
    {
      "epoch": 0.8951406649616368,
      "grad_norm": 3.0670778313064044,
      "learning_rate": 5.819844425473899e-06,
      "loss": 0.2662,
      "step": 350
    },
    {
      "epoch": 0.8976982097186701,
      "grad_norm": 2.263174260591853,
      "learning_rate": 5.800022819405194e-06,
      "loss": 0.1937,
      "step": 351
    },
    {
      "epoch": 0.9002557544757033,
      "grad_norm": 2.436527033057964,
      "learning_rate": 5.780188301499636e-06,
      "loss": 0.2269,
      "step": 352
    },
    {
      "epoch": 0.9028132992327366,
      "grad_norm": 3.0292476861844166,
      "learning_rate": 5.760341191873167e-06,
      "loss": 0.2593,
      "step": 353
    },
    {
      "epoch": 0.9053708439897699,
      "grad_norm": 2.7791662962631607,
      "learning_rate": 5.740481810844952e-06,
      "loss": 0.2198,
      "step": 354
    },
    {
      "epoch": 0.907928388746803,
      "grad_norm": 2.1999927858559145,
      "learning_rate": 5.720610478932211e-06,
      "loss": 0.164,
      "step": 355
    },
    {
      "epoch": 0.9104859335038363,
      "grad_norm": 2.5794977388267166,
      "learning_rate": 5.700727516845038e-06,
      "loss": 0.1773,
      "step": 356
    },
    {
      "epoch": 0.9130434782608695,
      "grad_norm": 2.147804700886211,
      "learning_rate": 5.680833245481234e-06,
      "loss": 0.2114,
      "step": 357
    },
    {
      "epoch": 0.9156010230179028,
      "grad_norm": 2.606878063368861,
      "learning_rate": 5.660927985921122e-06,
      "loss": 0.2173,
      "step": 358
    },
    {
      "epoch": 0.9181585677749361,
      "grad_norm": 2.340221113756964,
      "learning_rate": 5.641012059422369e-06,
      "loss": 0.2223,
      "step": 359
    },
    {
      "epoch": 0.9207161125319693,
      "grad_norm": 2.4961312290411994,
      "learning_rate": 5.621085787414799e-06,
      "loss": 0.2255,
      "step": 360
    },
    {
      "epoch": 0.9232736572890026,
      "grad_norm": 2.5269182037727895,
      "learning_rate": 5.601149491495206e-06,
      "loss": 0.2135,
      "step": 361
    },
    {
      "epoch": 0.9258312020460358,
      "grad_norm": 2.0417402035129317,
      "learning_rate": 5.581203493422161e-06,
      "loss": 0.2078,
      "step": 362
    },
    {
      "epoch": 0.928388746803069,
      "grad_norm": 2.0727546621065227,
      "learning_rate": 5.561248115110822e-06,
      "loss": 0.1661,
      "step": 363
    },
    {
      "epoch": 0.9309462915601023,
      "grad_norm": 2.7807300008729348,
      "learning_rate": 5.541283678627742e-06,
      "loss": 0.1794,
      "step": 364
    },
    {
      "epoch": 0.9335038363171355,
      "grad_norm": 1.9508561433458118,
      "learning_rate": 5.521310506185661e-06,
      "loss": 0.1466,
      "step": 365
    },
    {
      "epoch": 0.9360613810741688,
      "grad_norm": 2.3405765286165603,
      "learning_rate": 5.501328920138314e-06,
      "loss": 0.2208,
      "step": 366
    },
    {
      "epoch": 0.9386189258312021,
      "grad_norm": 2.469209204439655,
      "learning_rate": 5.481339242975227e-06,
      "loss": 0.1634,
      "step": 367
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 2.40778706172643,
      "learning_rate": 5.46134179731651e-06,
      "loss": 0.1898,
      "step": 368
    },
    {
      "epoch": 0.9437340153452686,
      "grad_norm": 2.2778636718221965,
      "learning_rate": 5.441336905907653e-06,
      "loss": 0.1926,
      "step": 369
    },
    {
      "epoch": 0.9462915601023018,
      "grad_norm": 2.7625812295253422,
      "learning_rate": 5.421324891614312e-06,
      "loss": 0.1973,
      "step": 370
    },
    {
      "epoch": 0.948849104859335,
      "grad_norm": 1.7446912165051116,
      "learning_rate": 5.4013060774171055e-06,
      "loss": 0.1638,
      "step": 371
    },
    {
      "epoch": 0.9514066496163683,
| "grad_norm": 2.5444352315152066, |
| "learning_rate": 5.3812807864063946e-06, |
| "loss": 0.2318, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.9539641943734015, |
| "grad_norm": 2.1992426433820587, |
| "learning_rate": 5.361249341777075e-06, |
| "loss": 0.1843, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.9565217391304348, |
| "grad_norm": 2.5302908183096964, |
| "learning_rate": 5.341212066823356e-06, |
| "loss": 0.2005, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.959079283887468, |
| "grad_norm": 2.6580305183975748, |
| "learning_rate": 5.321169284933543e-06, |
| "loss": 0.2198, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9616368286445013, |
| "grad_norm": 2.2117215566531723, |
| "learning_rate": 5.3011213195848245e-06, |
| "loss": 0.2427, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.9641943734015346, |
| "grad_norm": 2.725818072341103, |
| "learning_rate": 5.281068494338039e-06, |
| "loss": 0.1718, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.9667519181585678, |
| "grad_norm": 2.2292620609222067, |
| "learning_rate": 5.26101113283247e-06, |
| "loss": 0.1534, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.969309462915601, |
| "grad_norm": 2.6330854799034813, |
| "learning_rate": 5.240949558780605e-06, |
| "loss": 0.2033, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.9718670076726342, |
| "grad_norm": 2.650742969398919, |
| "learning_rate": 5.220884095962924e-06, |
| "loss": 0.226, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9744245524296675, |
| "grad_norm": 2.6523001115826332, |
| "learning_rate": 5.200815068222666e-06, |
| "loss": 0.193, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.9769820971867008, |
| "grad_norm": 2.009859598152987, |
| "learning_rate": 5.1807427994606065e-06, |
| "loss": 0.1309, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.979539641943734, |
| "grad_norm": 3.0187208603458187, |
| "learning_rate": 5.1606676136298305e-06, |
| "loss": 0.1944, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.9820971867007673, |
| "grad_norm": 2.16488916376895, |
| "learning_rate": 5.140589834730503e-06, |
| "loss": 0.1758, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.9846547314578005, |
| "grad_norm": 2.26726877183814, |
| "learning_rate": 5.120509786804635e-06, |
| "loss": 0.2036, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.9872122762148338, |
| "grad_norm": 2.5667448076446515, |
| "learning_rate": 5.100427793930862e-06, |
| "loss": 0.2137, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.989769820971867, |
| "grad_norm": 3.175490751374496, |
| "learning_rate": 5.08034418021921e-06, |
| "loss": 0.218, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.9923273657289002, |
| "grad_norm": 2.071152150237306, |
| "learning_rate": 5.06025926980586e-06, |
| "loss": 0.144, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.9948849104859335, |
| "grad_norm": 1.9502478990435759, |
| "learning_rate": 5.040173386847926e-06, |
| "loss": 0.1662, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.9974424552429667, |
| "grad_norm": 2.3496611393315554, |
| "learning_rate": 5.0200868555182155e-06, |
| "loss": 0.1997, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.1406113822102846, |
| "learning_rate": 5e-06, |
| "loss": 0.1649, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.0025575447570332, |
| "grad_norm": 1.7341758333987307, |
| "learning_rate": 4.979913144481785e-06, |
| "loss": 0.0982, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.0051150895140666, |
| "grad_norm": 1.6626061824434262, |
| "learning_rate": 4.959826613152074e-06, |
| "loss": 0.0868, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.0076726342710998, |
| "grad_norm": 1.420000749929643, |
| "learning_rate": 4.939740730194141e-06, |
| "loss": 0.0911, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.010230179028133, |
| "grad_norm": 1.697363619049948, |
| "learning_rate": 4.919655819780792e-06, |
| "loss": 0.076, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.0127877237851663, |
| "grad_norm": 1.4002681848958676, |
| "learning_rate": 4.899572206069138e-06, |
| "loss": 0.0805, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.0153452685421995, |
| "grad_norm": 1.4426210835750999, |
| "learning_rate": 4.879490213195366e-06, |
| "loss": 0.0879, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.0179028132992327, |
| "grad_norm": 2.1433788397954636, |
| "learning_rate": 4.8594101652694996e-06, |
| "loss": 0.1065, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.020460358056266, |
| "grad_norm": 1.6010853002979815, |
| "learning_rate": 4.839332386370171e-06, |
| "loss": 0.0743, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.0230179028132993, |
| "grad_norm": 1.5836077570503444, |
| "learning_rate": 4.819257200539394e-06, |
| "loss": 0.0832, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0255754475703325, |
| "grad_norm": 1.5997813293419636, |
| "learning_rate": 4.799184931777337e-06, |
| "loss": 0.0936, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.0281329923273657, |
| "grad_norm": 1.7521575539423502, |
| "learning_rate": 4.779115904037079e-06, |
| "loss": 0.0723, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.030690537084399, |
| "grad_norm": 1.7850328818891048, |
| "learning_rate": 4.759050441219395e-06, |
| "loss": 0.0664, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.0332480818414322, |
| "grad_norm": 1.4608104273507212, |
| "learning_rate": 4.738988867167531e-06, |
| "loss": 0.0573, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.0358056265984654, |
| "grad_norm": 1.8270185513924122, |
| "learning_rate": 4.718931505661961e-06, |
| "loss": 0.0681, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.0383631713554988, |
| "grad_norm": 1.8634214035735037, |
| "learning_rate": 4.698878680415176e-06, |
| "loss": 0.0717, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.040920716112532, |
| "grad_norm": 1.748667561404042, |
| "learning_rate": 4.678830715066458e-06, |
| "loss": 0.0683, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.0434782608695652, |
| "grad_norm": 2.014253739409776, |
| "learning_rate": 4.6587879331766465e-06, |
| "loss": 0.0827, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.0460358056265984, |
| "grad_norm": 1.7966173689760938, |
| "learning_rate": 4.638750658222927e-06, |
| "loss": 0.0981, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.0485933503836318, |
| "grad_norm": 1.6525905355592523, |
| "learning_rate": 4.618719213593605e-06, |
| "loss": 0.077, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.051150895140665, |
| "grad_norm": 1.4997606657164488, |
| "learning_rate": 4.598693922582896e-06, |
| "loss": 0.0702, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.0537084398976981, |
| "grad_norm": 2.022120534248971, |
| "learning_rate": 4.5786751083856895e-06, |
| "loss": 0.0814, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.0562659846547315, |
| "grad_norm": 1.9127347409400652, |
| "learning_rate": 4.558663094092348e-06, |
| "loss": 0.096, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.0588235294117647, |
| "grad_norm": 2.541069985292318, |
| "learning_rate": 4.53865820268349e-06, |
| "loss": 0.0974, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.061381074168798, |
| "grad_norm": 1.9801185106989778, |
| "learning_rate": 4.518660757024774e-06, |
| "loss": 0.0649, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.0639386189258313, |
| "grad_norm": 1.9483856548126361, |
| "learning_rate": 4.498671079861686e-06, |
| "loss": 0.0881, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.0664961636828645, |
| "grad_norm": 2.107304538718503, |
| "learning_rate": 4.478689493814341e-06, |
| "loss": 0.1178, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.0690537084398977, |
| "grad_norm": 2.072047135460629, |
| "learning_rate": 4.4587163213722595e-06, |
| "loss": 0.0971, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.0716112531969308, |
| "grad_norm": 1.6567001187673083, |
| "learning_rate": 4.438751884889179e-06, |
| "loss": 0.0729, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.0741687979539642, |
| "grad_norm": 1.6640746116359595, |
| "learning_rate": 4.41879650657784e-06, |
| "loss": 0.0637, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.0767263427109974, |
| "grad_norm": 1.7205043208443138, |
| "learning_rate": 4.398850508504795e-06, |
| "loss": 0.0633, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.0792838874680306, |
| "grad_norm": 1.3625064664087017, |
| "learning_rate": 4.3789142125852015e-06, |
| "loss": 0.0652, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.081841432225064, |
| "grad_norm": 1.6440483798308387, |
| "learning_rate": 4.358987940577631e-06, |
| "loss": 0.0893, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.0843989769820972, |
| "grad_norm": 1.988437435997785, |
| "learning_rate": 4.339072014078879e-06, |
| "loss": 0.0745, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.0869565217391304, |
| "grad_norm": 1.9394483469983206, |
| "learning_rate": 4.319166754518768e-06, |
| "loss": 0.0699, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0895140664961638, |
| "grad_norm": 1.6091679987577228, |
| "learning_rate": 4.299272483154963e-06, |
| "loss": 0.0834, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.092071611253197, |
| "grad_norm": 2.1504108683482728, |
| "learning_rate": 4.27938952106779e-06, |
| "loss": 0.0975, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.0946291560102301, |
| "grad_norm": 1.2129263595961266, |
| "learning_rate": 4.259518189155049e-06, |
| "loss": 0.056, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.0971867007672633, |
| "grad_norm": 1.5136569144328267, |
| "learning_rate": 4.2396588081268355e-06, |
| "loss": 0.0583, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.0997442455242967, |
| "grad_norm": 2.0636857667444883, |
| "learning_rate": 4.219811698500365e-06, |
| "loss": 0.1227, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.10230179028133, |
| "grad_norm": 1.555373954241444, |
| "learning_rate": 4.199977180594807e-06, |
| "loss": 0.0618, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.104859335038363, |
| "grad_norm": 2.234721374673342, |
| "learning_rate": 4.1801555745261025e-06, |
| "loss": 0.0719, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.1074168797953965, |
| "grad_norm": 1.4891492181079513, |
| "learning_rate": 4.160347200201804e-06, |
| "loss": 0.0848, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.1099744245524297, |
| "grad_norm": 1.7086838274547151, |
| "learning_rate": 4.140552377315918e-06, |
| "loss": 0.085, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.1125319693094629, |
| "grad_norm": 1.7588027037354494, |
| "learning_rate": 4.120771425343733e-06, |
| "loss": 0.0599, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.1150895140664963, |
| "grad_norm": 1.9369520926526567, |
| "learning_rate": 4.101004663536675e-06, |
| "loss": 0.0757, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.1176470588235294, |
| "grad_norm": 1.5573730150751601, |
| "learning_rate": 4.081252410917148e-06, |
| "loss": 0.0727, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.1202046035805626, |
| "grad_norm": 1.7157376858170956, |
| "learning_rate": 4.061514986273391e-06, |
| "loss": 0.0716, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.1227621483375958, |
| "grad_norm": 2.3412942307866293, |
| "learning_rate": 4.041792708154321e-06, |
| "loss": 0.1166, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.1253196930946292, |
| "grad_norm": 1.3827537076323062, |
| "learning_rate": 4.022085894864408e-06, |
| "loss": 0.0542, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.1278772378516624, |
| "grad_norm": 1.4810971648201303, |
| "learning_rate": 4.0023948644585294e-06, |
| "loss": 0.0627, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.1304347826086956, |
| "grad_norm": 2.475105933705708, |
| "learning_rate": 3.982719934736832e-06, |
| "loss": 0.0747, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.132992327365729, |
| "grad_norm": 6.967435074371647, |
| "learning_rate": 3.963061423239612e-06, |
| "loss": 0.0885, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.1355498721227621, |
| "grad_norm": 2.362944377096876, |
| "learning_rate": 3.943419647242189e-06, |
| "loss": 0.0927, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.1381074168797953, |
| "grad_norm": 3.417514515542525, |
| "learning_rate": 3.923794923749775e-06, |
| "loss": 0.1051, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.1406649616368287, |
| "grad_norm": 1.5628055684744868, |
| "learning_rate": 3.904187569492373e-06, |
| "loss": 0.0609, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.143222506393862, |
| "grad_norm": 1.853834244292138, |
| "learning_rate": 3.884597900919656e-06, |
| "loss": 0.0743, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.145780051150895, |
| "grad_norm": 1.5370022269889596, |
| "learning_rate": 3.865026234195863e-06, |
| "loss": 0.0645, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.1483375959079285, |
| "grad_norm": 1.7852407621323894, |
| "learning_rate": 3.8454728851946885e-06, |
| "loss": 0.074, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.1508951406649617, |
| "grad_norm": 1.5201740396511454, |
| "learning_rate": 3.8259381694942e-06, |
| "loss": 0.0711, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1534526854219949, |
| "grad_norm": 1.5688527808393755, |
| "learning_rate": 3.806422402371733e-06, |
| "loss": 0.0658, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.156010230179028, |
| "grad_norm": 1.854516547291142, |
| "learning_rate": 3.786925898798801e-06, |
| "loss": 0.0864, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.1585677749360614, |
| "grad_norm": 1.4304724295860949, |
| "learning_rate": 3.767448973436021e-06, |
| "loss": 0.0701, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.1611253196930946, |
| "grad_norm": 2.124867526109046, |
| "learning_rate": 3.7479919406280334e-06, |
| "loss": 0.0687, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.1636828644501278, |
| "grad_norm": 1.9407356546402628, |
| "learning_rate": 3.728555114398419e-06, |
| "loss": 0.0693, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.1662404092071612, |
| "grad_norm": 2.038902868728145, |
| "learning_rate": 3.709138808444641e-06, |
| "loss": 0.075, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.1687979539641944, |
| "grad_norm": 1.4775851732019487, |
| "learning_rate": 3.689743336132982e-06, |
| "loss": 0.0548, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.1713554987212276, |
| "grad_norm": 1.780023396964712, |
| "learning_rate": 3.6703690104934806e-06, |
| "loss": 0.0597, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.1739130434782608, |
| "grad_norm": 1.9428570066179016, |
| "learning_rate": 3.6510161442148783e-06, |
| "loss": 0.0922, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.1764705882352942, |
| "grad_norm": 1.4660436947208233, |
| "learning_rate": 3.6316850496395863e-06, |
| "loss": 0.0702, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.1790281329923273, |
| "grad_norm": 1.492258925362867, |
| "learning_rate": 3.6123760387586265e-06, |
| "loss": 0.0533, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.1815856777493605, |
| "grad_norm": 1.6080846234395203, |
| "learning_rate": 3.5930894232066072e-06, |
| "loss": 0.0613, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.184143222506394, |
| "grad_norm": 1.8762491421887413, |
| "learning_rate": 3.5738255142566912e-06, |
| "loss": 0.111, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.186700767263427, |
| "grad_norm": 1.6721148093003684, |
| "learning_rate": 3.5545846228155743e-06, |
| "loss": 0.0695, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.1892583120204603, |
| "grad_norm": 1.6557906824838298, |
| "learning_rate": 3.5353670594184623e-06, |
| "loss": 0.0648, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.1918158567774937, |
| "grad_norm": 1.6970610387683387, |
| "learning_rate": 3.516173134224059e-06, |
| "loss": 0.0808, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.1943734015345269, |
| "grad_norm": 1.7210699030954981, |
| "learning_rate": 3.4970031570095707e-06, |
| "loss": 0.0623, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.19693094629156, |
| "grad_norm": 1.540393787825596, |
| "learning_rate": 3.477857437165694e-06, |
| "loss": 0.0664, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.1994884910485935, |
| "grad_norm": 1.8901582305083606, |
| "learning_rate": 3.458736283691626e-06, |
| "loss": 0.0834, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.2020460358056266, |
| "grad_norm": 1.4585029838366994, |
| "learning_rate": 3.4396400051900846e-06, |
| "loss": 0.0547, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.2046035805626598, |
| "grad_norm": 1.2256677190585374, |
| "learning_rate": 3.4205689098623195e-06, |
| "loss": 0.0479, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.207161125319693, |
| "grad_norm": 1.055212675096316, |
| "learning_rate": 3.401523305503139e-06, |
| "loss": 0.0349, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.2097186700767264, |
| "grad_norm": 2.29047957899524, |
| "learning_rate": 3.3825034994959445e-06, |
| "loss": 0.1002, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.2122762148337596, |
| "grad_norm": 1.5122951574759327, |
| "learning_rate": 3.3635097988077724e-06, |
| "loss": 0.0697, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.2148337595907928, |
| "grad_norm": 1.8598398658091704, |
| "learning_rate": 3.3445425099843343e-06, |
| "loss": 0.0759, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.2173913043478262, |
| "grad_norm": 4.06734950441362, |
| "learning_rate": 3.3256019391450696e-06, |
| "loss": 0.064, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.2199488491048593, |
| "grad_norm": 1.8126115952101831, |
| "learning_rate": 3.3066883919782116e-06, |
| "loss": 0.0905, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.2225063938618925, |
| "grad_norm": 1.9695485499932877, |
| "learning_rate": 3.287802173735848e-06, |
| "loss": 0.073, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.2250639386189257, |
| "grad_norm": 1.8734705052479046, |
| "learning_rate": 3.268943589228992e-06, |
| "loss": 0.0765, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.227621483375959, |
| "grad_norm": 1.8227153781564451, |
| "learning_rate": 3.250112942822673e-06, |
| "loss": 0.0647, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.2301790281329923, |
| "grad_norm": 1.927223581889169, |
| "learning_rate": 3.231310538431015e-06, |
| "loss": 0.1003, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.2327365728900257, |
| "grad_norm": 1.8013350113471298, |
| "learning_rate": 3.212536679512332e-06, |
| "loss": 0.0733, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.2352941176470589, |
| "grad_norm": 1.5008823838511212, |
| "learning_rate": 3.1937916690642356e-06, |
| "loss": 0.0594, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.237851662404092, |
| "grad_norm": 2.1113680142657314, |
| "learning_rate": 3.1750758096187446e-06, |
| "loss": 0.0968, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.2404092071611252, |
| "grad_norm": 1.6422275231634988, |
| "learning_rate": 3.1563894032373977e-06, |
| "loss": 0.0648, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.2429667519181586, |
| "grad_norm": 1.5640327931569575, |
| "learning_rate": 3.137732751506376e-06, |
| "loss": 0.0614, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.2455242966751918, |
| "grad_norm": 1.931470233335117, |
| "learning_rate": 3.1191061555316503e-06, |
| "loss": 0.0752, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.248081841432225, |
| "grad_norm": 1.5026655446391683, |
| "learning_rate": 3.1005099159341044e-06, |
| "loss": 0.0672, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.2506393861892584, |
| "grad_norm": 1.7505740735081963, |
| "learning_rate": 3.08194433284469e-06, |
| "loss": 0.0914, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.2531969309462916, |
| "grad_norm": 1.4586747212773692, |
| "learning_rate": 3.0634097058995877e-06, |
| "loss": 0.0579, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.2557544757033248, |
| "grad_norm": 2.6271299593430375, |
| "learning_rate": 3.0449063342353635e-06, |
| "loss": 0.0833, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.258312020460358, |
| "grad_norm": 1.9675276646986675, |
| "learning_rate": 3.0264345164841426e-06, |
| "loss": 0.0803, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.2608695652173914, |
| "grad_norm": 1.7237694658208633, |
| "learning_rate": 3.007994550768793e-06, |
| "loss": 0.0652, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.2634271099744245, |
| "grad_norm": 1.7092819026417294, |
| "learning_rate": 2.989586734698113e-06, |
| "loss": 0.068, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.265984654731458, |
| "grad_norm": 1.4116147134742372, |
| "learning_rate": 2.971211365362028e-06, |
| "loss": 0.061, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.2685421994884911, |
| "grad_norm": 1.3575667916154275, |
| "learning_rate": 2.9528687393267865e-06, |
| "loss": 0.0685, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.2710997442455243, |
| "grad_norm": 1.802170523220636, |
| "learning_rate": 2.934559152630192e-06, |
| "loss": 0.0753, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.2736572890025575, |
| "grad_norm": 1.6262195523928797, |
| "learning_rate": 2.9162829007768103e-06, |
| "loss": 0.0941, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.2762148337595907, |
| "grad_norm": 1.3516369552001424, |
| "learning_rate": 2.898040278733203e-06, |
| "loss": 0.0683, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.278772378516624, |
| "grad_norm": 1.477326290875936, |
| "learning_rate": 2.879831580923176e-06, |
| "loss": 0.0629, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.278772378516624, |
| "eval_loss": 0.21294504404067993, |
| "eval_runtime": 4.6492, |
| "eval_samples_per_second": 6.883, |
| "eval_steps_per_second": 1.721, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2813299232736572, |
| "grad_norm": 1.7566143836444967, |
| "learning_rate": 2.8616571012230134e-06, |
| "loss": 0.0858, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.2838874680306906, |
| "grad_norm": 1.6822488345485334, |
| "learning_rate": 2.843517132956742e-06, |
| "loss": 0.0858, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.2864450127877238, |
| "grad_norm": 1.5083738241472828, |
| "learning_rate": 2.8254119688914017e-06, |
| "loss": 0.0727, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.289002557544757, |
| "grad_norm": 1.7803194664288695, |
| "learning_rate": 2.8073419012323154e-06, |
| "loss": 0.0982, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.2915601023017902, |
| "grad_norm": 1.4987869744478313, |
| "learning_rate": 2.789307221618369e-06, |
| "loss": 0.0679, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.2941176470588236, |
| "grad_norm": 2.020271271839902, |
| "learning_rate": 2.771308221117309e-06, |
| "loss": 0.0969, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.2966751918158568, |
| "grad_norm": 1.2722299672920545, |
| "learning_rate": 2.7533451902210512e-06, |
| "loss": 0.0503, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.29923273657289, |
| "grad_norm": 1.7822467291822328, |
| "learning_rate": 2.7354184188409773e-06, |
| "loss": 0.0852, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.3017902813299234, |
| "grad_norm": 1.6127587678726962, |
| "learning_rate": 2.71752819630327e-06, |
| "loss": 0.0659, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.3043478260869565, |
| "grad_norm": 1.9796442834814172, |
| "learning_rate": 2.6996748113442397e-06, |
| "loss": 0.0651, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.3069053708439897, |
| "grad_norm": 1.3367286391456314, |
| "learning_rate": 2.6818585521056573e-06, |
| "loss": 0.057, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.309462915601023, |
| "grad_norm": 1.6215431890082224, |
| "learning_rate": 2.66407970613011e-06, |
| "loss": 0.0644, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.3120204603580563, |
| "grad_norm": 1.5423150036952755, |
| "learning_rate": 2.646338560356363e-06, |
| "loss": 0.0487, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.3145780051150895, |
| "grad_norm": 1.4322758913833975, |
| "learning_rate": 2.6286354011147252e-06, |
| "loss": 0.054, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.317135549872123, |
| "grad_norm": 2.0705557289206933, |
| "learning_rate": 2.6109705141224255e-06, |
| "loss": 0.0906, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.319693094629156, |
| "grad_norm": 1.6956814937463585, |
| "learning_rate": 2.593344184479003e-06, |
| "loss": 0.0741, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.3222506393861893, |
| "grad_norm": 1.6447905074286118, |
| "learning_rate": 2.575756696661713e-06, |
| "loss": 0.0613, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.3248081841432224, |
| "grad_norm": 1.2857330835107534, |
| "learning_rate": 2.5582083345209217e-06, |
| "loss": 0.0573, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.3273657289002558, |
| "grad_norm": 1.6996787609477517, |
| "learning_rate": 2.540699381275539e-06, |
| "loss": 0.0721, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.329923273657289, |
| "grad_norm": 1.406431219311043, |
| "learning_rate": 2.5232301195084395e-06, |
| "loss": 0.0538, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.3324808184143222, |
| "grad_norm": 2.021505231989548, |
| "learning_rate": 2.5058008311619035e-06, |
| "loss": 0.0804, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.3350383631713556, |
| "grad_norm": 1.5049328024585746, |
| "learning_rate": 2.488411797533064e-06, |
| "loss": 0.0452, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.3375959079283888, |
| "grad_norm": 1.9992178757785442, |
| "learning_rate": 2.4710632992693737e-06, |
| "loss": 0.0722, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.340153452685422, |
| "grad_norm": 1.3839067422641855, |
| "learning_rate": 2.4537556163640726e-06, |
| "loss": 0.0578, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.3427109974424551, |
| "grad_norm": 1.8578230421356159, |
| "learning_rate": 2.4364890281516633e-06, |
| "loss": 0.0769, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.3452685421994885, |
| "grad_norm": 1.2638070516424367, |
| "learning_rate": 2.4192638133034074e-06, |
| "loss": 0.0433, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.3478260869565217, |
| "grad_norm": 1.5514020868657306, |
| "learning_rate": 2.4020802498228333e-06, |
| "loss": 0.0638, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.350383631713555, |
| "grad_norm": 1.1234091073689203, |
| "learning_rate": 2.384938615041238e-06, |
| "loss": 0.0478, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.3529411764705883, |
| "grad_norm": 1.394137463574114, |
| "learning_rate": 2.3678391856132203e-06, |
| "loss": 0.0532, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.3554987212276215, |
| "grad_norm": 1.1526941031608549, |
| "learning_rate": 2.350782237512215e-06, |
| "loss": 0.0439, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.3580562659846547, |
| "grad_norm": 1.8181630565833733, |
| "learning_rate": 2.3337680460260314e-06, |
| "loss": 0.0672, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.3606138107416879, |
| "grad_norm": 3.123874639693688, |
| "learning_rate": 2.316796885752415e-06, |
| "loss": 0.1681, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.3631713554987213, |
| "grad_norm": 2.1436757050712214, |
| "learning_rate": 2.299869030594622e-06, |
| "loss": 0.0904, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.3657289002557544, |
| "grad_norm": 1.6862552799832844, |
| "learning_rate": 2.2829847537569904e-06, |
| "loss": 0.0381, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.3682864450127878, |
| "grad_norm": 2.105194788108369, |
| "learning_rate": 2.266144327740531e-06, |
| "loss": 0.0863, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.370843989769821, |
| "grad_norm": 1.752969337535574, |
| "learning_rate": 2.2493480243385298e-06, |
| "loss": 0.0702, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.3734015345268542, |
| "grad_norm": 1.854309444483952, |
| "learning_rate": 2.2325961146321683e-06, |
| "loss": 0.077, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.3759590792838874, |
| "grad_norm": 1.7507327973613351, |
| "learning_rate": 2.2158888689861434e-06, |
| "loss": 0.0683, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.3785166240409208, |
| "grad_norm": 1.3352460032358662, |
| "learning_rate": 2.1992265570442974e-06, |
| "loss": 0.0586, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.381074168797954, |
| "grad_norm": 1.6988970605905234, |
| "learning_rate": 2.182609447725279e-06, |
| "loss": 0.0789, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.3836317135549872, |
| "grad_norm": 1.8680407758940254, |
| "learning_rate": 2.1660378092181935e-06, |
| "loss": 0.0791, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.3861892583120206, |
| "grad_norm": 1.8934496530320313, |
| "learning_rate": 2.149511908978275e-06, |
| "loss": 0.0726, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.3887468030690537, |
| "grad_norm": 1.7808474646322217, |
| "learning_rate": 2.1330320137225773e-06, |
| "loss": 0.0832, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.391304347826087, |
| "grad_norm": 1.6521067201978266, |
| "learning_rate": 2.1165983894256647e-06, |
| "loss": 0.0738, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.39386189258312, |
| "grad_norm": 2.0833857927118196, |
| "learning_rate": 2.100211301315315e-06, |
| "loss": 0.0729, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.3964194373401535, |
| "grad_norm": 1.7314836100520905, |
| "learning_rate": 2.0838710138682412e-06, |
| "loss": 0.0686, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.3989769820971867, |
| "grad_norm": 1.4118884729586654, |
| "learning_rate": 2.0675777908058307e-06, |
| "loss": 0.0644, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.40153452685422, |
| "grad_norm": 1.8528901934251134, |
| "learning_rate": 2.051331895089882e-06, |
| "loss": 0.0859, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.4040920716112533, |
| "grad_norm": 1.761689983496555, |
| "learning_rate": 2.035133588918356e-06, |
| "loss": 0.0496, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.4066496163682864, |
| "grad_norm": 1.0521984322299474, |
| "learning_rate": 2.0189831337211573e-06, |
| "loss": 0.0388, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.4092071611253196, |
| "grad_norm": 1.7178382415710014, |
| "learning_rate": 2.0028807901559027e-06, |
| "loss": 0.0733, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.4117647058823528, |
| "grad_norm": 1.6002839531485389, |
| "learning_rate": 1.9868268181037186e-06, |
| "loss": 0.0803, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.4143222506393862, |
| "grad_norm": 1.692131956326154, |
| "learning_rate": 1.970821476665051e-06, |
| "loss": 0.0632, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.4168797953964194, |
| "grad_norm": 1.4170058706311646, |
| "learning_rate": 1.9548650241554812e-06, |
| "loss": 0.0543, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.4194373401534528, |
| "grad_norm": 1.58559188713565, |
| "learning_rate": 1.9389577181015496e-06, |
| "loss": 0.0492, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.421994884910486, |
| "grad_norm": 1.3463742170851551, |
| "learning_rate": 1.923099815236608e-06, |
| "loss": 0.0518, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.4245524296675192, |
| "grad_norm": 1.258601610278906, |
| "learning_rate": 1.9072915714966761e-06, |
| "loss": 0.0517, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.4271099744245523, |
| "grad_norm": 1.7090524252971389, |
| "learning_rate": 1.8915332420163074e-06, |
| "loss": 0.0528, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.4296675191815857, |
| "grad_norm": 2.002866251375377, |
| "learning_rate": 1.8758250811244682e-06, |
| "loss": 0.0528, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.432225063938619, |
| "grad_norm": 1.412125761127525, |
| "learning_rate": 1.8601673423404449e-06, |
| "loss": 0.0627, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.434782608695652, |
| "grad_norm": 2.1658626868049535, |
| "learning_rate": 1.8445602783697375e-06, |
| "loss": 0.0779, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.4373401534526855, |
| "grad_norm": 1.748655089068542, |
| "learning_rate": 1.8290041410999893e-06, |
| "loss": 0.068, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.4398976982097187, |
| "grad_norm": 1.4799736457543835, |
| "learning_rate": 1.8134991815969238e-06, |
| "loss": 0.0654, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.4424552429667519, |
| "grad_norm": 1.759726051902621, |
| "learning_rate": 1.798045650100289e-06, |
| "loss": 0.0658, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.445012787723785, |
| "grad_norm": 1.937901480013661, |
| "learning_rate": 1.782643796019814e-06, |
| "loss": 0.0698, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.4475703324808185, |
| "grad_norm": 1.8837065847513317, |
| "learning_rate": 1.7672938679311957e-06, |
| "loss": 0.0873, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.4501278772378516, |
| "grad_norm": 1.9638583603591262, |
| "learning_rate": 1.7519961135720737e-06, |
| "loss": 0.0832, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.452685421994885, |
| "grad_norm": 1.7512819835219864, |
| "learning_rate": 1.736750779838044e-06, |
| "loss": 0.067, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.4552429667519182, |
| "grad_norm": 1.3664725992956537, |
| "learning_rate": 1.7215581127786624e-06, |
| "loss": 0.0529, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.4578005115089514, |
| "grad_norm": 1.4792338584206843, |
| "learning_rate": 1.7064183575934856e-06, |
| "loss": 0.0573, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.4603580562659846, |
| "grad_norm": 1.2900470420904124, |
| "learning_rate": 1.6913317586281048e-06, |
| "loss": 0.0592, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.4629156010230178, |
| "grad_norm": 1.875553204960176, |
| "learning_rate": 1.676298559370202e-06, |
| "loss": 0.082, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.4654731457800512, |
| "grad_norm": 1.7220714288517276, |
| "learning_rate": 1.6613190024456293e-06, |
| "loss": 0.0685, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.4680306905370843, |
| "grad_norm": 1.7924822734158794, |
| "learning_rate": 1.6463933296144863e-06, |
| "loss": 0.0645, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.4705882352941178, |
| "grad_norm": 1.86003465133243, |
| "learning_rate": 1.6315217817672142e-06, |
| "loss": 0.0768, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.473145780051151, |
| "grad_norm": 1.585398763380168, |
| "learning_rate": 1.6167045989207185e-06, |
| "loss": 0.0712, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.4757033248081841, |
| "grad_norm": 1.722372187884546, |
| "learning_rate": 1.6019420202144853e-06, |
| "loss": 0.0433, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.4782608695652173, |
| "grad_norm": 1.4746207680749763, |
| "learning_rate": 1.5872342839067305e-06, |
| "loss": 0.0498, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.4808184143222507, |
| "grad_norm": 1.3587198954026716, |
| "learning_rate": 1.5725816273705453e-06, |
| "loss": 0.0568, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.4833759590792839, |
| "grad_norm": 1.1732740250402258, |
| "learning_rate": 1.5579842870900746e-06, |
| "loss": 0.0383, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.485933503836317, |
| "grad_norm": 1.711025296622914, |
| "learning_rate": 1.5434424986566938e-06, |
| "loss": 0.0921, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.4884910485933505, |
| "grad_norm": 1.5076078335129486, |
| "learning_rate": 1.5289564967652033e-06, |
| "loss": 0.0597, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.4910485933503836, |
| "grad_norm": 1.797113511699182, |
| "learning_rate": 1.5145265152100574e-06, |
| "loss": 0.0734, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.4936061381074168, |
| "grad_norm": 1.9528985026557675, |
| "learning_rate": 1.5001527868815702e-06, |
| "loss": 0.0957, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.49616368286445, |
| "grad_norm": 2.177500928432752, |
| "learning_rate": 1.4858355437621663e-06, |
| "loss": 0.0879, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.4987212276214834, |
| "grad_norm": 1.6347388686414874, |
| "learning_rate": 1.4715750169226417e-06, |
| "loss": 0.0702, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.5012787723785166, |
| "grad_norm": 1.7485578039541358, |
| "learning_rate": 1.457371436518424e-06, |
| "loss": 0.0709, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.50383631713555, |
| "grad_norm": 1.8238722950404516, |
| "learning_rate": 1.4432250317858675e-06, |
| "loss": 0.0657, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.5063938618925832, |
| "grad_norm": 3.4081682641005533, |
| "learning_rate": 1.4291360310385455e-06, |
| "loss": 0.0785, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.5089514066496164, |
| "grad_norm": 1.813069192998236, |
| "learning_rate": 1.4151046616635727e-06, |
| "loss": 0.0663, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.5115089514066495, |
| "grad_norm": 1.8093217289893018, |
| "learning_rate": 1.4011311501179287e-06, |
| "loss": 0.0909, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.5140664961636827, |
| "grad_norm": 1.9482704657873833, |
| "learning_rate": 1.3872157219248045e-06, |
| "loss": 0.0658, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.5166240409207161, |
| "grad_norm": 1.5589428403729417, |
| "learning_rate": 1.373358601669973e-06, |
| "loss": 0.0478, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.5191815856777495, |
| "grad_norm": 1.1245451969788118, |
| "learning_rate": 1.3595600129981469e-06, |
| "loss": 0.0415, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.5217391304347827, |
| "grad_norm": 1.0485996987613, |
| "learning_rate": 1.3458201786093795e-06, |
| "loss": 0.0416, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.5242966751918159, |
| "grad_norm": 2.425494769380029, |
| "learning_rate": 1.3321393202554739e-06, |
| "loss": 0.0928, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.526854219948849, |
| "grad_norm": 1.9568801962892155, |
| "learning_rate": 1.3185176587363919e-06, |
| "loss": 0.0838, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.5294117647058822, |
| "grad_norm": 1.7500315202188572, |
| "learning_rate": 1.3049554138967052e-06, |
| "loss": 0.0499, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.5319693094629157, |
| "grad_norm": 1.9300395352394735, |
| "learning_rate": 1.2914528046220332e-06, |
| "loss": 0.074, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.5345268542199488, |
| "grad_norm": 1.0926807919484591, |
| "learning_rate": 1.278010048835523e-06, |
| "loss": 0.0328, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5370843989769822, |
| "grad_norm": 1.4162894586733985, |
| "learning_rate": 1.2646273634943195e-06, |
| "loss": 0.0696, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.5396419437340154, |
| "grad_norm": 1.7261841506252762, |
| "learning_rate": 1.2513049645860759e-06, |
| "loss": 0.0594, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.5421994884910486, |
| "grad_norm": 1.6070582499731865, |
| "learning_rate": 1.2380430671254618e-06, |
| "loss": 0.055, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.5447570332480818, |
| "grad_norm": 1.556977293191578, |
| "learning_rate": 1.224841885150691e-06, |
| "loss": 0.0532, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.547314578005115, |
| "grad_norm": 1.9310463068711319, |
| "learning_rate": 1.2117016317200702e-06, |
| "loss": 0.0932, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.5498721227621484, |
| "grad_norm": 1.4350515399793062, |
| "learning_rate": 1.1986225189085627e-06, |
| "loss": 0.0577, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.5524296675191815, |
| "grad_norm": 1.9134498925809693, |
| "learning_rate": 1.185604757804359e-06, |
| "loss": 0.0675, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.554987212276215, |
| "grad_norm": 1.9607164063866107, |
| "learning_rate": 1.172648558505477e-06, |
| "loss": 0.0622, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.5575447570332481, |
| "grad_norm": 1.4616026782036957, |
| "learning_rate": 1.1597541301163655e-06, |
| "loss": 0.0553, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.5601023017902813, |
| "grad_norm": 1.5571131885670004, |
| "learning_rate": 1.1469216807445348e-06, |
| "loss": 0.0521, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.5626598465473145, |
| "grad_norm": 1.7401879912173752, |
| "learning_rate": 1.1341514174971907e-06, |
| "loss": 0.0662, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.5652173913043477, |
| "grad_norm": 1.8564899739837128, |
| "learning_rate": 1.1214435464779006e-06, |
| "loss": 0.1065, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.567774936061381, |
| "grad_norm": 1.6935400217458705, |
| "learning_rate": 1.1087982727832613e-06, |
| "loss": 0.0899, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.5703324808184145, |
| "grad_norm": 1.8949613358675155, |
| "learning_rate": 1.0962158004995893e-06, |
| "loss": 0.0987, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.5728900255754477, |
| "grad_norm": 1.3271033110720252, |
| "learning_rate": 1.083696332699628e-06, |
| "loss": 0.0447, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.5754475703324808, |
| "grad_norm": 1.0282720447492992, |
| "learning_rate": 1.0712400714392723e-06, |
| "loss": 0.0364, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.578005115089514, |
| "grad_norm": 1.7054143756676137, |
| "learning_rate": 1.058847217754303e-06, |
| "loss": 0.0545, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.5805626598465472, |
| "grad_norm": 2.7824759917467836, |
| "learning_rate": 1.0465179716571467e-06, |
| "loss": 0.0394, |
| "step": 618 |
| }, |
| { |
| "epoch": 1.5831202046035806, |
| "grad_norm": 1.4507251252027922, |
| "learning_rate": 1.034252532133646e-06, |
| "loss": 0.0551, |
| "step": 619 |
| }, |
| { |
| "epoch": 1.5856777493606138, |
| "grad_norm": 1.403863371004816, |
| "learning_rate": 1.0220510971398473e-06, |
| "loss": 0.058, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.5882352941176472, |
| "grad_norm": 2.0464027459211693, |
| "learning_rate": 1.0099138635988026e-06, |
| "loss": 0.0729, |
| "step": 621 |
| }, |
| { |
| "epoch": 1.5907928388746804, |
| "grad_norm": 1.5187023072209234, |
| "learning_rate": 9.978410273974015e-07, |
| "loss": 0.0541, |
| "step": 622 |
| }, |
| { |
| "epoch": 1.5933503836317136, |
| "grad_norm": 1.9867877118323218, |
| "learning_rate": 9.858327833832004e-07, |
| "loss": 0.0654, |
| "step": 623 |
| }, |
| { |
| "epoch": 1.5959079283887467, |
| "grad_norm": 1.4942485190173453, |
| "learning_rate": 9.738893253612808e-07, |
| "loss": 0.0616, |
| "step": 624 |
| }, |
| { |
| "epoch": 1.59846547314578, |
| "grad_norm": 1.6023660849896257, |
| "learning_rate": 9.620108460911181e-07, |
| "loss": 0.0599, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.6010230179028133, |
| "grad_norm": 1.3308092300012233, |
| "learning_rate": 9.50197537283481e-07, |
| "loss": 0.0494, |
| "step": 626 |
| }, |
| { |
| "epoch": 1.6035805626598465, |
| "grad_norm": 1.6437726206607308, |
| "learning_rate": 9.384495895973227e-07, |
| "loss": 0.0689, |
| "step": 627 |
| }, |
| { |
| "epoch": 1.60613810741688, |
| "grad_norm": 1.8936628687028338, |
| "learning_rate": 9.267671926367166e-07, |
| "loss": 0.0705, |
| "step": 628 |
| }, |
| { |
| "epoch": 1.608695652173913, |
| "grad_norm": 1.693700691427945, |
| "learning_rate": 9.151505349477901e-07, |
| "loss": 0.0759, |
| "step": 629 |
| }, |
| { |
| "epoch": 1.6112531969309463, |
| "grad_norm": 1.6481559811475006, |
| "learning_rate": 9.035998040156801e-07, |
| "loss": 0.0681, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.6138107416879794, |
| "grad_norm": 1.4547018046696156, |
| "learning_rate": 8.921151862615091e-07, |
| "loss": 0.0504, |
| "step": 631 |
| }, |
| { |
| "epoch": 1.6163682864450126, |
| "grad_norm": 2.2460334650503064, |
| "learning_rate": 8.806968670393801e-07, |
| "loss": 0.0972, |
| "step": 632 |
| }, |
| { |
| "epoch": 1.618925831202046, |
| "grad_norm": 1.3530428469383553, |
| "learning_rate": 8.693450306333818e-07, |
| "loss": 0.0589, |
| "step": 633 |
| }, |
| { |
| "epoch": 1.6214833759590794, |
| "grad_norm": 2.09210545875247, |
| "learning_rate": 8.580598602546109e-07, |
| "loss": 0.0852, |
| "step": 634 |
| }, |
| { |
| "epoch": 1.6240409207161126, |
| "grad_norm": 1.3820789409264775, |
| "learning_rate": 8.4684153803822e-07, |
| "loss": 0.0439, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.6265984654731458, |
| "grad_norm": 1.451999661618841, |
| "learning_rate": 8.356902450404792e-07, |
| "loss": 0.0509, |
| "step": 636 |
| }, |
| { |
| "epoch": 1.629156010230179, |
| "grad_norm": 1.3950789498456047, |
| "learning_rate": 8.246061612358475e-07, |
| "loss": 0.0573, |
| "step": 637 |
| }, |
| { |
| "epoch": 1.6317135549872122, |
| "grad_norm": 1.5996434417326504, |
| "learning_rate": 8.135894655140758e-07, |
| "loss": 0.0626, |
| "step": 638 |
| }, |
| { |
| "epoch": 1.6342710997442456, |
| "grad_norm": 1.8147870347577688, |
| "learning_rate": 8.026403356773161e-07, |
| "loss": 0.0779, |
| "step": 639 |
| }, |
| { |
| "epoch": 1.6368286445012787, |
| "grad_norm": 1.9345750885275885, |
| "learning_rate": 7.91758948437249e-07, |
| "loss": 0.0654, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.6393861892583121, |
| "grad_norm": 1.4948901027842707, |
| "learning_rate": 7.809454794122346e-07, |
| "loss": 0.064, |
| "step": 641 |
| }, |
| { |
| "epoch": 1.6419437340153453, |
| "grad_norm": 1.9598445552473587, |
| "learning_rate": 7.702001031244816e-07, |
| "loss": 0.0633, |
| "step": 642 |
| }, |
| { |
| "epoch": 1.6445012787723785, |
| "grad_norm": 1.3819361805500896, |
| "learning_rate": 7.595229929972253e-07, |
| "loss": 0.0677, |
| "step": 643 |
| }, |
| { |
| "epoch": 1.6470588235294117, |
| "grad_norm": 1.2518200685718097, |
| "learning_rate": 7.489143213519301e-07, |
| "loss": 0.0365, |
| "step": 644 |
| }, |
| { |
| "epoch": 1.6496163682864449, |
| "grad_norm": 1.2701439132468373, |
| "learning_rate": 7.383742594055077e-07, |
| "loss": 0.0546, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.6521739130434783, |
| "grad_norm": 1.7913524756425139, |
| "learning_rate": 7.279029772675572e-07, |
| "loss": 0.0726, |
| "step": 646 |
| }, |
| { |
| "epoch": 1.6547314578005117, |
| "grad_norm": 1.6279279703395964, |
| "learning_rate": 7.17500643937617e-07, |
| "loss": 0.0496, |
| "step": 647 |
| }, |
| { |
| "epoch": 1.6572890025575449, |
| "grad_norm": 1.6036454756063958, |
| "learning_rate": 7.071674273024353e-07, |
| "loss": 0.0604, |
| "step": 648 |
| }, |
| { |
| "epoch": 1.659846547314578, |
| "grad_norm": 2.2697032372331827, |
| "learning_rate": 6.969034941332664e-07, |
| "loss": 0.1019, |
| "step": 649 |
| }, |
| { |
| "epoch": 1.6624040920716112, |
| "grad_norm": 1.6139143967181218, |
| "learning_rate": 6.86709010083172e-07, |
| "loss": 0.0417, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.6649616368286444, |
| "grad_norm": 1.7145079712414455, |
| "learning_rate": 6.765841396843514e-07, |
| "loss": 0.0635, |
| "step": 651 |
| }, |
| { |
| "epoch": 1.6675191815856778, |
| "grad_norm": 1.7988272014623832, |
| "learning_rate": 6.665290463454882e-07, |
| "loss": 0.0709, |
| "step": 652 |
| }, |
| { |
| "epoch": 1.670076726342711, |
| "grad_norm": 2.2968330630918956, |
| "learning_rate": 6.565438923491102e-07, |
| "loss": 0.0872, |
| "step": 653 |
| }, |
| { |
| "epoch": 1.6726342710997444, |
| "grad_norm": 1.6427468344680394, |
| "learning_rate": 6.466288388489689e-07, |
| "loss": 0.074, |
| "step": 654 |
| }, |
| { |
| "epoch": 1.6751918158567776, |
| "grad_norm": 1.49253726984557, |
| "learning_rate": 6.367840458674401e-07, |
| "loss": 0.0656, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.6777493606138107, |
| "grad_norm": 1.3826162361150278, |
| "learning_rate": 6.270096722929442e-07, |
| "loss": 0.0432, |
| "step": 656 |
| }, |
| { |
| "epoch": 1.680306905370844, |
| "grad_norm": 1.6151703090317941, |
| "learning_rate": 6.173058758773775e-07, |
| "loss": 0.0671, |
| "step": 657 |
| }, |
| { |
| "epoch": 1.682864450127877, |
| "grad_norm": 1.3594356593102335, |
| "learning_rate": 6.076728132335669e-07, |
| "loss": 0.049, |
| "step": 658 |
| }, |
| { |
| "epoch": 1.6854219948849105, |
| "grad_norm": 1.841090424763904, |
| "learning_rate": 5.981106398327463e-07, |
| "loss": 0.0746, |
| "step": 659 |
| }, |
| { |
| "epoch": 1.6879795396419437, |
| "grad_norm": 1.8813485121032132, |
| "learning_rate": 5.886195100020408e-07, |
| "loss": 0.0877, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.690537084398977, |
| "grad_norm": 1.8959291749076537, |
| "learning_rate": 5.7919957692198e-07, |
| "loss": 0.0643, |
| "step": 661 |
| }, |
| { |
| "epoch": 1.6930946291560103, |
| "grad_norm": 1.4397051152119766, |
| "learning_rate": 5.698509926240275e-07, |
| "loss": 0.0672, |
| "step": 662 |
| }, |
| { |
| "epoch": 1.6956521739130435, |
| "grad_norm": 1.6352253687560279, |
| "learning_rate": 5.60573907988124e-07, |
| "loss": 0.0704, |
| "step": 663 |
| }, |
| { |
| "epoch": 1.6982097186700766, |
| "grad_norm": 1.2272314127587733, |
| "learning_rate": 5.513684727402529e-07, |
| "loss": 0.0419, |
| "step": 664 |
| }, |
| { |
| "epoch": 1.7007672634271098, |
| "grad_norm": 1.2859127827275485, |
| "learning_rate": 5.422348354500217e-07, |
| "loss": 0.0482, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.7033248081841432, |
| "grad_norm": 2.3723626704904075, |
| "learning_rate": 5.331731435282705e-07, |
| "loss": 0.0711, |
| "step": 666 |
| }, |
| { |
| "epoch": 1.7058823529411766, |
| "grad_norm": 1.5034836312998365, |
| "learning_rate": 5.241835432246888e-07, |
| "loss": 0.0505, |
| "step": 667 |
| }, |
| { |
| "epoch": 1.7084398976982098, |
| "grad_norm": 1.4998179842301858, |
| "learning_rate": 5.152661796254505e-07, |
| "loss": 0.0428, |
| "step": 668 |
| }, |
| { |
| "epoch": 1.710997442455243, |
| "grad_norm": 1.884454942561692, |
| "learning_rate": 5.064211966508837e-07, |
| "loss": 0.0575, |
| "step": 669 |
| }, |
| { |
| "epoch": 1.7135549872122762, |
| "grad_norm": 1.8377467328078436, |
| "learning_rate": 4.976487370531352e-07, |
| "loss": 0.0699, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.7161125319693094, |
| "grad_norm": 1.7964279986811604, |
| "learning_rate": 4.88948942413876e-07, |
| "loss": 0.0804, |
| "step": 671 |
| }, |
| { |
| "epoch": 1.7186700767263428, |
| "grad_norm": 1.7189535141584609, |
| "learning_rate": 4.803219531420128e-07, |
| "loss": 0.0564, |
| "step": 672 |
| }, |
| { |
| "epoch": 1.721227621483376, |
| "grad_norm": 1.5505230731351738, |
| "learning_rate": 4.717679084714222e-07, |
| "loss": 0.05, |
| "step": 673 |
| }, |
| { |
| "epoch": 1.7237851662404093, |
| "grad_norm": 1.4214080005147414, |
| "learning_rate": 4.6328694645870254e-07, |
| "loss": 0.0644, |
| "step": 674 |
| }, |
| { |
| "epoch": 1.7263427109974425, |
| "grad_norm": 1.8241456107542313, |
| "learning_rate": 4.5487920398094465e-07, |
| "loss": 0.0679, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.7289002557544757, |
| "grad_norm": 1.3282311854023192, |
| "learning_rate": 4.46544816733529e-07, |
| "loss": 0.0503, |
| "step": 676 |
| }, |
| { |
| "epoch": 1.7314578005115089, |
| "grad_norm": 1.4122224176940064, |
| "learning_rate": 4.382839192279303e-07, |
| "loss": 0.0627, |
| "step": 677 |
| }, |
| { |
| "epoch": 1.734015345268542, |
| "grad_norm": 1.5345073928364736, |
| "learning_rate": 4.3009664478954384e-07, |
| "loss": 0.0468, |
| "step": 678 |
| }, |
| { |
| "epoch": 1.7365728900255755, |
| "grad_norm": 1.9191873098525403, |
| "learning_rate": 4.219831255555423e-07, |
| "loss": 0.0679, |
| "step": 679 |
| }, |
| { |
| "epoch": 1.7391304347826086, |
| "grad_norm": 1.4130394259241825, |
| "learning_rate": 4.139434924727359e-07, |
| "loss": 0.0372, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.741687979539642, |
| "grad_norm": 1.6440930896633388, |
| "learning_rate": 4.059778752954607e-07, |
| "loss": 0.0588, |
| "step": 681 |
| }, |
| { |
| "epoch": 1.7442455242966752, |
| "grad_norm": 1.6483657156936842, |
| "learning_rate": 3.9808640258348686e-07, |
| "loss": 0.0554, |
| "step": 682 |
| }, |
| { |
| "epoch": 1.7468030690537084, |
| "grad_norm": 1.600191039272227, |
| "learning_rate": 3.9026920169994374e-07, |
| "loss": 0.0651, |
| "step": 683 |
| }, |
| { |
| "epoch": 1.7493606138107416, |
| "grad_norm": 1.6033515242953238, |
| "learning_rate": 3.825263988092587e-07, |
| "loss": 0.0631, |
| "step": 684 |
| }, |
| { |
| "epoch": 1.7519181585677748, |
| "grad_norm": 1.412635066629858, |
| "learning_rate": 3.7485811887512714e-07, |
| "loss": 0.0572, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.7544757033248082, |
| "grad_norm": 1.7343917829704119, |
| "learning_rate": 3.672644856584928e-07, |
| "loss": 0.0653, |
| "step": 686 |
| }, |
| { |
| "epoch": 1.7570332480818416, |
| "grad_norm": 2.2332503744651544, |
| "learning_rate": 3.597456217155526e-07, |
| "loss": 0.0525, |
| "step": 687 |
| }, |
| { |
| "epoch": 1.7595907928388748, |
| "grad_norm": 1.559262559831483, |
| "learning_rate": 3.523016483957742e-07, |
| "loss": 0.0695, |
| "step": 688 |
| }, |
| { |
| "epoch": 1.762148337595908, |
| "grad_norm": 1.3735832247322428, |
| "learning_rate": 3.4493268583994434e-07, |
| "loss": 0.0514, |
| "step": 689 |
| }, |
| { |
| "epoch": 1.7647058823529411, |
| "grad_norm": 1.1390421075670185, |
| "learning_rate": 3.3763885297822153e-07, |
| "loss": 0.026, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.7672634271099743, |
| "grad_norm": 1.6138957382346333, |
| "learning_rate": 3.3042026752822254e-07, |
| "loss": 0.0727, |
| "step": 691 |
| }, |
| { |
| "epoch": 1.7698209718670077, |
| "grad_norm": 1.637531444373419, |
| "learning_rate": 3.2327704599312283e-07, |
| "loss": 0.0615, |
| "step": 692 |
| }, |
| { |
| "epoch": 1.772378516624041, |
| "grad_norm": 1.7910759929217885, |
| "learning_rate": 3.16209303659773e-07, |
| "loss": 0.062, |
| "step": 693 |
| }, |
| { |
| "epoch": 1.7749360613810743, |
| "grad_norm": 1.5204492548400286, |
| "learning_rate": 3.0921715459683753e-07, |
| "loss": 0.048, |
| "step": 694 |
| }, |
| { |
| "epoch": 1.7774936061381075, |
| "grad_norm": 1.623940846315055, |
| "learning_rate": 3.0230071165295804e-07, |
| "loss": 0.0518, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.7800511508951407, |
| "grad_norm": 1.7492311424019007, |
| "learning_rate": 2.95460086454929e-07, |
| "loss": 0.0599, |
| "step": 696 |
| }, |
| { |
| "epoch": 1.7826086956521738, |
| "grad_norm": 1.8374659643706557, |
| "learning_rate": 2.88695389405898e-07, |
| "loss": 0.0682, |
| "step": 697 |
| }, |
| { |
| "epoch": 1.785166240409207, |
| "grad_norm": 1.291638322992873, |
| "learning_rate": 2.820067296835799e-07, |
| "loss": 0.0459, |
| "step": 698 |
| }, |
| { |
| "epoch": 1.7877237851662404, |
| "grad_norm": 1.6932187157901324, |
| "learning_rate": 2.753942152385014e-07, |
| "loss": 0.0568, |
| "step": 699 |
| }, |
| { |
| "epoch": 1.7902813299232738, |
| "grad_norm": 1.5799764190543009, |
| "learning_rate": 2.688579527922514e-07, |
| "loss": 0.0474, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.792838874680307, |
| "grad_norm": 1.506629182713951, |
| "learning_rate": 2.6239804783576294e-07, |
| "loss": 0.0557, |
| "step": 701 |
| }, |
| { |
| "epoch": 1.7953964194373402, |
| "grad_norm": 1.9630080012456346, |
| "learning_rate": 2.560146046276135e-07, |
| "loss": 0.0818, |
| "step": 702 |
| }, |
| { |
| "epoch": 1.7979539641943734, |
| "grad_norm": 1.6836710184467403, |
| "learning_rate": 2.4970772619233475e-07, |
| "loss": 0.0743, |
| "step": 703 |
| }, |
| { |
| "epoch": 1.8005115089514065, |
| "grad_norm": 2.0916698780327025, |
| "learning_rate": 2.4347751431875453e-07, |
| "loss": 0.0797, |
| "step": 704 |
| }, |
| { |
| "epoch": 1.80306905370844, |
| "grad_norm": 1.7885774176909262, |
| "learning_rate": 2.373240695583534e-07, |
| "loss": 0.0772, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.8056265984654731, |
| "grad_norm": 1.4871829837803723, |
| "learning_rate": 2.3124749122364286e-07, |
| "loss": 0.064, |
| "step": 706 |
| }, |
| { |
| "epoch": 1.8081841432225065, |
| "grad_norm": 1.5766080256272554, |
| "learning_rate": 2.2524787738656073e-07, |
| "loss": 0.0584, |
| "step": 707 |
| }, |
| { |
| "epoch": 1.8107416879795397, |
| "grad_norm": 1.6532530455773482, |
| "learning_rate": 2.1932532487688784e-07, |
| "loss": 0.0528, |
| "step": 708 |
| }, |
| { |
| "epoch": 1.813299232736573, |
| "grad_norm": 1.4446085749985063, |
| "learning_rate": 2.1347992928068884e-07, |
| "loss": 0.0541, |
| "step": 709 |
| }, |
| { |
| "epoch": 1.815856777493606, |
| "grad_norm": 0.975992197968072, |
| "learning_rate": 2.0771178493876387e-07, |
| "loss": 0.03, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.8184143222506393, |
| "grad_norm": 1.870335934306222, |
| "learning_rate": 2.0202098494513157e-07, |
| "loss": 0.0782, |
| "step": 711 |
| }, |
| { |
| "epoch": 1.8209718670076727, |
| "grad_norm": 1.5422113588288993, |
| "learning_rate": 1.964076211455246e-07, |
| "loss": 0.0528, |
| "step": 712 |
| }, |
| { |
| "epoch": 1.8235294117647058, |
| "grad_norm": 1.805338069694364, |
| "learning_rate": 1.908717841359048e-07, |
| "loss": 0.0602, |
| "step": 713 |
| }, |
| { |
| "epoch": 1.8260869565217392, |
| "grad_norm": 1.7608744884671308, |
| "learning_rate": 1.8541356326100436e-07, |
| "loss": 0.0512, |
| "step": 714 |
| }, |
| { |
| "epoch": 1.8286445012787724, |
| "grad_norm": 1.7866867532191502, |
| "learning_rate": 1.800330466128808e-07, |
| "loss": 0.0685, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.8312020460358056, |
| "grad_norm": 1.2787390793668814, |
| "learning_rate": 1.7473032102949983e-07, |
| "loss": 0.0427, |
| "step": 716 |
| }, |
| { |
| "epoch": 1.8337595907928388, |
| "grad_norm": 2.347960248720825, |
| "learning_rate": 1.695054720933309e-07, |
| "loss": 0.0937, |
| "step": 717 |
| }, |
| { |
| "epoch": 1.836317135549872, |
| "grad_norm": 1.830326258197173, |
| "learning_rate": 1.6435858412996275e-07, |
| "loss": 0.0752, |
| "step": 718 |
| }, |
| { |
| "epoch": 1.8388746803069054, |
| "grad_norm": 2.0209211652014583, |
| "learning_rate": 1.5928974020674947e-07, |
| "loss": 0.0645, |
| "step": 719 |
| }, |
| { |
| "epoch": 1.8414322250639388, |
| "grad_norm": 1.483717555265772, |
| "learning_rate": 1.542990221314644e-07, |
| "loss": 0.0429, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.843989769820972, |
| "grad_norm": 1.6080693955603764, |
| "learning_rate": 1.4938651045098174e-07, |
| "loss": 0.0402, |
| "step": 721 |
| }, |
| { |
| "epoch": 1.8465473145780051, |
| "grad_norm": 1.6364487443092068, |
| "learning_rate": 1.445522844499775e-07, |
| "loss": 0.0515, |
| "step": 722 |
| }, |
| { |
| "epoch": 1.8491048593350383, |
| "grad_norm": 2.2286488872159835, |
| "learning_rate": 1.3979642214964728e-07, |
| "loss": 0.0792, |
| "step": 723 |
| }, |
| { |
| "epoch": 1.8516624040920715, |
| "grad_norm": 1.8358051982313959, |
| "learning_rate": 1.3511900030644954e-07, |
| "loss": 0.0718, |
| "step": 724 |
| }, |
| { |
| "epoch": 1.854219948849105, |
| "grad_norm": 3.327820933989785, |
| "learning_rate": 1.3052009441086533e-07, |
| "loss": 0.0665, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.856777493606138, |
| "grad_norm": 1.54562580037595, |
| "learning_rate": 1.2599977868618052e-07, |
| "loss": 0.0488, |
| "step": 726 |
| }, |
| { |
| "epoch": 1.8593350383631715, |
| "grad_norm": 1.768894170651948, |
| "learning_rate": 1.215581260872889e-07, |
| "loss": 0.0727, |
| "step": 727 |
| }, |
| { |
| "epoch": 1.8618925831202047, |
| "grad_norm": 1.1097850346664122, |
| "learning_rate": 1.1719520829951203e-07, |
| "loss": 0.0361, |
| "step": 728 |
| }, |
| { |
| "epoch": 1.8644501278772379, |
| "grad_norm": 1.4290151454014999, |
| "learning_rate": 1.1291109573744574e-07, |
| "loss": 0.0378, |
| "step": 729 |
| }, |
| { |
| "epoch": 1.867007672634271, |
| "grad_norm": 1.8281387881477926, |
| "learning_rate": 1.087058575438199e-07, |
| "loss": 0.0734, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.8695652173913042, |
| "grad_norm": 1.4987669648542046, |
| "learning_rate": 1.0457956158838545e-07, |
| "loss": 0.0649, |
| "step": 731 |
| }, |
| { |
| "epoch": 1.8721227621483376, |
| "grad_norm": 1.2852375949245314, |
| "learning_rate": 1.0053227446681912e-07, |
| "loss": 0.0502, |
| "step": 732 |
| }, |
| { |
| "epoch": 1.8746803069053708, |
| "grad_norm": 1.819977919497317, |
| "learning_rate": 9.656406149964548e-08, |
| "loss": 0.0784, |
| "step": 733 |
| }, |
| { |
| "epoch": 1.8772378516624042, |
| "grad_norm": 1.668856802723362, |
| "learning_rate": 9.267498673118547e-08, |
| "loss": 0.0702, |
| "step": 734 |
| }, |
| { |
| "epoch": 1.8797953964194374, |
| "grad_norm": 1.5831651881175812, |
| "learning_rate": 8.886511292852395e-08, |
| "loss": 0.0635, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.8823529411764706, |
| "grad_norm": 1.690650435572024, |
| "learning_rate": 8.513450158049109e-08, |
| "loss": 0.0664, |
| "step": 736 |
| }, |
| { |
| "epoch": 1.8849104859335037, |
| "grad_norm": 1.5261545346775505, |
| "learning_rate": 8.148321289667749e-08, |
| "loss": 0.0562, |
| "step": 737 |
| }, |
| { |
| "epoch": 1.887468030690537, |
| "grad_norm": 1.1138146339084107, |
| "learning_rate": 7.791130580645623e-08, |
| "loss": 0.0414, |
| "step": 738 |
| }, |
| { |
| "epoch": 1.8900255754475703, |
| "grad_norm": 1.5520967732615027, |
| "learning_rate": 7.441883795803462e-08, |
| "loss": 0.0551, |
| "step": 739 |
| }, |
| { |
| "epoch": 1.8925831202046037, |
| "grad_norm": 1.5613125152011387, |
| "learning_rate": 7.100586571752444e-08, |
| "loss": 0.0384, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.895140664961637, |
| "grad_norm": 1.316176028402755, |
| "learning_rate": 6.767244416802988e-08, |
| "loss": 0.0455, |
| "step": 741 |
| }, |
| { |
| "epoch": 1.89769820971867, |
| "grad_norm": 1.6995534435523871, |
| "learning_rate": 6.441862710876102e-08, |
| "loss": 0.0664, |
| "step": 742 |
| }, |
| { |
| "epoch": 1.9002557544757033, |
| "grad_norm": 1.4435916963264959, |
| "learning_rate": 6.124446705416343e-08, |
| "loss": 0.0433, |
| "step": 743 |
| }, |
| { |
| "epoch": 1.9028132992327365, |
| "grad_norm": 1.3955512049918404, |
| "learning_rate": 5.815001523307162e-08, |
| "loss": 0.0523, |
| "step": 744 |
| }, |
| { |
| "epoch": 1.9053708439897699, |
| "grad_norm": 1.5549298687122213, |
| "learning_rate": 5.513532158788193e-08, |
| "loss": 0.0632, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.907928388746803, |
| "grad_norm": 2.0565454206759366, |
| "learning_rate": 5.220043477374759e-08, |
| "loss": 0.0906, |
| "step": 746 |
| }, |
| { |
| "epoch": 1.9104859335038364, |
| "grad_norm": 1.781632120564299, |
| "learning_rate": 4.934540215779271e-08, |
| "loss": 0.0575, |
| "step": 747 |
| }, |
| { |
| "epoch": 1.9130434782608696, |
| "grad_norm": 1.7209181847754342, |
| "learning_rate": 4.657026981834623e-08, |
| "loss": 0.0758, |
| "step": 748 |
| }, |
| { |
| "epoch": 1.9156010230179028, |
| "grad_norm": 1.5843098732810394, |
| "learning_rate": 4.3875082544201364e-08, |
| "loss": 0.0652, |
| "step": 749 |
| }, |
| { |
| "epoch": 1.918158567774936, |
| "grad_norm": 1.694771633678275, |
| "learning_rate": 4.125988383388957e-08, |
| "loss": 0.0661, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.9207161125319692, |
| "grad_norm": 1.5171207087239054, |
| "learning_rate": 3.87247158949805e-08, |
| "loss": 0.0498, |
| "step": 751 |
| }, |
| { |
| "epoch": 1.9232736572890026, |
| "grad_norm": 1.5041197496139775, |
| "learning_rate": 3.626961964340203e-08, |
| "loss": 0.0687, |
| "step": 752 |
| }, |
| { |
| "epoch": 1.9258312020460358, |
| "grad_norm": 1.901653590307097, |
| "learning_rate": 3.389463470277576e-08, |
| "loss": 0.0807, |
| "step": 753 |
| }, |
| { |
| "epoch": 1.9283887468030692, |
| "grad_norm": 2.073140635948293, |
| "learning_rate": 3.159979940378088e-08, |
| "loss": 0.0718, |
| "step": 754 |
| }, |
| { |
| "epoch": 1.9309462915601023, |
| "grad_norm": 1.92991294935061, |
| "learning_rate": 2.938515078353521e-08, |
| "loss": 0.0612, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.9335038363171355, |
| "grad_norm": 1.4209233459544497, |
| "learning_rate": 2.725072458499567e-08, |
| "loss": 0.0598, |
| "step": 756 |
| }, |
| { |
| "epoch": 1.9360613810741687, |
| "grad_norm": 2.270158801782062, |
| "learning_rate": 2.519655525638376e-08, |
| "loss": 0.1167, |
| "step": 757 |
| }, |
| { |
| "epoch": 1.938618925831202, |
| "grad_norm": 1.7764725333130844, |
| "learning_rate": 2.3222675950627106e-08, |
| "loss": 0.0493, |
| "step": 758 |
| }, |
| { |
| "epoch": 1.9411764705882353, |
| "grad_norm": 1.3953632649829264, |
| "learning_rate": 2.1329118524827662e-08, |
| "loss": 0.0486, |
| "step": 759 |
| }, |
| { |
| "epoch": 1.9437340153452687, |
| "grad_norm": 1.5719732312743544, |
| "learning_rate": 1.9515913539743247e-08, |
| "loss": 0.057, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.9462915601023019, |
| "grad_norm": 1.2771960619657408, |
| "learning_rate": 1.7783090259297918e-08, |
| "loss": 0.0521, |
| "step": 761 |
| }, |
| { |
| "epoch": 1.948849104859335, |
| "grad_norm": 1.5534397400552928, |
| "learning_rate": 1.613067665010959e-08, |
| "loss": 0.0479, |
| "step": 762 |
| }, |
| { |
| "epoch": 1.9514066496163682, |
| "grad_norm": 1.9209034504241524, |
| "learning_rate": 1.4558699381034825e-08, |
| "loss": 0.0744, |
| "step": 763 |
| }, |
| { |
| "epoch": 1.9539641943734014, |
| "grad_norm": 1.6535913379119196, |
| "learning_rate": 1.3067183822742525e-08, |
| "loss": 0.0505, |
| "step": 764 |
| }, |
| { |
| "epoch": 1.9565217391304348, |
| "grad_norm": 1.3997237372409204, |
| "learning_rate": 1.1656154047303691e-08, |
| "loss": 0.0364, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.959079283887468, |
| "grad_norm": 2.374063223958248, |
| "learning_rate": 1.0325632827801745e-08, |
| "loss": 0.068, |
| "step": 766 |
| }, |
| { |
| "epoch": 1.9616368286445014, |
| "grad_norm": 1.8265000509614693, |
| "learning_rate": 9.075641637964483e-09, |
| "loss": 0.0549, |
| "step": 767 |
| }, |
| { |
| "epoch": 1.9641943734015346, |
| "grad_norm": 1.6206955235826634, |
| "learning_rate": 7.906200651819907e-09, |
| "loss": 0.045, |
| "step": 768 |
| }, |
| { |
| "epoch": 1.9667519181585678, |
| "grad_norm": 1.7353163027983434, |
| "learning_rate": 6.817328743368712e-09, |
| "loss": 0.0597, |
| "step": 769 |
| }, |
| { |
| "epoch": 1.969309462915601, |
| "grad_norm": 1.7398008950469799, |
| "learning_rate": 5.809043486279531e-09, |
| "loss": 0.0822, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.9718670076726341, |
| "grad_norm": 1.75495701677338, |
| "learning_rate": 4.881361153606934e-09, |
| "loss": 0.0518, |
| "step": 771 |
| }, |
| { |
| "epoch": 1.9744245524296675, |
| "grad_norm": 1.4225600685776296, |
| "learning_rate": 4.034296717527752e-09, |
| "loss": 0.0598, |
| "step": 772 |
| }, |
| { |
| "epoch": 1.976982097186701, |
| "grad_norm": 1.5449322680878437, |
| "learning_rate": 3.2678638490996064e-09, |
| "loss": 0.0419, |
| "step": 773 |
| }, |
| { |
| "epoch": 1.979539641943734, |
| "grad_norm": 1.5752118407330762, |
| "learning_rate": 2.5820749180388573e-09, |
| "loss": 0.069, |
| "step": 774 |
| }, |
| { |
| "epoch": 1.9820971867007673, |
| "grad_norm": 1.6779528904252332, |
| "learning_rate": 1.976940992523546e-09, |
| "loss": 0.0623, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.9846547314578005, |
| "grad_norm": 2.1188662150836888, |
| "learning_rate": 1.4524718390140913e-09, |
| "loss": 0.0973, |
| "step": 776 |
| }, |
| { |
| "epoch": 1.9872122762148337, |
| "grad_norm": 1.336481358589082, |
| "learning_rate": 1.0086759220934162e-09, |
| "loss": 0.0532, |
| "step": 777 |
| }, |
| { |
| "epoch": 1.989769820971867, |
| "grad_norm": 1.4512712789745728, |
| "learning_rate": 6.455604043331676e-10, |
| "loss": 0.0552, |
| "step": 778 |
| }, |
| { |
| "epoch": 1.9923273657289002, |
| "grad_norm": 1.0684958274484193, |
| "learning_rate": 3.631311461765874e-10, |
| "loss": 0.0324, |
| "step": 779 |
| }, |
| { |
| "epoch": 1.9948849104859336, |
| "grad_norm": 2.7803294897735515, |
| "learning_rate": 1.6139270584358823e-10, |
| "loss": 0.076, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.9974424552429668, |
| "grad_norm": 1.7051238941938596, |
| "learning_rate": 4.034833925969928e-11, |
| "loss": 0.0641, |
| "step": 781 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.9818075119000704, |
| "learning_rate": 0.0, |
| "loss": 0.0691, |
| "step": 782 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 782, |
| "total_flos": 4414597447680.0, |
| "train_loss": 0.1485485090061908, |
| "train_runtime": 1713.8206, |
| "train_samples_per_second": 3.65, |
| "train_steps_per_second": 0.456 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 782, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 50000000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4414597447680.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |