diff --git "a/checkpoint-4044/trainer_state.json" "b/checkpoint-4044/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4044/trainer_state.json" @@ -0,0 +1,28341 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4044, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002472799208704253, + "grad_norm": 2.5362265715360204, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.2213, + "step": 1 + }, + { + "epoch": 0.0004945598417408506, + "grad_norm": 2.6997408857983025, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.231, + "step": 2 + }, + { + "epoch": 0.000741839762611276, + "grad_norm": 2.5867181090965676, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.249, + "step": 3 + }, + { + "epoch": 0.0009891196834817012, + "grad_norm": 2.357638460408377, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.2146, + "step": 4 + }, + { + "epoch": 0.0012363996043521265, + "grad_norm": 2.557414917391326, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.2872, + "step": 5 + }, + { + "epoch": 0.001483679525222552, + "grad_norm": 2.3734045786673423, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.2081, + "step": 6 + }, + { + "epoch": 0.0017309594460929772, + "grad_norm": 2.621127934186416, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.2374, + "step": 7 + }, + { + "epoch": 0.0019782393669634025, + "grad_norm": 2.7716435700320714, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.2364, + "step": 8 + }, + { + "epoch": 0.002225519287833828, + "grad_norm": 2.301737328244618, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.2219, + "step": 9 + }, + { + "epoch": 0.002472799208704253, + "grad_norm": 2.5519952552831398, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2372, + "step": 10 + }, + { + "epoch": 0.0027200791295746785, + "grad_norm": 2.3667701483261667, + "learning_rate": 5.5e-07, + "loss": 1.1995, + "step": 11 + }, + { + "epoch": 0.002967359050445104, + "grad_norm": 2.3437329583372604, + "learning_rate": 6.000000000000001e-07, + "loss": 1.2414, + "step": 12 + }, + { + "epoch": 0.003214638971315529, + "grad_norm": 2.060580346749902, + "learning_rate": 6.5e-07, + "loss": 1.2034, + "step": 13 + }, + { + "epoch": 0.0034619188921859545, + "grad_norm": 2.0573625343235564, + "learning_rate": 7.000000000000001e-07, + "loss": 1.2025, + "step": 14 + }, + { + "epoch": 0.00370919881305638, + "grad_norm": 2.374658307654524, + "learning_rate": 7.5e-07, + "loss": 1.2111, + "step": 15 + }, + { + "epoch": 0.003956478733926805, + "grad_norm": 2.134642801469451, + "learning_rate": 8.000000000000001e-07, + "loss": 1.227, + "step": 16 + }, + { + "epoch": 0.0042037586547972305, + "grad_norm": 2.2681061588439424, + "learning_rate": 8.500000000000001e-07, + "loss": 1.1964, + "step": 17 + }, + { + "epoch": 0.004451038575667656, + "grad_norm": 1.8506691640366166, + "learning_rate": 9.000000000000001e-07, + "loss": 1.1977, + "step": 18 + }, + { + "epoch": 0.004698318496538081, + "grad_norm": 1.948439188694478, + "learning_rate": 9.500000000000001e-07, + "loss": 1.2006, + "step": 19 + }, + { + "epoch": 0.004945598417408506, + "grad_norm": 1.6605613980729736, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.1666, + "step": 20 + }, + { + "epoch": 0.0051928783382789315, + "grad_norm": 1.6922633947680867, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.1989, + "step": 21 + }, + { + "epoch": 0.005440158259149357, + "grad_norm": 1.5971190787663854, + "learning_rate": 1.1e-06, + "loss": 1.1874, + "step": 22 + }, + { + "epoch": 0.0056874381800197825, + "grad_norm": 1.5631402047790957, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.187, + "step": 23 + }, + { + "epoch": 0.005934718100890208, + "grad_norm": 1.4965437093134575, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.1685, + "step": 24 + }, + { + "epoch": 0.006181998021760633, + "grad_norm": 1.4346303928048703, + "learning_rate": 1.25e-06, + "loss": 1.1712, + "step": 25 + }, + { + "epoch": 0.006429277942631058, + "grad_norm": 1.4575013024426748, + "learning_rate": 1.3e-06, + "loss": 1.1598, + "step": 26 + }, + { + "epoch": 0.0066765578635014835, + "grad_norm": 1.328401240687131, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.1159, + "step": 27 + }, + { + "epoch": 0.006923837784371909, + "grad_norm": 1.4155504497318074, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.1471, + "step": 28 + }, + { + "epoch": 0.0071711177052423344, + "grad_norm": 1.2794817067434705, + "learning_rate": 1.45e-06, + "loss": 1.1289, + "step": 29 + }, + { + "epoch": 0.00741839762611276, + "grad_norm": 1.2986984654256946, + "learning_rate": 1.5e-06, + "loss": 1.1103, + "step": 30 + }, + { + "epoch": 0.007665677546983185, + "grad_norm": 1.1904601251518023, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.1071, + "step": 31 + }, + { + "epoch": 0.00791295746785361, + "grad_norm": 1.211314439028396, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1066, + "step": 32 + }, + { + "epoch": 0.008160237388724036, + "grad_norm": 1.1889594075249947, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.1016, + "step": 33 + }, + { + "epoch": 0.008407517309594461, + "grad_norm": 1.144650796083838, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.0846, + "step": 34 + }, + { + "epoch": 0.008654797230464886, + "grad_norm": 1.1047081836619403, + "learning_rate": 1.75e-06, + "loss": 1.052, + "step": 35 + }, + { + "epoch": 0.008902077151335312, + "grad_norm": 1.1224289563936236, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.083, + "step": 36 + }, + { + "epoch": 0.009149357072205737, + "grad_norm": 1.165835379811125, + "learning_rate": 1.85e-06, + "loss": 1.0743, + "step": 37 + }, + { + "epoch": 0.009396636993076163, + "grad_norm": 1.2275687643208097, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0349, + "step": 38 + }, + { + "epoch": 0.009643916913946587, + "grad_norm": 1.1972161192753212, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.016, + "step": 39 + }, + { + "epoch": 0.009891196834817012, + "grad_norm": 1.082058322089044, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.0162, + "step": 40 + }, + { + "epoch": 0.010138476755687438, + "grad_norm": 1.0697734561517596, + "learning_rate": 2.05e-06, + "loss": 1.002, + "step": 41 + }, + { + "epoch": 0.010385756676557863, + "grad_norm": 1.0899500742647537, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9707, + "step": 42 + }, + { + "epoch": 0.01063303659742829, + "grad_norm": 1.0591744139232069, + "learning_rate": 2.15e-06, + "loss": 0.9578, + "step": 43 + }, + { + "epoch": 0.010880316518298714, + "grad_norm": 1.0360370041311253, + "learning_rate": 2.2e-06, + "loss": 0.9401, + "step": 44 + }, + { + "epoch": 0.01112759643916914, + "grad_norm": 0.9882030779618665, + "learning_rate": 2.25e-06, + "loss": 0.9465, + "step": 45 + }, + { + "epoch": 0.011374876360039565, + "grad_norm": 1.101171577063728, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.9889, + "step": 46 + }, + { + "epoch": 0.01162215628090999, + "grad_norm": 0.967072842509527, + "learning_rate": 2.35e-06, + "loss": 0.9713, + "step": 47 + }, + { + "epoch": 0.011869436201780416, + "grad_norm": 0.978495110160977, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.9296, + "step": 48 + }, + { + "epoch": 0.01211671612265084, + "grad_norm": 0.9606674955474925, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.9297, + "step": 49 + }, + { + "epoch": 0.012363996043521267, + "grad_norm": 1.0561055229672853, + "learning_rate": 2.5e-06, + "loss": 0.8987, + "step": 50 + }, + { + "epoch": 0.012611275964391691, + "grad_norm": 0.9661450584899874, + "learning_rate": 2.55e-06, + "loss": 0.9205, + "step": 51 + }, + { + "epoch": 0.012858555885262116, + "grad_norm": 0.9327207093078266, + "learning_rate": 2.6e-06, + "loss": 0.9184, + "step": 52 + }, + { + "epoch": 0.013105835806132542, + "grad_norm": 0.9416813080118679, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.9, + "step": 53 + }, + { + "epoch": 0.013353115727002967, + "grad_norm": 0.8793376312707565, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.8655, + "step": 54 + }, + { + "epoch": 0.013600395647873393, + "grad_norm": 0.8190962235195288, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.8803, + "step": 55 + }, + { + "epoch": 0.013847675568743818, + "grad_norm": 0.8728021412747534, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.8669, + "step": 56 + }, + { + "epoch": 0.014094955489614243, + "grad_norm": 0.945032511017986, + "learning_rate": 2.85e-06, + "loss": 0.8457, + "step": 57 + }, + { + "epoch": 0.014342235410484669, + "grad_norm": 0.8824615112963289, + "learning_rate": 2.9e-06, + "loss": 0.8369, + "step": 58 + }, + { + "epoch": 0.014589515331355093, + "grad_norm": 0.8431299610759388, + "learning_rate": 2.95e-06, + "loss": 0.848, + "step": 59 + }, + { + "epoch": 0.01483679525222552, + "grad_norm": 0.8541187789355988, + "learning_rate": 3e-06, + "loss": 0.8253, + "step": 60 + }, + { + "epoch": 0.015084075173095944, + "grad_norm": 0.774863931803644, + "learning_rate": 3.05e-06, + "loss": 0.8426, + "step": 61 + }, + { + "epoch": 0.01533135509396637, + "grad_norm": 0.7924045073787536, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.8175, + "step": 62 + }, + { + "epoch": 0.015578635014836795, + "grad_norm": 0.7945600224177056, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.8014, + "step": 63 + }, + { + "epoch": 0.01582591493570722, + "grad_norm": 0.8081849485275809, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7814, + "step": 64 + }, + { + "epoch": 0.016073194856577645, + "grad_norm": 0.7702852209704323, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7893, + "step": 65 + }, + { + "epoch": 0.016320474777448073, + "grad_norm": 0.7383664978090273, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7989, + "step": 66 + }, + { + "epoch": 0.016567754698318497, + "grad_norm": 0.7436837377486586, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.8058, + "step": 67 + }, + { + "epoch": 0.016815034619188922, + "grad_norm": 0.7428905105848088, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.7963, + "step": 68 + }, + { + "epoch": 0.017062314540059347, + "grad_norm": 0.75729956851682, + "learning_rate": 3.45e-06, + "loss": 0.7828, + "step": 69 + }, + { + "epoch": 0.01730959446092977, + "grad_norm": 0.6918807554019548, + "learning_rate": 3.5e-06, + "loss": 0.79, + "step": 70 + }, + { + "epoch": 0.0175568743818002, + "grad_norm": 0.7250929492991405, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7984, + "step": 71 + }, + { + "epoch": 0.017804154302670624, + "grad_norm": 0.7523258703265291, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7794, + "step": 72 + }, + { + "epoch": 0.01805143422354105, + "grad_norm": 0.7596189403247041, + "learning_rate": 3.65e-06, + "loss": 0.7933, + "step": 73 + }, + { + "epoch": 0.018298714144411473, + "grad_norm": 0.7098250319729716, + "learning_rate": 3.7e-06, + "loss": 0.7639, + "step": 74 + }, + { + "epoch": 0.018545994065281898, + "grad_norm": 0.7325182854684962, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7613, + "step": 75 + }, + { + "epoch": 0.018793273986152326, + "grad_norm": 0.7656692952105425, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7481, + "step": 76 + }, + { + "epoch": 0.01904055390702275, + "grad_norm": 0.7299978668203949, + "learning_rate": 3.85e-06, + "loss": 0.7839, + "step": 77 + }, + { + "epoch": 0.019287833827893175, + "grad_norm": 0.7157606053500201, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7474, + "step": 78 + }, + { + "epoch": 0.0195351137487636, + "grad_norm": 0.7324944560450859, + "learning_rate": 3.95e-06, + "loss": 0.763, + "step": 79 + }, + { + "epoch": 0.019782393669634024, + "grad_norm": 0.7816018694216577, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7219, + "step": 80 + }, + { + "epoch": 0.020029673590504452, + "grad_norm": 0.7375563499331299, + "learning_rate": 4.05e-06, + "loss": 0.7654, + "step": 81 + }, + { + "epoch": 0.020276953511374877, + "grad_norm": 0.7026975821766478, + "learning_rate": 4.1e-06, + "loss": 0.7733, + "step": 82 + }, + { + "epoch": 0.0205242334322453, + "grad_norm": 0.6991770359109263, + "learning_rate": 4.15e-06, + "loss": 0.7457, + "step": 83 + }, + { + "epoch": 0.020771513353115726, + "grad_norm": 0.7405828710814198, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.7471, + "step": 84 + }, + { + "epoch": 0.02101879327398615, + "grad_norm": 0.7172297475686586, + "learning_rate": 4.25e-06, + "loss": 0.7201, + "step": 85 + }, + { + "epoch": 0.02126607319485658, + "grad_norm": 0.7029168304540451, + "learning_rate": 4.3e-06, + "loss": 0.7559, + "step": 86 + }, + { + "epoch": 0.021513353115727003, + "grad_norm": 0.723359349050235, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7287, + "step": 87 + }, + { + "epoch": 0.021760633036597428, + "grad_norm": 0.7357471244599104, + "learning_rate": 4.4e-06, + "loss": 0.7218, + "step": 88 + }, + { + "epoch": 0.022007912957467853, + "grad_norm": 0.7747133612650493, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7442, + "step": 89 + }, + { + "epoch": 0.02225519287833828, + "grad_norm": 0.6819171919004213, + "learning_rate": 4.5e-06, + "loss": 0.7168, + "step": 90 + }, + { + "epoch": 0.022502472799208705, + "grad_norm": 0.6919857424868104, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7271, + "step": 91 + }, + { + "epoch": 0.02274975272007913, + "grad_norm": 0.7250166917172235, + "learning_rate": 4.600000000000001e-06, + "loss": 0.7335, + "step": 92 + }, + { + "epoch": 0.022997032640949554, + "grad_norm": 0.760740711812877, + "learning_rate": 4.65e-06, + "loss": 0.705, + "step": 93 + }, + { + "epoch": 0.02324431256181998, + "grad_norm": 0.7159571116471827, + "learning_rate": 4.7e-06, + "loss": 0.7249, + "step": 94 + }, + { + "epoch": 0.023491592482690407, + "grad_norm": 0.7112056318387862, + "learning_rate": 4.75e-06, + "loss": 0.7221, + "step": 95 + }, + { + "epoch": 0.02373887240356083, + "grad_norm": 0.7407900000388679, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7094, + "step": 96 + }, + { + "epoch": 0.023986152324431256, + "grad_norm": 0.72179972775552, + "learning_rate": 4.85e-06, + "loss": 0.6995, + "step": 97 + }, + { + "epoch": 0.02423343224530168, + "grad_norm": 0.6561750079352092, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.6889, + "step": 98 + }, + { + "epoch": 0.024480712166172106, + "grad_norm": 0.7107165656217939, + "learning_rate": 4.95e-06, + "loss": 0.7065, + "step": 99 + }, + { + "epoch": 0.024727992087042534, + "grad_norm": 0.7536703835680636, + "learning_rate": 5e-06, + "loss": 0.738, + "step": 100 + }, + { + "epoch": 0.024975272007912958, + "grad_norm": 0.7212863280521654, + "learning_rate": 4.999999978871334e-06, + "loss": 0.7177, + "step": 101 + }, + { + "epoch": 0.025222551928783383, + "grad_norm": 0.7418268751267123, + "learning_rate": 4.9999999154853315e-06, + "loss": 0.7188, + "step": 102 + }, + { + "epoch": 0.025469831849653807, + "grad_norm": 0.7559916354416649, + "learning_rate": 4.999999809841997e-06, + "loss": 0.7025, + "step": 103 + }, + { + "epoch": 0.025717111770524232, + "grad_norm": 0.7811551138440429, + "learning_rate": 4.999999661941331e-06, + "loss": 0.7073, + "step": 104 + }, + { + "epoch": 0.02596439169139466, + "grad_norm": 0.7497301361586501, + "learning_rate": 4.999999471783337e-06, + "loss": 0.6692, + "step": 105 + }, + { + "epoch": 0.026211671612265085, + "grad_norm": 0.822184378582824, + "learning_rate": 4.999999239368016e-06, + "loss": 0.7119, + "step": 106 + }, + { + "epoch": 0.02645895153313551, + "grad_norm": 0.8275732064267906, + "learning_rate": 4.999998964695375e-06, + "loss": 0.724, + "step": 107 + }, + { + "epoch": 0.026706231454005934, + "grad_norm": 0.7022572200240244, + "learning_rate": 4.9999986477654165e-06, + "loss": 0.6907, + "step": 108 + }, + { + "epoch": 0.02695351137487636, + "grad_norm": 0.733280029857595, + "learning_rate": 4.999998288578146e-06, + "loss": 0.6759, + "step": 109 + }, + { + "epoch": 0.027200791295746787, + "grad_norm": 0.7526378507475411, + "learning_rate": 4.9999978871335695e-06, + "loss": 0.6685, + "step": 110 + }, + { + "epoch": 0.02744807121661721, + "grad_norm": 0.7597255866628462, + "learning_rate": 4.999997443431694e-06, + "loss": 0.7168, + "step": 111 + }, + { + "epoch": 0.027695351137487636, + "grad_norm": 0.7397163297259713, + "learning_rate": 4.999996957472529e-06, + "loss": 0.6834, + "step": 112 + }, + { + "epoch": 0.02794263105835806, + "grad_norm": 0.7434766561795023, + "learning_rate": 4.999996429256079e-06, + "loss": 0.6713, + "step": 113 + }, + { + "epoch": 0.028189910979228485, + "grad_norm": 0.688054583388141, + "learning_rate": 4.9999958587823565e-06, + "loss": 0.6771, + "step": 114 + }, + { + "epoch": 0.028437190900098913, + "grad_norm": 0.7807222470327964, + "learning_rate": 4.999995246051368e-06, + "loss": 0.6803, + "step": 115 + }, + { + "epoch": 0.028684470820969338, + "grad_norm": 0.7626145030682552, + "learning_rate": 4.999994591063126e-06, + "loss": 0.6709, + "step": 116 + }, + { + "epoch": 0.028931750741839762, + "grad_norm": 0.712857303236686, + "learning_rate": 4.999993893817641e-06, + "loss": 0.6987, + "step": 117 + }, + { + "epoch": 0.029179030662710187, + "grad_norm": 0.7175670694950973, + "learning_rate": 4.999993154314924e-06, + "loss": 0.6656, + "step": 118 + }, + { + "epoch": 0.02942631058358061, + "grad_norm": 0.8126016444934842, + "learning_rate": 4.999992372554989e-06, + "loss": 0.7236, + "step": 119 + }, + { + "epoch": 0.02967359050445104, + "grad_norm": 0.737989445104509, + "learning_rate": 4.999991548537848e-06, + "loss": 0.6728, + "step": 120 + }, + { + "epoch": 0.029920870425321464, + "grad_norm": 0.8327590394230312, + "learning_rate": 4.999990682263516e-06, + "loss": 0.6558, + "step": 121 + }, + { + "epoch": 0.03016815034619189, + "grad_norm": 0.731868642801384, + "learning_rate": 4.999989773732007e-06, + "loss": 0.6853, + "step": 122 + }, + { + "epoch": 0.030415430267062313, + "grad_norm": 0.7396773552222567, + "learning_rate": 4.999988822943335e-06, + "loss": 0.6737, + "step": 123 + }, + { + "epoch": 0.03066271018793274, + "grad_norm": 0.8637921752011817, + "learning_rate": 4.999987829897519e-06, + "loss": 0.6965, + "step": 124 + }, + { + "epoch": 0.030909990108803166, + "grad_norm": 0.7451831927050838, + "learning_rate": 4.999986794594574e-06, + "loss": 0.706, + "step": 125 + }, + { + "epoch": 0.03115727002967359, + "grad_norm": 0.7207943261718842, + "learning_rate": 4.999985717034517e-06, + "loss": 0.7053, + "step": 126 + }, + { + "epoch": 0.031404549950544015, + "grad_norm": 0.7839501309721785, + "learning_rate": 4.999984597217367e-06, + "loss": 0.6709, + "step": 127 + }, + { + "epoch": 0.03165182987141444, + "grad_norm": 0.7595007430844243, + "learning_rate": 4.999983435143142e-06, + "loss": 0.6786, + "step": 128 + }, + { + "epoch": 0.031899109792284865, + "grad_norm": 0.7454986705571136, + "learning_rate": 4.999982230811864e-06, + "loss": 0.6662, + "step": 129 + }, + { + "epoch": 0.03214638971315529, + "grad_norm": 0.7112711511564064, + "learning_rate": 4.9999809842235515e-06, + "loss": 0.6699, + "step": 130 + }, + { + "epoch": 0.032393669634025714, + "grad_norm": 0.7659959221982169, + "learning_rate": 4.999979695378226e-06, + "loss": 0.6898, + "step": 131 + }, + { + "epoch": 0.032640949554896145, + "grad_norm": 0.7641892451832198, + "learning_rate": 4.999978364275908e-06, + "loss": 0.6594, + "step": 132 + }, + { + "epoch": 0.03288822947576657, + "grad_norm": 0.7591516753531807, + "learning_rate": 4.999976990916622e-06, + "loss": 0.6683, + "step": 133 + }, + { + "epoch": 0.033135509396636995, + "grad_norm": 0.746675595620454, + "learning_rate": 4.9999755753003905e-06, + "loss": 0.641, + "step": 134 + }, + { + "epoch": 0.03338278931750742, + "grad_norm": 0.7654447092133061, + "learning_rate": 4.999974117427238e-06, + "loss": 0.6755, + "step": 135 + }, + { + "epoch": 0.033630069238377844, + "grad_norm": 0.7675507124577308, + "learning_rate": 4.999972617297187e-06, + "loss": 0.6462, + "step": 136 + }, + { + "epoch": 0.03387734915924827, + "grad_norm": 0.7331037612104742, + "learning_rate": 4.999971074910266e-06, + "loss": 0.683, + "step": 137 + }, + { + "epoch": 0.03412462908011869, + "grad_norm": 0.7870574637199415, + "learning_rate": 4.999969490266498e-06, + "loss": 0.651, + "step": 138 + }, + { + "epoch": 0.03437190900098912, + "grad_norm": 0.7088258626006124, + "learning_rate": 4.999967863365912e-06, + "loss": 0.6635, + "step": 139 + }, + { + "epoch": 0.03461918892185954, + "grad_norm": 0.727573343943909, + "learning_rate": 4.999966194208534e-06, + "loss": 0.6466, + "step": 140 + }, + { + "epoch": 0.034866468842729974, + "grad_norm": 0.7272152984644323, + "learning_rate": 4.999964482794394e-06, + "loss": 0.6573, + "step": 141 + }, + { + "epoch": 0.0351137487636004, + "grad_norm": 0.7802927520295474, + "learning_rate": 4.999962729123519e-06, + "loss": 0.6306, + "step": 142 + }, + { + "epoch": 0.03536102868447082, + "grad_norm": 0.7516061535051545, + "learning_rate": 4.99996093319594e-06, + "loss": 0.6544, + "step": 143 + }, + { + "epoch": 0.03560830860534125, + "grad_norm": 0.7714071852861921, + "learning_rate": 4.9999590950116865e-06, + "loss": 0.6428, + "step": 144 + }, + { + "epoch": 0.03585558852621167, + "grad_norm": 0.7888111176086061, + "learning_rate": 4.99995721457079e-06, + "loss": 0.6436, + "step": 145 + }, + { + "epoch": 0.0361028684470821, + "grad_norm": 0.8086998275264007, + "learning_rate": 4.999955291873282e-06, + "loss": 0.6447, + "step": 146 + }, + { + "epoch": 0.03635014836795252, + "grad_norm": 0.7680250966004125, + "learning_rate": 4.999953326919195e-06, + "loss": 0.6522, + "step": 147 + }, + { + "epoch": 0.036597428288822946, + "grad_norm": 0.7903118774805085, + "learning_rate": 4.999951319708562e-06, + "loss": 0.6537, + "step": 148 + }, + { + "epoch": 0.03684470820969337, + "grad_norm": 0.7554875512244171, + "learning_rate": 4.999949270241418e-06, + "loss": 0.6793, + "step": 149 + }, + { + "epoch": 0.037091988130563795, + "grad_norm": 0.801330159672659, + "learning_rate": 4.999947178517798e-06, + "loss": 0.6412, + "step": 150 + }, + { + "epoch": 0.03733926805143423, + "grad_norm": 0.7520769967798256, + "learning_rate": 4.999945044537735e-06, + "loss": 0.6612, + "step": 151 + }, + { + "epoch": 0.03758654797230465, + "grad_norm": 0.7142492113389125, + "learning_rate": 4.999942868301266e-06, + "loss": 0.6648, + "step": 152 + }, + { + "epoch": 0.037833827893175076, + "grad_norm": 0.7702143241683127, + "learning_rate": 4.999940649808429e-06, + "loss": 0.6117, + "step": 153 + }, + { + "epoch": 0.0380811078140455, + "grad_norm": 0.7774985935001439, + "learning_rate": 4.999938389059261e-06, + "loss": 0.6226, + "step": 154 + }, + { + "epoch": 0.038328387734915925, + "grad_norm": 0.7960037248541736, + "learning_rate": 4.999936086053799e-06, + "loss": 0.6324, + "step": 155 + }, + { + "epoch": 0.03857566765578635, + "grad_norm": 0.7699076528180318, + "learning_rate": 4.9999337407920836e-06, + "loss": 0.6512, + "step": 156 + }, + { + "epoch": 0.038822947576656774, + "grad_norm": 0.8048775835752526, + "learning_rate": 4.999931353274153e-06, + "loss": 0.6475, + "step": 157 + }, + { + "epoch": 0.0390702274975272, + "grad_norm": 0.8160115405346294, + "learning_rate": 4.9999289235000495e-06, + "loss": 0.6422, + "step": 158 + }, + { + "epoch": 0.039317507418397624, + "grad_norm": 0.7275555749791285, + "learning_rate": 4.9999264514698124e-06, + "loss": 0.6369, + "step": 159 + }, + { + "epoch": 0.03956478733926805, + "grad_norm": 0.7689207556493872, + "learning_rate": 4.999923937183483e-06, + "loss": 0.6508, + "step": 160 + }, + { + "epoch": 0.03981206726013848, + "grad_norm": 0.8348108853775594, + "learning_rate": 4.999921380641105e-06, + "loss": 0.6671, + "step": 161 + }, + { + "epoch": 0.040059347181008904, + "grad_norm": 0.7248271704801716, + "learning_rate": 4.999918781842722e-06, + "loss": 0.6491, + "step": 162 + }, + { + "epoch": 0.04030662710187933, + "grad_norm": 0.7315284811526714, + "learning_rate": 4.999916140788377e-06, + "loss": 0.659, + "step": 163 + }, + { + "epoch": 0.040553907022749754, + "grad_norm": 0.8578196886547811, + "learning_rate": 4.999913457478115e-06, + "loss": 0.6436, + "step": 164 + }, + { + "epoch": 0.04080118694362018, + "grad_norm": 0.8963614849939332, + "learning_rate": 4.999910731911981e-06, + "loss": 0.6514, + "step": 165 + }, + { + "epoch": 0.0410484668644906, + "grad_norm": 0.7739062484186896, + "learning_rate": 4.999907964090022e-06, + "loss": 0.6104, + "step": 166 + }, + { + "epoch": 0.04129574678536103, + "grad_norm": 0.8074813391715496, + "learning_rate": 4.999905154012284e-06, + "loss": 0.617, + "step": 167 + }, + { + "epoch": 0.04154302670623145, + "grad_norm": 0.7762599913589127, + "learning_rate": 4.999902301678815e-06, + "loss": 0.6375, + "step": 168 + }, + { + "epoch": 0.04179030662710188, + "grad_norm": 0.7748970536822192, + "learning_rate": 4.999899407089662e-06, + "loss": 0.6184, + "step": 169 + }, + { + "epoch": 0.0420375865479723, + "grad_norm": 0.7558565035852893, + "learning_rate": 4.999896470244875e-06, + "loss": 0.6369, + "step": 170 + }, + { + "epoch": 0.04228486646884273, + "grad_norm": 0.8341248646777939, + "learning_rate": 4.999893491144504e-06, + "loss": 0.6418, + "step": 171 + }, + { + "epoch": 0.04253214638971316, + "grad_norm": 0.7710490534111422, + "learning_rate": 4.999890469788598e-06, + "loss": 0.6382, + "step": 172 + }, + { + "epoch": 0.04277942631058358, + "grad_norm": 0.7753712584186033, + "learning_rate": 4.99988740617721e-06, + "loss": 0.6553, + "step": 173 + }, + { + "epoch": 0.04302670623145401, + "grad_norm": 0.8067411687298269, + "learning_rate": 4.999884300310389e-06, + "loss": 0.6542, + "step": 174 + }, + { + "epoch": 0.04327398615232443, + "grad_norm": 0.7965266139793583, + "learning_rate": 4.999881152188191e-06, + "loss": 0.6297, + "step": 175 + }, + { + "epoch": 0.043521266073194856, + "grad_norm": 0.7800110890457195, + "learning_rate": 4.999877961810667e-06, + "loss": 0.6328, + "step": 176 + }, + { + "epoch": 0.04376854599406528, + "grad_norm": 0.7555029707902032, + "learning_rate": 4.99987472917787e-06, + "loss": 0.6567, + "step": 177 + }, + { + "epoch": 0.044015825914935705, + "grad_norm": 0.8247765083336221, + "learning_rate": 4.9998714542898566e-06, + "loss": 0.6552, + "step": 178 + }, + { + "epoch": 0.04426310583580613, + "grad_norm": 0.7853035050103945, + "learning_rate": 4.999868137146682e-06, + "loss": 0.6487, + "step": 179 + }, + { + "epoch": 0.04451038575667656, + "grad_norm": 0.792588023124722, + "learning_rate": 4.999864777748401e-06, + "loss": 0.6343, + "step": 180 + }, + { + "epoch": 0.044757665677546986, + "grad_norm": 0.7562657466952013, + "learning_rate": 4.999861376095072e-06, + "loss": 0.6372, + "step": 181 + }, + { + "epoch": 0.04500494559841741, + "grad_norm": 0.7725421864498705, + "learning_rate": 4.999857932186751e-06, + "loss": 0.5965, + "step": 182 + }, + { + "epoch": 0.045252225519287835, + "grad_norm": 0.801959316358281, + "learning_rate": 4.999854446023496e-06, + "loss": 0.6316, + "step": 183 + }, + { + "epoch": 0.04549950544015826, + "grad_norm": 0.7854145905631205, + "learning_rate": 4.999850917605369e-06, + "loss": 0.6432, + "step": 184 + }, + { + "epoch": 0.045746785361028684, + "grad_norm": 0.7625652300415934, + "learning_rate": 4.999847346932426e-06, + "loss": 0.6409, + "step": 185 + }, + { + "epoch": 0.04599406528189911, + "grad_norm": 0.7070242573198564, + "learning_rate": 4.999843734004729e-06, + "loss": 0.6081, + "step": 186 + }, + { + "epoch": 0.046241345202769533, + "grad_norm": 0.843090988301474, + "learning_rate": 4.999840078822339e-06, + "loss": 0.6494, + "step": 187 + }, + { + "epoch": 0.04648862512363996, + "grad_norm": 0.7812267791142424, + "learning_rate": 4.9998363813853175e-06, + "loss": 0.6243, + "step": 188 + }, + { + "epoch": 0.04673590504451038, + "grad_norm": 0.7932584261056079, + "learning_rate": 4.999832641693727e-06, + "loss": 0.6167, + "step": 189 + }, + { + "epoch": 0.046983184965380814, + "grad_norm": 0.7886667271093307, + "learning_rate": 4.999828859747631e-06, + "loss": 0.6095, + "step": 190 + }, + { + "epoch": 0.04723046488625124, + "grad_norm": 0.8023765397410424, + "learning_rate": 4.999825035547093e-06, + "loss": 0.6256, + "step": 191 + }, + { + "epoch": 0.04747774480712166, + "grad_norm": 0.7820939097202719, + "learning_rate": 4.999821169092178e-06, + "loss": 0.6266, + "step": 192 + }, + { + "epoch": 0.04772502472799209, + "grad_norm": 0.7864299627678912, + "learning_rate": 4.9998172603829515e-06, + "loss": 0.6168, + "step": 193 + }, + { + "epoch": 0.04797230464886251, + "grad_norm": 0.7551224745595048, + "learning_rate": 4.999813309419479e-06, + "loss": 0.6034, + "step": 194 + }, + { + "epoch": 0.04821958456973294, + "grad_norm": 0.7538626457900578, + "learning_rate": 4.999809316201828e-06, + "loss": 0.6154, + "step": 195 + }, + { + "epoch": 0.04846686449060336, + "grad_norm": 0.7449552121210857, + "learning_rate": 4.999805280730066e-06, + "loss": 0.6332, + "step": 196 + }, + { + "epoch": 0.048714144411473786, + "grad_norm": 0.8177138196697615, + "learning_rate": 4.99980120300426e-06, + "loss": 0.6385, + "step": 197 + }, + { + "epoch": 0.04896142433234421, + "grad_norm": 0.7760055206345433, + "learning_rate": 4.99979708302448e-06, + "loss": 0.6227, + "step": 198 + }, + { + "epoch": 0.049208704253214636, + "grad_norm": 0.7599944781350494, + "learning_rate": 4.999792920790795e-06, + "loss": 0.5753, + "step": 199 + }, + { + "epoch": 0.04945598417408507, + "grad_norm": 0.7513478755596612, + "learning_rate": 4.999788716303276e-06, + "loss": 0.5782, + "step": 200 + }, + { + "epoch": 0.04970326409495549, + "grad_norm": 0.7969737018661929, + "learning_rate": 4.999784469561994e-06, + "loss": 0.6112, + "step": 201 + }, + { + "epoch": 0.049950544015825916, + "grad_norm": 0.7685838090874655, + "learning_rate": 4.9997801805670204e-06, + "loss": 0.6047, + "step": 202 + }, + { + "epoch": 0.05019782393669634, + "grad_norm": 0.7733765759727036, + "learning_rate": 4.9997758493184276e-06, + "loss": 0.6378, + "step": 203 + }, + { + "epoch": 0.050445103857566766, + "grad_norm": 0.7884015810991322, + "learning_rate": 4.99977147581629e-06, + "loss": 0.6162, + "step": 204 + }, + { + "epoch": 0.05069238377843719, + "grad_norm": 0.7876715243321647, + "learning_rate": 4.999767060060679e-06, + "loss": 0.6017, + "step": 205 + }, + { + "epoch": 0.050939663699307615, + "grad_norm": 0.76121613723978, + "learning_rate": 4.999762602051673e-06, + "loss": 0.6229, + "step": 206 + }, + { + "epoch": 0.05118694362017804, + "grad_norm": 0.7837425671543329, + "learning_rate": 4.9997581017893436e-06, + "loss": 0.6376, + "step": 207 + }, + { + "epoch": 0.051434223541048464, + "grad_norm": 0.7176819978947346, + "learning_rate": 4.999753559273769e-06, + "loss": 0.6102, + "step": 208 + }, + { + "epoch": 0.05168150346191889, + "grad_norm": 0.7929682985852649, + "learning_rate": 4.999748974505026e-06, + "loss": 0.619, + "step": 209 + }, + { + "epoch": 0.05192878338278932, + "grad_norm": 0.8166155545756911, + "learning_rate": 4.999744347483191e-06, + "loss": 0.5881, + "step": 210 + }, + { + "epoch": 0.052176063303659745, + "grad_norm": 0.7925632446286965, + "learning_rate": 4.999739678208343e-06, + "loss": 0.6097, + "step": 211 + }, + { + "epoch": 0.05242334322453017, + "grad_norm": 0.7724585651765387, + "learning_rate": 4.99973496668056e-06, + "loss": 0.6133, + "step": 212 + }, + { + "epoch": 0.052670623145400594, + "grad_norm": 0.9395217619393431, + "learning_rate": 4.999730212899923e-06, + "loss": 0.5962, + "step": 213 + }, + { + "epoch": 0.05291790306627102, + "grad_norm": 0.7846009290846587, + "learning_rate": 4.999725416866512e-06, + "loss": 0.5853, + "step": 214 + }, + { + "epoch": 0.05316518298714144, + "grad_norm": 0.7538698383114204, + "learning_rate": 4.999720578580407e-06, + "loss": 0.6018, + "step": 215 + }, + { + "epoch": 0.05341246290801187, + "grad_norm": 0.811056416448552, + "learning_rate": 4.999715698041691e-06, + "loss": 0.6114, + "step": 216 + }, + { + "epoch": 0.05365974282888229, + "grad_norm": 0.8205077516037962, + "learning_rate": 4.999710775250446e-06, + "loss": 0.6306, + "step": 217 + }, + { + "epoch": 0.05390702274975272, + "grad_norm": 0.8557907074033965, + "learning_rate": 4.999705810206755e-06, + "loss": 0.607, + "step": 218 + }, + { + "epoch": 0.05415430267062315, + "grad_norm": 0.8333258626700865, + "learning_rate": 4.999700802910702e-06, + "loss": 0.6412, + "step": 219 + }, + { + "epoch": 0.05440158259149357, + "grad_norm": 0.7928603799058208, + "learning_rate": 4.999695753362372e-06, + "loss": 0.6132, + "step": 220 + }, + { + "epoch": 0.054648862512364, + "grad_norm": 0.809471093310714, + "learning_rate": 4.99969066156185e-06, + "loss": 0.5894, + "step": 221 + }, + { + "epoch": 0.05489614243323442, + "grad_norm": 0.748135133878315, + "learning_rate": 4.999685527509223e-06, + "loss": 0.5785, + "step": 222 + }, + { + "epoch": 0.05514342235410485, + "grad_norm": 0.7994273959936483, + "learning_rate": 4.9996803512045756e-06, + "loss": 0.611, + "step": 223 + }, + { + "epoch": 0.05539070227497527, + "grad_norm": 0.8196600789722335, + "learning_rate": 4.999675132647998e-06, + "loss": 0.6116, + "step": 224 + }, + { + "epoch": 0.055637982195845696, + "grad_norm": 0.7909171176460179, + "learning_rate": 4.999669871839577e-06, + "loss": 0.6217, + "step": 225 + }, + { + "epoch": 0.05588526211671612, + "grad_norm": 0.8567648736166711, + "learning_rate": 4.999664568779401e-06, + "loss": 0.6112, + "step": 226 + }, + { + "epoch": 0.056132542037586546, + "grad_norm": 0.7690407907423776, + "learning_rate": 4.99965922346756e-06, + "loss": 0.5871, + "step": 227 + }, + { + "epoch": 0.05637982195845697, + "grad_norm": 0.7444628727728402, + "learning_rate": 4.999653835904145e-06, + "loss": 0.5681, + "step": 228 + }, + { + "epoch": 0.0566271018793274, + "grad_norm": 0.7607893092647576, + "learning_rate": 4.999648406089247e-06, + "loss": 0.6139, + "step": 229 + }, + { + "epoch": 0.056874381800197826, + "grad_norm": 0.7777597816934657, + "learning_rate": 4.999642934022957e-06, + "loss": 0.6332, + "step": 230 + }, + { + "epoch": 0.05712166172106825, + "grad_norm": 0.82148546775061, + "learning_rate": 4.999637419705369e-06, + "loss": 0.5841, + "step": 231 + }, + { + "epoch": 0.057368941641938676, + "grad_norm": 0.8010419071826838, + "learning_rate": 4.9996318631365735e-06, + "loss": 0.5925, + "step": 232 + }, + { + "epoch": 0.0576162215628091, + "grad_norm": 0.7530690429731, + "learning_rate": 4.9996262643166674e-06, + "loss": 0.5951, + "step": 233 + }, + { + "epoch": 0.057863501483679525, + "grad_norm": 0.7675728737861399, + "learning_rate": 4.999620623245743e-06, + "loss": 0.5849, + "step": 234 + }, + { + "epoch": 0.05811078140454995, + "grad_norm": 0.7724654094858604, + "learning_rate": 4.999614939923897e-06, + "loss": 0.6169, + "step": 235 + }, + { + "epoch": 0.058358061325420374, + "grad_norm": 0.7978696650914271, + "learning_rate": 4.999609214351226e-06, + "loss": 0.5964, + "step": 236 + }, + { + "epoch": 0.0586053412462908, + "grad_norm": 0.7721298767744407, + "learning_rate": 4.999603446527826e-06, + "loss": 0.6255, + "step": 237 + }, + { + "epoch": 0.05885262116716122, + "grad_norm": 0.7488208157668992, + "learning_rate": 4.999597636453793e-06, + "loss": 0.5971, + "step": 238 + }, + { + "epoch": 0.059099901088031655, + "grad_norm": 0.8049217267637012, + "learning_rate": 4.999591784129228e-06, + "loss": 0.6221, + "step": 239 + }, + { + "epoch": 0.05934718100890208, + "grad_norm": 0.7979752646183047, + "learning_rate": 4.999585889554227e-06, + "loss": 0.6165, + "step": 240 + }, + { + "epoch": 0.059594460929772504, + "grad_norm": 0.7448820329141358, + "learning_rate": 4.999579952728892e-06, + "loss": 0.6154, + "step": 241 + }, + { + "epoch": 0.05984174085064293, + "grad_norm": 0.8083942465217746, + "learning_rate": 4.999573973653322e-06, + "loss": 0.6086, + "step": 242 + }, + { + "epoch": 0.06008902077151335, + "grad_norm": 0.7606973644680061, + "learning_rate": 4.99956795232762e-06, + "loss": 0.5945, + "step": 243 + }, + { + "epoch": 0.06033630069238378, + "grad_norm": 0.812611247006916, + "learning_rate": 4.999561888751885e-06, + "loss": 0.5904, + "step": 244 + }, + { + "epoch": 0.0605835806132542, + "grad_norm": 0.7727468846105328, + "learning_rate": 4.9995557829262215e-06, + "loss": 0.61, + "step": 245 + }, + { + "epoch": 0.06083086053412463, + "grad_norm": 0.8133475657364307, + "learning_rate": 4.999549634850732e-06, + "loss": 0.5946, + "step": 246 + }, + { + "epoch": 0.06107814045499505, + "grad_norm": 0.815731831489138, + "learning_rate": 4.99954344452552e-06, + "loss": 0.584, + "step": 247 + }, + { + "epoch": 0.06132542037586548, + "grad_norm": 0.828023713294489, + "learning_rate": 4.999537211950692e-06, + "loss": 0.5963, + "step": 248 + }, + { + "epoch": 0.06157270029673591, + "grad_norm": 0.7659802540309084, + "learning_rate": 4.99953093712635e-06, + "loss": 0.601, + "step": 249 + }, + { + "epoch": 0.06181998021760633, + "grad_norm": 0.8129348699504775, + "learning_rate": 4.999524620052603e-06, + "loss": 0.5957, + "step": 250 + }, + { + "epoch": 0.06206726013847676, + "grad_norm": 0.8344335464060704, + "learning_rate": 4.999518260729557e-06, + "loss": 0.5834, + "step": 251 + }, + { + "epoch": 0.06231454005934718, + "grad_norm": 0.8014316640205381, + "learning_rate": 4.999511859157319e-06, + "loss": 0.5922, + "step": 252 + }, + { + "epoch": 0.0625618199802176, + "grad_norm": 0.7753876743359669, + "learning_rate": 4.999505415335998e-06, + "loss": 0.5933, + "step": 253 + }, + { + "epoch": 0.06280909990108803, + "grad_norm": 0.7817463507788848, + "learning_rate": 4.9994989292657015e-06, + "loss": 0.6109, + "step": 254 + }, + { + "epoch": 0.06305637982195846, + "grad_norm": 0.828120989023736, + "learning_rate": 4.99949240094654e-06, + "loss": 0.6008, + "step": 255 + }, + { + "epoch": 0.06330365974282888, + "grad_norm": 0.8407040487143337, + "learning_rate": 4.999485830378625e-06, + "loss": 0.5813, + "step": 256 + }, + { + "epoch": 0.0635509396636993, + "grad_norm": 0.8114133897332183, + "learning_rate": 4.999479217562066e-06, + "loss": 0.5617, + "step": 257 + }, + { + "epoch": 0.06379821958456973, + "grad_norm": 0.8420383888196065, + "learning_rate": 4.999472562496975e-06, + "loss": 0.5782, + "step": 258 + }, + { + "epoch": 0.06404549950544015, + "grad_norm": 0.8146897788412562, + "learning_rate": 4.999465865183465e-06, + "loss": 0.6189, + "step": 259 + }, + { + "epoch": 0.06429277942631058, + "grad_norm": 0.8609511362836474, + "learning_rate": 4.999459125621649e-06, + "loss": 0.6164, + "step": 260 + }, + { + "epoch": 0.064540059347181, + "grad_norm": 0.8348638690296436, + "learning_rate": 4.99945234381164e-06, + "loss": 0.5344, + "step": 261 + }, + { + "epoch": 0.06478733926805143, + "grad_norm": 0.772480507007813, + "learning_rate": 4.999445519753555e-06, + "loss": 0.5732, + "step": 262 + }, + { + "epoch": 0.06503461918892187, + "grad_norm": 0.7745975874520878, + "learning_rate": 4.999438653447507e-06, + "loss": 0.5781, + "step": 263 + }, + { + "epoch": 0.06528189910979229, + "grad_norm": 0.8688380494260995, + "learning_rate": 4.999431744893613e-06, + "loss": 0.5795, + "step": 264 + }, + { + "epoch": 0.06552917903066272, + "grad_norm": 0.8584578845191142, + "learning_rate": 4.999424794091989e-06, + "loss": 0.6207, + "step": 265 + }, + { + "epoch": 0.06577645895153314, + "grad_norm": 0.8130950950717318, + "learning_rate": 4.9994178010427544e-06, + "loss": 0.5666, + "step": 266 + }, + { + "epoch": 0.06602373887240356, + "grad_norm": 0.8060107564461553, + "learning_rate": 4.999410765746026e-06, + "loss": 0.5419, + "step": 267 + }, + { + "epoch": 0.06627101879327399, + "grad_norm": 0.7783985362196268, + "learning_rate": 4.999403688201921e-06, + "loss": 0.5613, + "step": 268 + }, + { + "epoch": 0.06651829871414441, + "grad_norm": 0.8888599834984435, + "learning_rate": 4.999396568410563e-06, + "loss": 0.5607, + "step": 269 + }, + { + "epoch": 0.06676557863501484, + "grad_norm": 0.8652022418369328, + "learning_rate": 4.999389406372069e-06, + "loss": 0.6023, + "step": 270 + }, + { + "epoch": 0.06701285855588526, + "grad_norm": 0.8232756652188512, + "learning_rate": 4.999382202086562e-06, + "loss": 0.6064, + "step": 271 + }, + { + "epoch": 0.06726013847675569, + "grad_norm": 0.783102206467771, + "learning_rate": 4.9993749555541635e-06, + "loss": 0.5697, + "step": 272 + }, + { + "epoch": 0.06750741839762611, + "grad_norm": 0.8069785228514532, + "learning_rate": 4.999367666774995e-06, + "loss": 0.6105, + "step": 273 + }, + { + "epoch": 0.06775469831849654, + "grad_norm": 0.7893414940765818, + "learning_rate": 4.99936033574918e-06, + "loss": 0.6075, + "step": 274 + }, + { + "epoch": 0.06800197823936696, + "grad_norm": 0.8117544067424143, + "learning_rate": 4.999352962476843e-06, + "loss": 0.6306, + "step": 275 + }, + { + "epoch": 0.06824925816023739, + "grad_norm": 0.7939328491906092, + "learning_rate": 4.999345546958109e-06, + "loss": 0.5861, + "step": 276 + }, + { + "epoch": 0.06849653808110781, + "grad_norm": 0.8271668917371283, + "learning_rate": 4.999338089193102e-06, + "loss": 0.5754, + "step": 277 + }, + { + "epoch": 0.06874381800197824, + "grad_norm": 0.8290077242370982, + "learning_rate": 4.999330589181948e-06, + "loss": 0.581, + "step": 278 + }, + { + "epoch": 0.06899109792284866, + "grad_norm": 0.8239908337223516, + "learning_rate": 4.999323046924776e-06, + "loss": 0.5883, + "step": 279 + }, + { + "epoch": 0.06923837784371908, + "grad_norm": 0.7867379951699298, + "learning_rate": 4.999315462421711e-06, + "loss": 0.5676, + "step": 280 + }, + { + "epoch": 0.06948565776458951, + "grad_norm": 0.794330642065807, + "learning_rate": 4.9993078356728816e-06, + "loss": 0.5744, + "step": 281 + }, + { + "epoch": 0.06973293768545995, + "grad_norm": 0.7920567183819462, + "learning_rate": 4.999300166678419e-06, + "loss": 0.5884, + "step": 282 + }, + { + "epoch": 0.06998021760633037, + "grad_norm": 0.7934949697459773, + "learning_rate": 4.99929245543845e-06, + "loss": 0.6065, + "step": 283 + }, + { + "epoch": 0.0702274975272008, + "grad_norm": 0.8421805349854998, + "learning_rate": 4.999284701953106e-06, + "loss": 0.5577, + "step": 284 + }, + { + "epoch": 0.07047477744807122, + "grad_norm": 0.8124908525725804, + "learning_rate": 4.9992769062225185e-06, + "loss": 0.6129, + "step": 285 + }, + { + "epoch": 0.07072205736894165, + "grad_norm": 0.7973073951210279, + "learning_rate": 4.999269068246818e-06, + "loss": 0.5694, + "step": 286 + }, + { + "epoch": 0.07096933728981207, + "grad_norm": 0.8076738575720998, + "learning_rate": 4.999261188026139e-06, + "loss": 0.5669, + "step": 287 + }, + { + "epoch": 0.0712166172106825, + "grad_norm": 0.7949443064726974, + "learning_rate": 4.999253265560614e-06, + "loss": 0.5859, + "step": 288 + }, + { + "epoch": 0.07146389713155292, + "grad_norm": 0.785428609248276, + "learning_rate": 4.999245300850375e-06, + "loss": 0.5573, + "step": 289 + }, + { + "epoch": 0.07171117705242334, + "grad_norm": 0.787601105135159, + "learning_rate": 4.9992372938955595e-06, + "loss": 0.5658, + "step": 290 + }, + { + "epoch": 0.07195845697329377, + "grad_norm": 0.8181542303646591, + "learning_rate": 4.999229244696301e-06, + "loss": 0.5991, + "step": 291 + }, + { + "epoch": 0.0722057368941642, + "grad_norm": 0.8058133731985243, + "learning_rate": 4.9992211532527355e-06, + "loss": 0.5862, + "step": 292 + }, + { + "epoch": 0.07245301681503462, + "grad_norm": 0.7995192339831612, + "learning_rate": 4.999213019565001e-06, + "loss": 0.5738, + "step": 293 + }, + { + "epoch": 0.07270029673590504, + "grad_norm": 0.83397514939666, + "learning_rate": 4.999204843633234e-06, + "loss": 0.6086, + "step": 294 + }, + { + "epoch": 0.07294757665677547, + "grad_norm": 0.8022353428397413, + "learning_rate": 4.9991966254575726e-06, + "loss": 0.5817, + "step": 295 + }, + { + "epoch": 0.07319485657764589, + "grad_norm": 0.7857047281743428, + "learning_rate": 4.999188365038156e-06, + "loss": 0.5678, + "step": 296 + }, + { + "epoch": 0.07344213649851632, + "grad_norm": 0.8338345529151808, + "learning_rate": 4.999180062375124e-06, + "loss": 0.5902, + "step": 297 + }, + { + "epoch": 0.07368941641938674, + "grad_norm": 0.788795960009146, + "learning_rate": 4.999171717468617e-06, + "loss": 0.5621, + "step": 298 + }, + { + "epoch": 0.07393669634025717, + "grad_norm": 0.8007543016695616, + "learning_rate": 4.999163330318777e-06, + "loss": 0.5909, + "step": 299 + }, + { + "epoch": 0.07418397626112759, + "grad_norm": 0.793092974341733, + "learning_rate": 4.999154900925743e-06, + "loss": 0.6027, + "step": 300 + }, + { + "epoch": 0.07443125618199802, + "grad_norm": 0.7973449991777471, + "learning_rate": 4.99914642928966e-06, + "loss": 0.5768, + "step": 301 + }, + { + "epoch": 0.07467853610286845, + "grad_norm": 0.8095789680023529, + "learning_rate": 4.99913791541067e-06, + "loss": 0.5654, + "step": 302 + }, + { + "epoch": 0.07492581602373888, + "grad_norm": 0.8960253123671273, + "learning_rate": 4.9991293592889174e-06, + "loss": 0.5943, + "step": 303 + }, + { + "epoch": 0.0751730959446093, + "grad_norm": 0.8324065932447318, + "learning_rate": 4.999120760924547e-06, + "loss": 0.5958, + "step": 304 + }, + { + "epoch": 0.07542037586547973, + "grad_norm": 0.7774414096188346, + "learning_rate": 4.999112120317703e-06, + "loss": 0.5753, + "step": 305 + }, + { + "epoch": 0.07566765578635015, + "grad_norm": 0.7876740890991246, + "learning_rate": 4.9991034374685335e-06, + "loss": 0.5706, + "step": 306 + }, + { + "epoch": 0.07591493570722058, + "grad_norm": 0.8560375531506265, + "learning_rate": 4.9990947123771825e-06, + "loss": 0.5766, + "step": 307 + }, + { + "epoch": 0.076162215628091, + "grad_norm": 0.8459626380953854, + "learning_rate": 4.9990859450438e-06, + "loss": 0.567, + "step": 308 + }, + { + "epoch": 0.07640949554896143, + "grad_norm": 0.7880315987652132, + "learning_rate": 4.999077135468533e-06, + "loss": 0.5995, + "step": 309 + }, + { + "epoch": 0.07665677546983185, + "grad_norm": 0.8955800578597224, + "learning_rate": 4.9990682836515305e-06, + "loss": 0.5843, + "step": 310 + }, + { + "epoch": 0.07690405539070228, + "grad_norm": 0.8617447390840817, + "learning_rate": 4.999059389592943e-06, + "loss": 0.5698, + "step": 311 + }, + { + "epoch": 0.0771513353115727, + "grad_norm": 0.8222678465858798, + "learning_rate": 4.999050453292918e-06, + "loss": 0.5792, + "step": 312 + }, + { + "epoch": 0.07739861523244312, + "grad_norm": 0.8425038084425801, + "learning_rate": 4.999041474751611e-06, + "loss": 0.5654, + "step": 313 + }, + { + "epoch": 0.07764589515331355, + "grad_norm": 0.7939986484417906, + "learning_rate": 4.999032453969171e-06, + "loss": 0.5844, + "step": 314 + }, + { + "epoch": 0.07789317507418397, + "grad_norm": 0.8068900357493951, + "learning_rate": 4.999023390945749e-06, + "loss": 0.5842, + "step": 315 + }, + { + "epoch": 0.0781404549950544, + "grad_norm": 0.8467336955386116, + "learning_rate": 4.9990142856815015e-06, + "loss": 0.5829, + "step": 316 + }, + { + "epoch": 0.07838773491592482, + "grad_norm": 0.833281068277134, + "learning_rate": 4.999005138176581e-06, + "loss": 0.5519, + "step": 317 + }, + { + "epoch": 0.07863501483679525, + "grad_norm": 0.8225381669617556, + "learning_rate": 4.9989959484311415e-06, + "loss": 0.5421, + "step": 318 + }, + { + "epoch": 0.07888229475766567, + "grad_norm": 0.7395261056652702, + "learning_rate": 4.998986716445339e-06, + "loss": 0.5629, + "step": 319 + }, + { + "epoch": 0.0791295746785361, + "grad_norm": 0.7829197261891223, + "learning_rate": 4.99897744221933e-06, + "loss": 0.5359, + "step": 320 + }, + { + "epoch": 0.07937685459940653, + "grad_norm": 0.7979559607833192, + "learning_rate": 4.998968125753271e-06, + "loss": 0.5674, + "step": 321 + }, + { + "epoch": 0.07962413452027696, + "grad_norm": 0.8490795677253689, + "learning_rate": 4.998958767047319e-06, + "loss": 0.5803, + "step": 322 + }, + { + "epoch": 0.07987141444114738, + "grad_norm": 0.8033454860164779, + "learning_rate": 4.998949366101631e-06, + "loss": 0.577, + "step": 323 + }, + { + "epoch": 0.08011869436201781, + "grad_norm": 0.8513966976989501, + "learning_rate": 4.998939922916368e-06, + "loss": 0.6031, + "step": 324 + }, + { + "epoch": 0.08036597428288823, + "grad_norm": 0.86730395021255, + "learning_rate": 4.998930437491689e-06, + "loss": 0.5957, + "step": 325 + }, + { + "epoch": 0.08061325420375866, + "grad_norm": 0.742412468911964, + "learning_rate": 4.9989209098277545e-06, + "loss": 0.5954, + "step": 326 + }, + { + "epoch": 0.08086053412462908, + "grad_norm": 0.7672495419030062, + "learning_rate": 4.998911339924726e-06, + "loss": 0.5563, + "step": 327 + }, + { + "epoch": 0.08110781404549951, + "grad_norm": 0.7723454315350652, + "learning_rate": 4.998901727782763e-06, + "loss": 0.5604, + "step": 328 + }, + { + "epoch": 0.08135509396636993, + "grad_norm": 0.7687143526719443, + "learning_rate": 4.99889207340203e-06, + "loss": 0.575, + "step": 329 + }, + { + "epoch": 0.08160237388724036, + "grad_norm": 0.8232392369424539, + "learning_rate": 4.99888237678269e-06, + "loss": 0.5772, + "step": 330 + }, + { + "epoch": 0.08184965380811078, + "grad_norm": 0.7766370860070179, + "learning_rate": 4.998872637924906e-06, + "loss": 0.6117, + "step": 331 + }, + { + "epoch": 0.0820969337289812, + "grad_norm": 0.7777679878898526, + "learning_rate": 4.998862856828844e-06, + "loss": 0.5678, + "step": 332 + }, + { + "epoch": 0.08234421364985163, + "grad_norm": 0.8402362535184865, + "learning_rate": 4.998853033494668e-06, + "loss": 0.5627, + "step": 333 + }, + { + "epoch": 0.08259149357072205, + "grad_norm": 0.8106043298842506, + "learning_rate": 4.998843167922546e-06, + "loss": 0.6011, + "step": 334 + }, + { + "epoch": 0.08283877349159248, + "grad_norm": 0.8036741854165277, + "learning_rate": 4.998833260112642e-06, + "loss": 0.5678, + "step": 335 + }, + { + "epoch": 0.0830860534124629, + "grad_norm": 0.8067745020147583, + "learning_rate": 4.998823310065125e-06, + "loss": 0.5808, + "step": 336 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.7877781611533857, + "learning_rate": 4.9988133177801625e-06, + "loss": 0.5735, + "step": 337 + }, + { + "epoch": 0.08358061325420375, + "grad_norm": 0.8373696159220352, + "learning_rate": 4.9988032832579245e-06, + "loss": 0.5613, + "step": 338 + }, + { + "epoch": 0.08382789317507418, + "grad_norm": 0.8908331464192217, + "learning_rate": 4.99879320649858e-06, + "loss": 0.5427, + "step": 339 + }, + { + "epoch": 0.0840751730959446, + "grad_norm": 0.7798491353155664, + "learning_rate": 4.9987830875022995e-06, + "loss": 0.5979, + "step": 340 + }, + { + "epoch": 0.08432245301681504, + "grad_norm": 0.7836443525132174, + "learning_rate": 4.998772926269254e-06, + "loss": 0.5554, + "step": 341 + }, + { + "epoch": 0.08456973293768547, + "grad_norm": 0.8875330119592878, + "learning_rate": 4.998762722799615e-06, + "loss": 0.5773, + "step": 342 + }, + { + "epoch": 0.08481701285855589, + "grad_norm": 0.8458805310586042, + "learning_rate": 4.9987524770935546e-06, + "loss": 0.5576, + "step": 343 + }, + { + "epoch": 0.08506429277942631, + "grad_norm": 0.8266480197072542, + "learning_rate": 4.998742189151247e-06, + "loss": 0.5821, + "step": 344 + }, + { + "epoch": 0.08531157270029674, + "grad_norm": 0.7601244118882214, + "learning_rate": 4.998731858972865e-06, + "loss": 0.5631, + "step": 345 + }, + { + "epoch": 0.08555885262116716, + "grad_norm": 0.8861885854250049, + "learning_rate": 4.998721486558584e-06, + "loss": 0.556, + "step": 346 + }, + { + "epoch": 0.08580613254203759, + "grad_norm": 0.8790472177377368, + "learning_rate": 4.998711071908579e-06, + "loss": 0.5602, + "step": 347 + }, + { + "epoch": 0.08605341246290801, + "grad_norm": 0.7871616565771263, + "learning_rate": 4.998700615023027e-06, + "loss": 0.5881, + "step": 348 + }, + { + "epoch": 0.08630069238377844, + "grad_norm": 0.8602679302620617, + "learning_rate": 4.9986901159021036e-06, + "loss": 0.5692, + "step": 349 + }, + { + "epoch": 0.08654797230464886, + "grad_norm": 0.869429029408227, + "learning_rate": 4.998679574545986e-06, + "loss": 0.555, + "step": 350 + }, + { + "epoch": 0.08679525222551929, + "grad_norm": 0.8357333919438975, + "learning_rate": 4.998668990954854e-06, + "loss": 0.5494, + "step": 351 + }, + { + "epoch": 0.08704253214638971, + "grad_norm": 0.8133863569483283, + "learning_rate": 4.998658365128884e-06, + "loss": 0.5666, + "step": 352 + }, + { + "epoch": 0.08728981206726014, + "grad_norm": 0.8325395334428511, + "learning_rate": 4.998647697068258e-06, + "loss": 0.5591, + "step": 353 + }, + { + "epoch": 0.08753709198813056, + "grad_norm": 0.8366416586559107, + "learning_rate": 4.998636986773156e-06, + "loss": 0.5428, + "step": 354 + }, + { + "epoch": 0.08778437190900099, + "grad_norm": 0.8180821666277713, + "learning_rate": 4.9986262342437566e-06, + "loss": 0.5565, + "step": 355 + }, + { + "epoch": 0.08803165182987141, + "grad_norm": 0.8302958602395372, + "learning_rate": 4.9986154394802445e-06, + "loss": 0.5754, + "step": 356 + }, + { + "epoch": 0.08827893175074183, + "grad_norm": 0.908023409390746, + "learning_rate": 4.998604602482801e-06, + "loss": 0.5517, + "step": 357 + }, + { + "epoch": 0.08852621167161226, + "grad_norm": 0.8461839551694633, + "learning_rate": 4.998593723251609e-06, + "loss": 0.5716, + "step": 358 + }, + { + "epoch": 0.08877349159248268, + "grad_norm": 0.863219314588194, + "learning_rate": 4.9985828017868534e-06, + "loss": 0.5403, + "step": 359 + }, + { + "epoch": 0.08902077151335312, + "grad_norm": 0.8146799690434612, + "learning_rate": 4.998571838088717e-06, + "loss": 0.5643, + "step": 360 + }, + { + "epoch": 0.08926805143422355, + "grad_norm": 0.7935582497855002, + "learning_rate": 4.9985608321573864e-06, + "loss": 0.5698, + "step": 361 + }, + { + "epoch": 0.08951533135509397, + "grad_norm": 0.8476507277896043, + "learning_rate": 4.998549783993048e-06, + "loss": 0.5528, + "step": 362 + }, + { + "epoch": 0.0897626112759644, + "grad_norm": 0.8208621602814976, + "learning_rate": 4.998538693595888e-06, + "loss": 0.5786, + "step": 363 + }, + { + "epoch": 0.09000989119683482, + "grad_norm": 0.882433464188905, + "learning_rate": 4.998527560966094e-06, + "loss": 0.5727, + "step": 364 + }, + { + "epoch": 0.09025717111770525, + "grad_norm": 0.9015805731333748, + "learning_rate": 4.9985163861038535e-06, + "loss": 0.5669, + "step": 365 + }, + { + "epoch": 0.09050445103857567, + "grad_norm": 0.8174154358479824, + "learning_rate": 4.998505169009356e-06, + "loss": 0.5541, + "step": 366 + }, + { + "epoch": 0.0907517309594461, + "grad_norm": 0.7879889004369333, + "learning_rate": 4.998493909682791e-06, + "loss": 0.5377, + "step": 367 + }, + { + "epoch": 0.09099901088031652, + "grad_norm": 0.8134430346957511, + "learning_rate": 4.99848260812435e-06, + "loss": 0.5673, + "step": 368 + }, + { + "epoch": 0.09124629080118694, + "grad_norm": 0.7822338294968185, + "learning_rate": 4.998471264334222e-06, + "loss": 0.5747, + "step": 369 + }, + { + "epoch": 0.09149357072205737, + "grad_norm": 0.846007678741242, + "learning_rate": 4.998459878312598e-06, + "loss": 0.5382, + "step": 370 + }, + { + "epoch": 0.0917408506429278, + "grad_norm": 0.7834419615329641, + "learning_rate": 4.998448450059674e-06, + "loss": 0.5802, + "step": 371 + }, + { + "epoch": 0.09198813056379822, + "grad_norm": 0.8462903242903708, + "learning_rate": 4.998436979575641e-06, + "loss": 0.5637, + "step": 372 + }, + { + "epoch": 0.09223541048466864, + "grad_norm": 0.8512867722701203, + "learning_rate": 4.998425466860692e-06, + "loss": 0.5289, + "step": 373 + }, + { + "epoch": 0.09248269040553907, + "grad_norm": 0.8021787954595253, + "learning_rate": 4.998413911915025e-06, + "loss": 0.5851, + "step": 374 + }, + { + "epoch": 0.09272997032640949, + "grad_norm": 0.7994136334382167, + "learning_rate": 4.998402314738831e-06, + "loss": 0.5461, + "step": 375 + }, + { + "epoch": 0.09297725024727992, + "grad_norm": 0.8676794377546366, + "learning_rate": 4.998390675332308e-06, + "loss": 0.5374, + "step": 376 + }, + { + "epoch": 0.09322453016815034, + "grad_norm": 0.8618752560499016, + "learning_rate": 4.9983789936956535e-06, + "loss": 0.5235, + "step": 377 + }, + { + "epoch": 0.09347181008902077, + "grad_norm": 0.8116452824280147, + "learning_rate": 4.998367269829065e-06, + "loss": 0.563, + "step": 378 + }, + { + "epoch": 0.09371909000989119, + "grad_norm": 0.8601437613985123, + "learning_rate": 4.998355503732739e-06, + "loss": 0.5612, + "step": 379 + }, + { + "epoch": 0.09396636993076163, + "grad_norm": 0.8155800361584301, + "learning_rate": 4.9983436954068755e-06, + "loss": 0.5712, + "step": 380 + }, + { + "epoch": 0.09421364985163205, + "grad_norm": 0.8124067408872788, + "learning_rate": 4.998331844851674e-06, + "loss": 0.5323, + "step": 381 + }, + { + "epoch": 0.09446092977250248, + "grad_norm": 0.8103826549978994, + "learning_rate": 4.9983199520673345e-06, + "loss": 0.5433, + "step": 382 + }, + { + "epoch": 0.0947082096933729, + "grad_norm": 0.8323514299027938, + "learning_rate": 4.998308017054059e-06, + "loss": 0.5691, + "step": 383 + }, + { + "epoch": 0.09495548961424333, + "grad_norm": 0.8000615986590797, + "learning_rate": 4.998296039812047e-06, + "loss": 0.5553, + "step": 384 + }, + { + "epoch": 0.09520276953511375, + "grad_norm": 0.8339284242974745, + "learning_rate": 4.9982840203415035e-06, + "loss": 0.5579, + "step": 385 + }, + { + "epoch": 0.09545004945598418, + "grad_norm": 0.8174909593172076, + "learning_rate": 4.99827195864263e-06, + "loss": 0.5556, + "step": 386 + }, + { + "epoch": 0.0956973293768546, + "grad_norm": 0.7975261451643757, + "learning_rate": 4.998259854715631e-06, + "loss": 0.5661, + "step": 387 + }, + { + "epoch": 0.09594460929772503, + "grad_norm": 0.7916331855598239, + "learning_rate": 4.998247708560712e-06, + "loss": 0.5409, + "step": 388 + }, + { + "epoch": 0.09619188921859545, + "grad_norm": 0.8426876014737761, + "learning_rate": 4.998235520178076e-06, + "loss": 0.5245, + "step": 389 + }, + { + "epoch": 0.09643916913946587, + "grad_norm": 0.7464229384648007, + "learning_rate": 4.998223289567931e-06, + "loss": 0.5494, + "step": 390 + }, + { + "epoch": 0.0966864490603363, + "grad_norm": 0.8575385758360788, + "learning_rate": 4.998211016730483e-06, + "loss": 0.5247, + "step": 391 + }, + { + "epoch": 0.09693372898120672, + "grad_norm": 0.8239224282844174, + "learning_rate": 4.99819870166594e-06, + "loss": 0.5592, + "step": 392 + }, + { + "epoch": 0.09718100890207715, + "grad_norm": 0.949667436501745, + "learning_rate": 4.998186344374509e-06, + "loss": 0.5425, + "step": 393 + }, + { + "epoch": 0.09742828882294757, + "grad_norm": 0.8371983680182382, + "learning_rate": 4.9981739448564005e-06, + "loss": 0.5675, + "step": 394 + }, + { + "epoch": 0.097675568743818, + "grad_norm": 0.8190906217429191, + "learning_rate": 4.998161503111822e-06, + "loss": 0.6126, + "step": 395 + }, + { + "epoch": 0.09792284866468842, + "grad_norm": 0.814594253339599, + "learning_rate": 4.998149019140987e-06, + "loss": 0.5475, + "step": 396 + }, + { + "epoch": 0.09817012858555885, + "grad_norm": 0.881386637857726, + "learning_rate": 4.998136492944102e-06, + "loss": 0.5577, + "step": 397 + }, + { + "epoch": 0.09841740850642927, + "grad_norm": 0.8200193748866078, + "learning_rate": 4.998123924521383e-06, + "loss": 0.5585, + "step": 398 + }, + { + "epoch": 0.09866468842729971, + "grad_norm": 0.8111722627824661, + "learning_rate": 4.99811131387304e-06, + "loss": 0.5463, + "step": 399 + }, + { + "epoch": 0.09891196834817013, + "grad_norm": 0.8941732458650538, + "learning_rate": 4.9980986609992865e-06, + "loss": 0.5478, + "step": 400 + }, + { + "epoch": 0.09915924826904056, + "grad_norm": 0.9159041142286112, + "learning_rate": 4.998085965900337e-06, + "loss": 0.5607, + "step": 401 + }, + { + "epoch": 0.09940652818991098, + "grad_norm": 0.8149622380453867, + "learning_rate": 4.998073228576406e-06, + "loss": 0.5563, + "step": 402 + }, + { + "epoch": 0.09965380811078141, + "grad_norm": 0.8415024715760436, + "learning_rate": 4.998060449027709e-06, + "loss": 0.5735, + "step": 403 + }, + { + "epoch": 0.09990108803165183, + "grad_norm": 0.8549093887109618, + "learning_rate": 4.998047627254461e-06, + "loss": 0.5285, + "step": 404 + }, + { + "epoch": 0.10014836795252226, + "grad_norm": 0.8654615781606256, + "learning_rate": 4.998034763256879e-06, + "loss": 0.5543, + "step": 405 + }, + { + "epoch": 0.10039564787339268, + "grad_norm": 0.8349697955100309, + "learning_rate": 4.998021857035181e-06, + "loss": 0.5738, + "step": 406 + }, + { + "epoch": 0.1006429277942631, + "grad_norm": 0.8519615357866014, + "learning_rate": 4.998008908589586e-06, + "loss": 0.5524, + "step": 407 + }, + { + "epoch": 0.10089020771513353, + "grad_norm": 0.9177041756133535, + "learning_rate": 4.9979959179203095e-06, + "loss": 0.5927, + "step": 408 + }, + { + "epoch": 0.10113748763600396, + "grad_norm": 0.8578558789111481, + "learning_rate": 4.997982885027575e-06, + "loss": 0.5373, + "step": 409 + }, + { + "epoch": 0.10138476755687438, + "grad_norm": 0.8249578416532023, + "learning_rate": 4.997969809911601e-06, + "loss": 0.5582, + "step": 410 + }, + { + "epoch": 0.1016320474777448, + "grad_norm": 0.8322799534441788, + "learning_rate": 4.997956692572609e-06, + "loss": 0.5436, + "step": 411 + }, + { + "epoch": 0.10187932739861523, + "grad_norm": 0.7946202487427521, + "learning_rate": 4.9979435330108195e-06, + "loss": 0.5561, + "step": 412 + }, + { + "epoch": 0.10212660731948565, + "grad_norm": 0.8311385424691226, + "learning_rate": 4.997930331226456e-06, + "loss": 0.518, + "step": 413 + }, + { + "epoch": 0.10237388724035608, + "grad_norm": 0.8304279162491168, + "learning_rate": 4.997917087219741e-06, + "loss": 0.5412, + "step": 414 + }, + { + "epoch": 0.1026211671612265, + "grad_norm": 0.8877748788219127, + "learning_rate": 4.9979038009909e-06, + "loss": 0.517, + "step": 415 + }, + { + "epoch": 0.10286844708209693, + "grad_norm": 0.8062638798591899, + "learning_rate": 4.997890472540156e-06, + "loss": 0.5317, + "step": 416 + }, + { + "epoch": 0.10311572700296735, + "grad_norm": 0.8638613858007713, + "learning_rate": 4.997877101867734e-06, + "loss": 0.565, + "step": 417 + }, + { + "epoch": 0.10336300692383778, + "grad_norm": 0.8549274979700249, + "learning_rate": 4.997863688973862e-06, + "loss": 0.5333, + "step": 418 + }, + { + "epoch": 0.10361028684470822, + "grad_norm": 0.8238596502819312, + "learning_rate": 4.997850233858765e-06, + "loss": 0.5664, + "step": 419 + }, + { + "epoch": 0.10385756676557864, + "grad_norm": 0.8179769972244695, + "learning_rate": 4.99783673652267e-06, + "loss": 0.5494, + "step": 420 + }, + { + "epoch": 0.10410484668644907, + "grad_norm": 0.872760822491279, + "learning_rate": 4.997823196965806e-06, + "loss": 0.5686, + "step": 421 + }, + { + "epoch": 0.10435212660731949, + "grad_norm": 0.8247079449845919, + "learning_rate": 4.997809615188403e-06, + "loss": 0.5761, + "step": 422 + }, + { + "epoch": 0.10459940652818991, + "grad_norm": 0.8274894849461595, + "learning_rate": 4.9977959911906885e-06, + "loss": 0.546, + "step": 423 + }, + { + "epoch": 0.10484668644906034, + "grad_norm": 0.8200195602952358, + "learning_rate": 4.997782324972894e-06, + "loss": 0.5487, + "step": 424 + }, + { + "epoch": 0.10509396636993076, + "grad_norm": 0.8424425368017951, + "learning_rate": 4.99776861653525e-06, + "loss": 0.5284, + "step": 425 + }, + { + "epoch": 0.10534124629080119, + "grad_norm": 0.8228965203175329, + "learning_rate": 4.9977548658779885e-06, + "loss": 0.5416, + "step": 426 + }, + { + "epoch": 0.10558852621167161, + "grad_norm": 0.8329995359888007, + "learning_rate": 4.997741073001342e-06, + "loss": 0.5271, + "step": 427 + }, + { + "epoch": 0.10583580613254204, + "grad_norm": 0.7637313695891461, + "learning_rate": 4.997727237905543e-06, + "loss": 0.5543, + "step": 428 + }, + { + "epoch": 0.10608308605341246, + "grad_norm": 0.8469415248863096, + "learning_rate": 4.9977133605908264e-06, + "loss": 0.5194, + "step": 429 + }, + { + "epoch": 0.10633036597428289, + "grad_norm": 0.8032002919054423, + "learning_rate": 4.997699441057427e-06, + "loss": 0.5175, + "step": 430 + }, + { + "epoch": 0.10657764589515331, + "grad_norm": 0.789050525965101, + "learning_rate": 4.997685479305577e-06, + "loss": 0.5706, + "step": 431 + }, + { + "epoch": 0.10682492581602374, + "grad_norm": 0.8025536621297401, + "learning_rate": 4.997671475335517e-06, + "loss": 0.5336, + "step": 432 + }, + { + "epoch": 0.10707220573689416, + "grad_norm": 0.8376249206772568, + "learning_rate": 4.99765742914748e-06, + "loss": 0.5502, + "step": 433 + }, + { + "epoch": 0.10731948565776459, + "grad_norm": 0.8226860382728634, + "learning_rate": 4.9976433407417056e-06, + "loss": 0.5352, + "step": 434 + }, + { + "epoch": 0.10756676557863501, + "grad_norm": 0.8196965592986617, + "learning_rate": 4.9976292101184305e-06, + "loss": 0.5642, + "step": 435 + }, + { + "epoch": 0.10781404549950543, + "grad_norm": 0.8027935956184445, + "learning_rate": 4.997615037277894e-06, + "loss": 0.5367, + "step": 436 + }, + { + "epoch": 0.10806132542037586, + "grad_norm": 0.8412182776715479, + "learning_rate": 4.997600822220336e-06, + "loss": 0.5545, + "step": 437 + }, + { + "epoch": 0.1083086053412463, + "grad_norm": 0.7967652156249115, + "learning_rate": 4.997586564945998e-06, + "loss": 0.538, + "step": 438 + }, + { + "epoch": 0.10855588526211672, + "grad_norm": 0.8273469063469596, + "learning_rate": 4.997572265455118e-06, + "loss": 0.5349, + "step": 439 + }, + { + "epoch": 0.10880316518298715, + "grad_norm": 0.816757505466961, + "learning_rate": 4.9975579237479396e-06, + "loss": 0.5598, + "step": 440 + }, + { + "epoch": 0.10905044510385757, + "grad_norm": 0.8179328405367462, + "learning_rate": 4.997543539824706e-06, + "loss": 0.5461, + "step": 441 + }, + { + "epoch": 0.109297725024728, + "grad_norm": 0.8095408289755744, + "learning_rate": 4.997529113685659e-06, + "loss": 0.5515, + "step": 442 + }, + { + "epoch": 0.10954500494559842, + "grad_norm": 0.8916427729760883, + "learning_rate": 4.997514645331042e-06, + "loss": 0.5217, + "step": 443 + }, + { + "epoch": 0.10979228486646884, + "grad_norm": 0.7917353212761363, + "learning_rate": 4.9975001347611005e-06, + "loss": 0.5371, + "step": 444 + }, + { + "epoch": 0.11003956478733927, + "grad_norm": 0.8146465036810688, + "learning_rate": 4.997485581976079e-06, + "loss": 0.5987, + "step": 445 + }, + { + "epoch": 0.1102868447082097, + "grad_norm": 0.8128773978395029, + "learning_rate": 4.997470986976225e-06, + "loss": 0.5481, + "step": 446 + }, + { + "epoch": 0.11053412462908012, + "grad_norm": 0.7961091137875655, + "learning_rate": 4.997456349761783e-06, + "loss": 0.5582, + "step": 447 + }, + { + "epoch": 0.11078140454995054, + "grad_norm": 0.813481787539708, + "learning_rate": 4.997441670333003e-06, + "loss": 0.5443, + "step": 448 + }, + { + "epoch": 0.11102868447082097, + "grad_norm": 0.8158459985879026, + "learning_rate": 4.997426948690131e-06, + "loss": 0.5536, + "step": 449 + }, + { + "epoch": 0.11127596439169139, + "grad_norm": 0.8327146727995968, + "learning_rate": 4.997412184833417e-06, + "loss": 0.5326, + "step": 450 + }, + { + "epoch": 0.11152324431256182, + "grad_norm": 0.7905392611324622, + "learning_rate": 4.99739737876311e-06, + "loss": 0.5415, + "step": 451 + }, + { + "epoch": 0.11177052423343224, + "grad_norm": 0.832901783841983, + "learning_rate": 4.99738253047946e-06, + "loss": 0.5263, + "step": 452 + }, + { + "epoch": 0.11201780415430267, + "grad_norm": 0.8309586500909867, + "learning_rate": 4.997367639982719e-06, + "loss": 0.564, + "step": 453 + }, + { + "epoch": 0.11226508407517309, + "grad_norm": 0.7896942371421249, + "learning_rate": 4.997352707273138e-06, + "loss": 0.5688, + "step": 454 + }, + { + "epoch": 0.11251236399604352, + "grad_norm": 0.8159292622067303, + "learning_rate": 4.9973377323509694e-06, + "loss": 0.5397, + "step": 455 + }, + { + "epoch": 0.11275964391691394, + "grad_norm": 0.8743313979782678, + "learning_rate": 4.997322715216467e-06, + "loss": 0.5425, + "step": 456 + }, + { + "epoch": 0.11300692383778438, + "grad_norm": 0.8145131073267003, + "learning_rate": 4.997307655869883e-06, + "loss": 0.5279, + "step": 457 + }, + { + "epoch": 0.1132542037586548, + "grad_norm": 0.8498643017964868, + "learning_rate": 4.997292554311474e-06, + "loss": 0.5439, + "step": 458 + }, + { + "epoch": 0.11350148367952523, + "grad_norm": 0.876725589506382, + "learning_rate": 4.997277410541493e-06, + "loss": 0.5584, + "step": 459 + }, + { + "epoch": 0.11374876360039565, + "grad_norm": 0.8392765984431413, + "learning_rate": 4.9972622245601986e-06, + "loss": 0.5696, + "step": 460 + }, + { + "epoch": 0.11399604352126608, + "grad_norm": 0.8074407171452053, + "learning_rate": 4.997246996367845e-06, + "loss": 0.5587, + "step": 461 + }, + { + "epoch": 0.1142433234421365, + "grad_norm": 0.8471548223920936, + "learning_rate": 4.997231725964692e-06, + "loss": 0.5244, + "step": 462 + }, + { + "epoch": 0.11449060336300693, + "grad_norm": 0.8650548673944289, + "learning_rate": 4.9972164133509955e-06, + "loss": 0.5689, + "step": 463 + }, + { + "epoch": 0.11473788328387735, + "grad_norm": 0.8510165250718368, + "learning_rate": 4.997201058527016e-06, + "loss": 0.5348, + "step": 464 + }, + { + "epoch": 0.11498516320474778, + "grad_norm": 0.8119157561874207, + "learning_rate": 4.997185661493011e-06, + "loss": 0.5405, + "step": 465 + }, + { + "epoch": 0.1152324431256182, + "grad_norm": 0.8063983800119408, + "learning_rate": 4.997170222249244e-06, + "loss": 0.5366, + "step": 466 + }, + { + "epoch": 0.11547972304648862, + "grad_norm": 0.8326109923777355, + "learning_rate": 4.997154740795972e-06, + "loss": 0.5725, + "step": 467 + }, + { + "epoch": 0.11572700296735905, + "grad_norm": 0.8157543563831605, + "learning_rate": 4.99713921713346e-06, + "loss": 0.5288, + "step": 468 + }, + { + "epoch": 0.11597428288822947, + "grad_norm": 0.8035319772872705, + "learning_rate": 4.997123651261969e-06, + "loss": 0.5257, + "step": 469 + }, + { + "epoch": 0.1162215628090999, + "grad_norm": 0.7921032125004203, + "learning_rate": 4.997108043181762e-06, + "loss": 0.5396, + "step": 470 + }, + { + "epoch": 0.11646884272997032, + "grad_norm": 0.8305520689275583, + "learning_rate": 4.9970923928931026e-06, + "loss": 0.5511, + "step": 471 + }, + { + "epoch": 0.11671612265084075, + "grad_norm": 0.8548918005506956, + "learning_rate": 4.997076700396256e-06, + "loss": 0.5318, + "step": 472 + }, + { + "epoch": 0.11696340257171117, + "grad_norm": 0.8154176124090147, + "learning_rate": 4.997060965691488e-06, + "loss": 0.576, + "step": 473 + }, + { + "epoch": 0.1172106824925816, + "grad_norm": 0.8412409936805049, + "learning_rate": 4.9970451887790626e-06, + "loss": 0.53, + "step": 474 + }, + { + "epoch": 0.11745796241345202, + "grad_norm": 0.8226202940805938, + "learning_rate": 4.997029369659249e-06, + "loss": 0.521, + "step": 475 + }, + { + "epoch": 0.11770524233432245, + "grad_norm": 0.8110352535755092, + "learning_rate": 4.997013508332312e-06, + "loss": 0.5311, + "step": 476 + }, + { + "epoch": 0.11795252225519288, + "grad_norm": 0.86153650176347, + "learning_rate": 4.996997604798522e-06, + "loss": 0.5388, + "step": 477 + }, + { + "epoch": 0.11819980217606331, + "grad_norm": 0.8483377928162458, + "learning_rate": 4.996981659058146e-06, + "loss": 0.5471, + "step": 478 + }, + { + "epoch": 0.11844708209693373, + "grad_norm": 0.8624161681686172, + "learning_rate": 4.9969656711114546e-06, + "loss": 0.528, + "step": 479 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 0.8400291073199923, + "learning_rate": 4.996949640958718e-06, + "loss": 0.5488, + "step": 480 + }, + { + "epoch": 0.11894164193867458, + "grad_norm": 0.8311370315597272, + "learning_rate": 4.996933568600206e-06, + "loss": 0.5563, + "step": 481 + }, + { + "epoch": 0.11918892185954501, + "grad_norm": 0.8870073351460207, + "learning_rate": 4.996917454036192e-06, + "loss": 0.5087, + "step": 482 + }, + { + "epoch": 0.11943620178041543, + "grad_norm": 0.8713695523993678, + "learning_rate": 4.996901297266947e-06, + "loss": 0.5275, + "step": 483 + }, + { + "epoch": 0.11968348170128586, + "grad_norm": 0.9038628652413889, + "learning_rate": 4.996885098292745e-06, + "loss": 0.5439, + "step": 484 + }, + { + "epoch": 0.11993076162215628, + "grad_norm": 0.9167026612271514, + "learning_rate": 4.99686885711386e-06, + "loss": 0.5677, + "step": 485 + }, + { + "epoch": 0.1201780415430267, + "grad_norm": 0.8720438532820466, + "learning_rate": 4.996852573730565e-06, + "loss": 0.5236, + "step": 486 + }, + { + "epoch": 0.12042532146389713, + "grad_norm": 0.7557209197955111, + "learning_rate": 4.996836248143138e-06, + "loss": 0.5104, + "step": 487 + }, + { + "epoch": 0.12067260138476756, + "grad_norm": 0.7989706682687766, + "learning_rate": 4.996819880351851e-06, + "loss": 0.5853, + "step": 488 + }, + { + "epoch": 0.12091988130563798, + "grad_norm": 0.8984411166999768, + "learning_rate": 4.996803470356984e-06, + "loss": 0.5287, + "step": 489 + }, + { + "epoch": 0.1211671612265084, + "grad_norm": 0.8648551900935894, + "learning_rate": 4.996787018158813e-06, + "loss": 0.5419, + "step": 490 + }, + { + "epoch": 0.12141444114737883, + "grad_norm": 0.804324060258821, + "learning_rate": 4.996770523757616e-06, + "loss": 0.5205, + "step": 491 + }, + { + "epoch": 0.12166172106824925, + "grad_norm": 0.9032754593092449, + "learning_rate": 4.996753987153673e-06, + "loss": 0.5159, + "step": 492 + }, + { + "epoch": 0.12190900098911968, + "grad_norm": 0.9533551364220325, + "learning_rate": 4.996737408347262e-06, + "loss": 0.5226, + "step": 493 + }, + { + "epoch": 0.1221562809099901, + "grad_norm": 0.8713670416215546, + "learning_rate": 4.996720787338663e-06, + "loss": 0.5618, + "step": 494 + }, + { + "epoch": 0.12240356083086053, + "grad_norm": 0.8684842954756598, + "learning_rate": 4.996704124128159e-06, + "loss": 0.5639, + "step": 495 + }, + { + "epoch": 0.12265084075173097, + "grad_norm": 0.8952885323193862, + "learning_rate": 4.996687418716031e-06, + "loss": 0.5079, + "step": 496 + }, + { + "epoch": 0.12289812067260139, + "grad_norm": 0.8075323957819724, + "learning_rate": 4.9966706711025596e-06, + "loss": 0.5321, + "step": 497 + }, + { + "epoch": 0.12314540059347182, + "grad_norm": 0.8494422449585496, + "learning_rate": 4.996653881288029e-06, + "loss": 0.5001, + "step": 498 + }, + { + "epoch": 0.12339268051434224, + "grad_norm": 0.8875548481591807, + "learning_rate": 4.996637049272724e-06, + "loss": 0.5364, + "step": 499 + }, + { + "epoch": 0.12363996043521266, + "grad_norm": 0.9020942067293436, + "learning_rate": 4.996620175056928e-06, + "loss": 0.5257, + "step": 500 + }, + { + "epoch": 0.12388724035608309, + "grad_norm": 0.8675810693187886, + "learning_rate": 4.9966032586409264e-06, + "loss": 0.5365, + "step": 501 + }, + { + "epoch": 0.12413452027695351, + "grad_norm": 0.8714654026651982, + "learning_rate": 4.996586300025005e-06, + "loss": 0.5008, + "step": 502 + }, + { + "epoch": 0.12438180019782394, + "grad_norm": 0.8399592723027427, + "learning_rate": 4.99656929920945e-06, + "loss": 0.5151, + "step": 503 + }, + { + "epoch": 0.12462908011869436, + "grad_norm": 0.7778730144888366, + "learning_rate": 4.996552256194551e-06, + "loss": 0.5302, + "step": 504 + }, + { + "epoch": 0.12487636003956479, + "grad_norm": 0.8260924301670444, + "learning_rate": 4.996535170980593e-06, + "loss": 0.5182, + "step": 505 + }, + { + "epoch": 0.1251236399604352, + "grad_norm": 0.8999624536805119, + "learning_rate": 4.996518043567868e-06, + "loss": 0.5232, + "step": 506 + }, + { + "epoch": 0.12537091988130564, + "grad_norm": 0.8691087323090587, + "learning_rate": 4.9965008739566615e-06, + "loss": 0.5532, + "step": 507 + }, + { + "epoch": 0.12561819980217606, + "grad_norm": 0.8672773310284954, + "learning_rate": 4.9964836621472674e-06, + "loss": 0.5627, + "step": 508 + }, + { + "epoch": 0.1258654797230465, + "grad_norm": 0.790002975589831, + "learning_rate": 4.996466408139975e-06, + "loss": 0.5469, + "step": 509 + }, + { + "epoch": 0.1261127596439169, + "grad_norm": 0.8660874103550955, + "learning_rate": 4.996449111935075e-06, + "loss": 0.5392, + "step": 510 + }, + { + "epoch": 0.12636003956478734, + "grad_norm": 0.8196685228026358, + "learning_rate": 4.996431773532863e-06, + "loss": 0.5347, + "step": 511 + }, + { + "epoch": 0.12660731948565776, + "grad_norm": 0.8679109946021679, + "learning_rate": 4.996414392933629e-06, + "loss": 0.5019, + "step": 512 + }, + { + "epoch": 0.12685459940652818, + "grad_norm": 0.8309043787360044, + "learning_rate": 4.996396970137668e-06, + "loss": 0.5288, + "step": 513 + }, + { + "epoch": 0.1271018793273986, + "grad_norm": 0.8410088998221761, + "learning_rate": 4.9963795051452736e-06, + "loss": 0.5466, + "step": 514 + }, + { + "epoch": 0.12734915924826903, + "grad_norm": 0.8358751326108991, + "learning_rate": 4.996361997956743e-06, + "loss": 0.5428, + "step": 515 + }, + { + "epoch": 0.12759643916913946, + "grad_norm": 0.8819431687194557, + "learning_rate": 4.996344448572369e-06, + "loss": 0.5364, + "step": 516 + }, + { + "epoch": 0.12784371909000988, + "grad_norm": 0.8490128009282251, + "learning_rate": 4.9963268569924515e-06, + "loss": 0.5394, + "step": 517 + }, + { + "epoch": 0.1280909990108803, + "grad_norm": 0.8244195316603701, + "learning_rate": 4.996309223217285e-06, + "loss": 0.5176, + "step": 518 + }, + { + "epoch": 0.12833827893175073, + "grad_norm": 0.8235400710586557, + "learning_rate": 4.99629154724717e-06, + "loss": 0.5364, + "step": 519 + }, + { + "epoch": 0.12858555885262116, + "grad_norm": 0.8301561516400873, + "learning_rate": 4.996273829082404e-06, + "loss": 0.5339, + "step": 520 + }, + { + "epoch": 0.12883283877349158, + "grad_norm": 0.8383832053104039, + "learning_rate": 4.996256068723287e-06, + "loss": 0.5079, + "step": 521 + }, + { + "epoch": 0.129080118694362, + "grad_norm": 0.8312860024324458, + "learning_rate": 4.996238266170118e-06, + "loss": 0.5199, + "step": 522 + }, + { + "epoch": 0.12932739861523243, + "grad_norm": 0.8799017293310935, + "learning_rate": 4.9962204214232005e-06, + "loss": 0.499, + "step": 523 + }, + { + "epoch": 0.12957467853610286, + "grad_norm": 0.8835474145873032, + "learning_rate": 4.996202534482832e-06, + "loss": 0.4991, + "step": 524 + }, + { + "epoch": 0.1298219584569733, + "grad_norm": 0.8039735973515206, + "learning_rate": 4.9961846053493194e-06, + "loss": 0.5355, + "step": 525 + }, + { + "epoch": 0.13006923837784373, + "grad_norm": 0.8361160383616958, + "learning_rate": 4.9961666340229635e-06, + "loss": 0.5466, + "step": 526 + }, + { + "epoch": 0.13031651829871416, + "grad_norm": 0.8569201635197241, + "learning_rate": 4.996148620504067e-06, + "loss": 0.5349, + "step": 527 + }, + { + "epoch": 0.13056379821958458, + "grad_norm": 0.8802630630022769, + "learning_rate": 4.996130564792936e-06, + "loss": 0.5228, + "step": 528 + }, + { + "epoch": 0.130811078140455, + "grad_norm": 0.8030195373159202, + "learning_rate": 4.996112466889876e-06, + "loss": 0.5689, + "step": 529 + }, + { + "epoch": 0.13105835806132543, + "grad_norm": 0.7747458285259381, + "learning_rate": 4.996094326795192e-06, + "loss": 0.5297, + "step": 530 + }, + { + "epoch": 0.13130563798219586, + "grad_norm": 0.808534915335967, + "learning_rate": 4.996076144509191e-06, + "loss": 0.5333, + "step": 531 + }, + { + "epoch": 0.13155291790306628, + "grad_norm": 0.8248147055112679, + "learning_rate": 4.996057920032179e-06, + "loss": 0.5338, + "step": 532 + }, + { + "epoch": 0.1318001978239367, + "grad_norm": 0.8369441009968227, + "learning_rate": 4.996039653364466e-06, + "loss": 0.5249, + "step": 533 + }, + { + "epoch": 0.13204747774480713, + "grad_norm": 0.8122213605964168, + "learning_rate": 4.99602134450636e-06, + "loss": 0.535, + "step": 534 + }, + { + "epoch": 0.13229475766567755, + "grad_norm": 0.8156674463115173, + "learning_rate": 4.9960029934581706e-06, + "loss": 0.5331, + "step": 535 + }, + { + "epoch": 0.13254203758654798, + "grad_norm": 0.7744707960410743, + "learning_rate": 4.9959846002202075e-06, + "loss": 0.5362, + "step": 536 + }, + { + "epoch": 0.1327893175074184, + "grad_norm": 0.8384560596557845, + "learning_rate": 4.995966164792782e-06, + "loss": 0.5453, + "step": 537 + }, + { + "epoch": 0.13303659742828883, + "grad_norm": 0.8058804582138253, + "learning_rate": 4.9959476871762055e-06, + "loss": 0.5157, + "step": 538 + }, + { + "epoch": 0.13328387734915925, + "grad_norm": 0.8294745593661792, + "learning_rate": 4.995929167370791e-06, + "loss": 0.5766, + "step": 539 + }, + { + "epoch": 0.13353115727002968, + "grad_norm": 0.8396457626210513, + "learning_rate": 4.99591060537685e-06, + "loss": 0.53, + "step": 540 + }, + { + "epoch": 0.1337784371909001, + "grad_norm": 0.8422569925721224, + "learning_rate": 4.995892001194699e-06, + "loss": 0.5293, + "step": 541 + }, + { + "epoch": 0.13402571711177053, + "grad_norm": 0.8097797303451486, + "learning_rate": 4.995873354824649e-06, + "loss": 0.5193, + "step": 542 + }, + { + "epoch": 0.13427299703264095, + "grad_norm": 0.8282914521596959, + "learning_rate": 4.995854666267017e-06, + "loss": 0.517, + "step": 543 + }, + { + "epoch": 0.13452027695351138, + "grad_norm": 0.8430683316926538, + "learning_rate": 4.99583593552212e-06, + "loss": 0.5572, + "step": 544 + }, + { + "epoch": 0.1347675568743818, + "grad_norm": 0.8118636383334119, + "learning_rate": 4.995817162590273e-06, + "loss": 0.5268, + "step": 545 + }, + { + "epoch": 0.13501483679525222, + "grad_norm": 0.7958627214231268, + "learning_rate": 4.995798347471793e-06, + "loss": 0.5271, + "step": 546 + }, + { + "epoch": 0.13526211671612265, + "grad_norm": 0.8098286264528141, + "learning_rate": 4.995779490166999e-06, + "loss": 0.5318, + "step": 547 + }, + { + "epoch": 0.13550939663699307, + "grad_norm": 0.8301867014748663, + "learning_rate": 4.995760590676209e-06, + "loss": 0.5593, + "step": 548 + }, + { + "epoch": 0.1357566765578635, + "grad_norm": 0.7834537582380657, + "learning_rate": 4.995741648999744e-06, + "loss": 0.5145, + "step": 549 + }, + { + "epoch": 0.13600395647873392, + "grad_norm": 0.8100081692650491, + "learning_rate": 4.995722665137923e-06, + "loss": 0.5353, + "step": 550 + }, + { + "epoch": 0.13625123639960435, + "grad_norm": 0.80189918046591, + "learning_rate": 4.995703639091067e-06, + "loss": 0.5322, + "step": 551 + }, + { + "epoch": 0.13649851632047477, + "grad_norm": 0.8350298859457816, + "learning_rate": 4.995684570859497e-06, + "loss": 0.5705, + "step": 552 + }, + { + "epoch": 0.1367457962413452, + "grad_norm": 0.8182785573997086, + "learning_rate": 4.995665460443536e-06, + "loss": 0.5083, + "step": 553 + }, + { + "epoch": 0.13699307616221562, + "grad_norm": 0.9006658583356458, + "learning_rate": 4.995646307843508e-06, + "loss": 0.54, + "step": 554 + }, + { + "epoch": 0.13724035608308605, + "grad_norm": 0.7948803191646967, + "learning_rate": 4.995627113059734e-06, + "loss": 0.5562, + "step": 555 + }, + { + "epoch": 0.13748763600395647, + "grad_norm": 0.8282246520634873, + "learning_rate": 4.995607876092541e-06, + "loss": 0.5289, + "step": 556 + }, + { + "epoch": 0.1377349159248269, + "grad_norm": 0.8931269666220943, + "learning_rate": 4.995588596942254e-06, + "loss": 0.5478, + "step": 557 + }, + { + "epoch": 0.13798219584569732, + "grad_norm": 0.8291498125431265, + "learning_rate": 4.995569275609197e-06, + "loss": 0.5269, + "step": 558 + }, + { + "epoch": 0.13822947576656774, + "grad_norm": 0.824472874788768, + "learning_rate": 4.995549912093698e-06, + "loss": 0.5203, + "step": 559 + }, + { + "epoch": 0.13847675568743817, + "grad_norm": 0.8380697493482778, + "learning_rate": 4.995530506396084e-06, + "loss": 0.5343, + "step": 560 + }, + { + "epoch": 0.1387240356083086, + "grad_norm": 0.8197126786744611, + "learning_rate": 4.995511058516683e-06, + "loss": 0.5252, + "step": 561 + }, + { + "epoch": 0.13897131552917902, + "grad_norm": 0.8478247444958412, + "learning_rate": 4.995491568455824e-06, + "loss": 0.5136, + "step": 562 + }, + { + "epoch": 0.13921859545004944, + "grad_norm": 0.8005226390245985, + "learning_rate": 4.9954720362138365e-06, + "loss": 0.5269, + "step": 563 + }, + { + "epoch": 0.1394658753709199, + "grad_norm": 0.8227428725452879, + "learning_rate": 4.995452461791049e-06, + "loss": 0.5127, + "step": 564 + }, + { + "epoch": 0.13971315529179032, + "grad_norm": 0.8065785300544993, + "learning_rate": 4.995432845187796e-06, + "loss": 0.5156, + "step": 565 + }, + { + "epoch": 0.13996043521266074, + "grad_norm": 0.8094394004280067, + "learning_rate": 4.9954131864044055e-06, + "loss": 0.5167, + "step": 566 + }, + { + "epoch": 0.14020771513353117, + "grad_norm": 0.8659413683091203, + "learning_rate": 4.995393485441211e-06, + "loss": 0.5371, + "step": 567 + }, + { + "epoch": 0.1404549950544016, + "grad_norm": 0.8210244711493505, + "learning_rate": 4.995373742298545e-06, + "loss": 0.517, + "step": 568 + }, + { + "epoch": 0.14070227497527202, + "grad_norm": 0.7525169650037641, + "learning_rate": 4.995353956976743e-06, + "loss": 0.5185, + "step": 569 + }, + { + "epoch": 0.14094955489614244, + "grad_norm": 0.844339650153545, + "learning_rate": 4.995334129476137e-06, + "loss": 0.4857, + "step": 570 + }, + { + "epoch": 0.14119683481701287, + "grad_norm": 0.8248404365036152, + "learning_rate": 4.995314259797065e-06, + "loss": 0.5301, + "step": 571 + }, + { + "epoch": 0.1414441147378833, + "grad_norm": 0.8040746643584687, + "learning_rate": 4.99529434793986e-06, + "loss": 0.5283, + "step": 572 + }, + { + "epoch": 0.14169139465875372, + "grad_norm": 0.8189779701343372, + "learning_rate": 4.995274393904861e-06, + "loss": 0.5132, + "step": 573 + }, + { + "epoch": 0.14193867457962414, + "grad_norm": 0.8739231936574126, + "learning_rate": 4.995254397692403e-06, + "loss": 0.497, + "step": 574 + }, + { + "epoch": 0.14218595450049457, + "grad_norm": 0.8348211644398329, + "learning_rate": 4.995234359302825e-06, + "loss": 0.5216, + "step": 575 + }, + { + "epoch": 0.142433234421365, + "grad_norm": 0.8033507653933986, + "learning_rate": 4.995214278736467e-06, + "loss": 0.5134, + "step": 576 + }, + { + "epoch": 0.14268051434223541, + "grad_norm": 0.8666478902841073, + "learning_rate": 4.9951941559936655e-06, + "loss": 0.4913, + "step": 577 + }, + { + "epoch": 0.14292779426310584, + "grad_norm": 0.8121736161408633, + "learning_rate": 4.995173991074764e-06, + "loss": 0.5204, + "step": 578 + }, + { + "epoch": 0.14317507418397626, + "grad_norm": 0.8130603144768949, + "learning_rate": 4.995153783980101e-06, + "loss": 0.51, + "step": 579 + }, + { + "epoch": 0.1434223541048467, + "grad_norm": 0.8376602599719988, + "learning_rate": 4.995133534710018e-06, + "loss": 0.5286, + "step": 580 + }, + { + "epoch": 0.1436696340257171, + "grad_norm": 0.8028003299616209, + "learning_rate": 4.995113243264859e-06, + "loss": 0.5154, + "step": 581 + }, + { + "epoch": 0.14391691394658754, + "grad_norm": 0.7863768226347286, + "learning_rate": 4.995092909644966e-06, + "loss": 0.5024, + "step": 582 + }, + { + "epoch": 0.14416419386745796, + "grad_norm": 0.8021001800040607, + "learning_rate": 4.995072533850682e-06, + "loss": 0.5354, + "step": 583 + }, + { + "epoch": 0.1444114737883284, + "grad_norm": 0.8402628321453506, + "learning_rate": 4.995052115882353e-06, + "loss": 0.5249, + "step": 584 + }, + { + "epoch": 0.1446587537091988, + "grad_norm": 0.8331277575923502, + "learning_rate": 4.9950316557403235e-06, + "loss": 0.4983, + "step": 585 + }, + { + "epoch": 0.14490603363006924, + "grad_norm": 0.8188619596157297, + "learning_rate": 4.9950111534249375e-06, + "loss": 0.5357, + "step": 586 + }, + { + "epoch": 0.14515331355093966, + "grad_norm": 0.7985060052509381, + "learning_rate": 4.994990608936544e-06, + "loss": 0.5091, + "step": 587 + }, + { + "epoch": 0.14540059347181009, + "grad_norm": 0.7655405143056095, + "learning_rate": 4.99497002227549e-06, + "loss": 0.5356, + "step": 588 + }, + { + "epoch": 0.1456478733926805, + "grad_norm": 0.8256783148504685, + "learning_rate": 4.9949493934421226e-06, + "loss": 0.5148, + "step": 589 + }, + { + "epoch": 0.14589515331355093, + "grad_norm": 0.8228995595458576, + "learning_rate": 4.99492872243679e-06, + "loss": 0.5198, + "step": 590 + }, + { + "epoch": 0.14614243323442136, + "grad_norm": 0.7983002453848045, + "learning_rate": 4.994908009259843e-06, + "loss": 0.5196, + "step": 591 + }, + { + "epoch": 0.14638971315529178, + "grad_norm": 0.8146201436352071, + "learning_rate": 4.994887253911631e-06, + "loss": 0.5521, + "step": 592 + }, + { + "epoch": 0.1466369930761622, + "grad_norm": 0.8252581518039378, + "learning_rate": 4.9948664563925054e-06, + "loss": 0.5505, + "step": 593 + }, + { + "epoch": 0.14688427299703263, + "grad_norm": 0.8172097763249438, + "learning_rate": 4.994845616702817e-06, + "loss": 0.5245, + "step": 594 + }, + { + "epoch": 0.14713155291790306, + "grad_norm": 0.8308363666090501, + "learning_rate": 4.994824734842918e-06, + "loss": 0.5291, + "step": 595 + }, + { + "epoch": 0.14737883283877348, + "grad_norm": 0.8236367133690201, + "learning_rate": 4.994803810813161e-06, + "loss": 0.5274, + "step": 596 + }, + { + "epoch": 0.1476261127596439, + "grad_norm": 0.8921360057125517, + "learning_rate": 4.9947828446139016e-06, + "loss": 0.5239, + "step": 597 + }, + { + "epoch": 0.14787339268051433, + "grad_norm": 0.8519369558875952, + "learning_rate": 4.994761836245492e-06, + "loss": 0.5568, + "step": 598 + }, + { + "epoch": 0.14812067260138476, + "grad_norm": 0.8593306021580139, + "learning_rate": 4.994740785708289e-06, + "loss": 0.5183, + "step": 599 + }, + { + "epoch": 0.14836795252225518, + "grad_norm": 0.921637121737583, + "learning_rate": 4.994719693002646e-06, + "loss": 0.5235, + "step": 600 + }, + { + "epoch": 0.1486152324431256, + "grad_norm": 0.8399998695591865, + "learning_rate": 4.994698558128923e-06, + "loss": 0.566, + "step": 601 + }, + { + "epoch": 0.14886251236399603, + "grad_norm": 0.7904438558914851, + "learning_rate": 4.994677381087475e-06, + "loss": 0.5184, + "step": 602 + }, + { + "epoch": 0.14910979228486648, + "grad_norm": 0.8107257376120282, + "learning_rate": 4.99465616187866e-06, + "loss": 0.5232, + "step": 603 + }, + { + "epoch": 0.1493570722057369, + "grad_norm": 0.8466653719532753, + "learning_rate": 4.994634900502837e-06, + "loss": 0.5461, + "step": 604 + }, + { + "epoch": 0.14960435212660733, + "grad_norm": 0.8332150966857227, + "learning_rate": 4.994613596960366e-06, + "loss": 0.543, + "step": 605 + }, + { + "epoch": 0.14985163204747776, + "grad_norm": 0.8494372220903758, + "learning_rate": 4.994592251251606e-06, + "loss": 0.5414, + "step": 606 + }, + { + "epoch": 0.15009891196834818, + "grad_norm": 0.836739182677242, + "learning_rate": 4.994570863376918e-06, + "loss": 0.4868, + "step": 607 + }, + { + "epoch": 0.1503461918892186, + "grad_norm": 0.8560168407660868, + "learning_rate": 4.994549433336664e-06, + "loss": 0.5357, + "step": 608 + }, + { + "epoch": 0.15059347181008903, + "grad_norm": 0.8248533428465916, + "learning_rate": 4.9945279611312066e-06, + "loss": 0.5371, + "step": 609 + }, + { + "epoch": 0.15084075173095945, + "grad_norm": 0.8279650051807312, + "learning_rate": 4.9945064467609076e-06, + "loss": 0.5133, + "step": 610 + }, + { + "epoch": 0.15108803165182988, + "grad_norm": 0.8207095863487788, + "learning_rate": 4.994484890226132e-06, + "loss": 0.555, + "step": 611 + }, + { + "epoch": 0.1513353115727003, + "grad_norm": 0.8473352308749146, + "learning_rate": 4.9944632915272426e-06, + "loss": 0.5249, + "step": 612 + }, + { + "epoch": 0.15158259149357073, + "grad_norm": 0.8269313598527205, + "learning_rate": 4.994441650664605e-06, + "loss": 0.4928, + "step": 613 + }, + { + "epoch": 0.15182987141444115, + "grad_norm": 0.8051367309430479, + "learning_rate": 4.994419967638587e-06, + "loss": 0.5126, + "step": 614 + }, + { + "epoch": 0.15207715133531158, + "grad_norm": 0.9477532042198118, + "learning_rate": 4.994398242449552e-06, + "loss": 0.5297, + "step": 615 + }, + { + "epoch": 0.152324431256182, + "grad_norm": 0.8121271507476093, + "learning_rate": 4.994376475097869e-06, + "loss": 0.5315, + "step": 616 + }, + { + "epoch": 0.15257171117705243, + "grad_norm": 0.7962233363129194, + "learning_rate": 4.994354665583906e-06, + "loss": 0.5335, + "step": 617 + }, + { + "epoch": 0.15281899109792285, + "grad_norm": 0.8342385325804931, + "learning_rate": 4.9943328139080304e-06, + "loss": 0.5026, + "step": 618 + }, + { + "epoch": 0.15306627101879328, + "grad_norm": 0.7960492212543056, + "learning_rate": 4.994310920070613e-06, + "loss": 0.5243, + "step": 619 + }, + { + "epoch": 0.1533135509396637, + "grad_norm": 0.8219481066829437, + "learning_rate": 4.994288984072023e-06, + "loss": 0.5422, + "step": 620 + }, + { + "epoch": 0.15356083086053413, + "grad_norm": 0.8965110210914611, + "learning_rate": 4.994267005912631e-06, + "loss": 0.4988, + "step": 621 + }, + { + "epoch": 0.15380811078140455, + "grad_norm": 0.8674803487785987, + "learning_rate": 4.994244985592809e-06, + "loss": 0.5235, + "step": 622 + }, + { + "epoch": 0.15405539070227497, + "grad_norm": 0.8294901688893189, + "learning_rate": 4.99422292311293e-06, + "loss": 0.523, + "step": 623 + }, + { + "epoch": 0.1543026706231454, + "grad_norm": 0.8573169917869833, + "learning_rate": 4.994200818473365e-06, + "loss": 0.5161, + "step": 624 + }, + { + "epoch": 0.15454995054401582, + "grad_norm": 0.8239599780470138, + "learning_rate": 4.994178671674489e-06, + "loss": 0.5249, + "step": 625 + }, + { + "epoch": 0.15479723046488625, + "grad_norm": 0.8573459117967988, + "learning_rate": 4.994156482716677e-06, + "loss": 0.4955, + "step": 626 + }, + { + "epoch": 0.15504451038575667, + "grad_norm": 0.8243090403433094, + "learning_rate": 4.994134251600302e-06, + "loss": 0.5008, + "step": 627 + }, + { + "epoch": 0.1552917903066271, + "grad_norm": 0.8606707487222631, + "learning_rate": 4.994111978325741e-06, + "loss": 0.5306, + "step": 628 + }, + { + "epoch": 0.15553907022749752, + "grad_norm": 0.8525860766404425, + "learning_rate": 4.99408966289337e-06, + "loss": 0.5508, + "step": 629 + }, + { + "epoch": 0.15578635014836795, + "grad_norm": 0.9058751830432761, + "learning_rate": 4.994067305303567e-06, + "loss": 0.532, + "step": 630 + }, + { + "epoch": 0.15603363006923837, + "grad_norm": 0.9144797398646675, + "learning_rate": 4.9940449055567096e-06, + "loss": 0.5025, + "step": 631 + }, + { + "epoch": 0.1562809099901088, + "grad_norm": 0.7934911770321739, + "learning_rate": 4.994022463653176e-06, + "loss": 0.4991, + "step": 632 + }, + { + "epoch": 0.15652818991097922, + "grad_norm": 0.8508702206108024, + "learning_rate": 4.993999979593346e-06, + "loss": 0.5186, + "step": 633 + }, + { + "epoch": 0.15677546983184965, + "grad_norm": 0.9125868217774985, + "learning_rate": 4.993977453377599e-06, + "loss": 0.5141, + "step": 634 + }, + { + "epoch": 0.15702274975272007, + "grad_norm": 0.8314643342376442, + "learning_rate": 4.993954885006316e-06, + "loss": 0.5388, + "step": 635 + }, + { + "epoch": 0.1572700296735905, + "grad_norm": 0.8567507360554277, + "learning_rate": 4.9939322744798795e-06, + "loss": 0.5099, + "step": 636 + }, + { + "epoch": 0.15751730959446092, + "grad_norm": 0.9458345629100596, + "learning_rate": 4.9939096217986706e-06, + "loss": 0.5329, + "step": 637 + }, + { + "epoch": 0.15776458951533134, + "grad_norm": 0.8717178654756879, + "learning_rate": 4.993886926963072e-06, + "loss": 0.5101, + "step": 638 + }, + { + "epoch": 0.15801186943620177, + "grad_norm": 0.9099670727771642, + "learning_rate": 4.993864189973468e-06, + "loss": 0.5197, + "step": 639 + }, + { + "epoch": 0.1582591493570722, + "grad_norm": 0.8690584059000377, + "learning_rate": 4.993841410830243e-06, + "loss": 0.5129, + "step": 640 + }, + { + "epoch": 0.15850642927794262, + "grad_norm": 0.8395676636532586, + "learning_rate": 4.993818589533781e-06, + "loss": 0.5435, + "step": 641 + }, + { + "epoch": 0.15875370919881307, + "grad_norm": 0.8390162044014295, + "learning_rate": 4.993795726084469e-06, + "loss": 0.4987, + "step": 642 + }, + { + "epoch": 0.1590009891196835, + "grad_norm": 0.8628449906727201, + "learning_rate": 4.993772820482693e-06, + "loss": 0.5581, + "step": 643 + }, + { + "epoch": 0.15924826904055392, + "grad_norm": 0.8769274046379205, + "learning_rate": 4.99374987272884e-06, + "loss": 0.5253, + "step": 644 + }, + { + "epoch": 0.15949554896142434, + "grad_norm": 0.8423197065733172, + "learning_rate": 4.9937268828232974e-06, + "loss": 0.5182, + "step": 645 + }, + { + "epoch": 0.15974282888229477, + "grad_norm": 0.8495522941564868, + "learning_rate": 4.993703850766455e-06, + "loss": 0.5176, + "step": 646 + }, + { + "epoch": 0.1599901088031652, + "grad_norm": 0.8566253722599317, + "learning_rate": 4.993680776558701e-06, + "loss": 0.4967, + "step": 647 + }, + { + "epoch": 0.16023738872403562, + "grad_norm": 0.8578979605277798, + "learning_rate": 4.993657660200427e-06, + "loss": 0.5321, + "step": 648 + }, + { + "epoch": 0.16048466864490604, + "grad_norm": 0.8437931759195362, + "learning_rate": 4.993634501692022e-06, + "loss": 0.527, + "step": 649 + }, + { + "epoch": 0.16073194856577647, + "grad_norm": 0.8312148356989333, + "learning_rate": 4.993611301033878e-06, + "loss": 0.5346, + "step": 650 + }, + { + "epoch": 0.1609792284866469, + "grad_norm": 0.8583661441969466, + "learning_rate": 4.993588058226388e-06, + "loss": 0.4911, + "step": 651 + }, + { + "epoch": 0.16122650840751732, + "grad_norm": 0.835364664881372, + "learning_rate": 4.9935647732699426e-06, + "loss": 0.508, + "step": 652 + }, + { + "epoch": 0.16147378832838774, + "grad_norm": 0.9088368040786615, + "learning_rate": 4.993541446164938e-06, + "loss": 0.5452, + "step": 653 + }, + { + "epoch": 0.16172106824925817, + "grad_norm": 0.901014609282978, + "learning_rate": 4.993518076911766e-06, + "loss": 0.5111, + "step": 654 + }, + { + "epoch": 0.1619683481701286, + "grad_norm": 0.8774481271049077, + "learning_rate": 4.993494665510825e-06, + "loss": 0.5152, + "step": 655 + }, + { + "epoch": 0.16221562809099901, + "grad_norm": 0.8413271828266707, + "learning_rate": 4.993471211962508e-06, + "loss": 0.5041, + "step": 656 + }, + { + "epoch": 0.16246290801186944, + "grad_norm": 0.8387696777792335, + "learning_rate": 4.993447716267211e-06, + "loss": 0.4912, + "step": 657 + }, + { + "epoch": 0.16271018793273986, + "grad_norm": 0.8858353120807996, + "learning_rate": 4.993424178425334e-06, + "loss": 0.5147, + "step": 658 + }, + { + "epoch": 0.1629574678536103, + "grad_norm": 0.8753424782843612, + "learning_rate": 4.9934005984372725e-06, + "loss": 0.5108, + "step": 659 + }, + { + "epoch": 0.1632047477744807, + "grad_norm": 0.8503862309995437, + "learning_rate": 4.993376976303426e-06, + "loss": 0.5374, + "step": 660 + }, + { + "epoch": 0.16345202769535114, + "grad_norm": 0.9263303515730487, + "learning_rate": 4.9933533120241925e-06, + "loss": 0.5227, + "step": 661 + }, + { + "epoch": 0.16369930761622156, + "grad_norm": 0.8091103017427309, + "learning_rate": 4.993329605599974e-06, + "loss": 0.4703, + "step": 662 + }, + { + "epoch": 0.163946587537092, + "grad_norm": 0.8369441099222458, + "learning_rate": 4.99330585703117e-06, + "loss": 0.5025, + "step": 663 + }, + { + "epoch": 0.1641938674579624, + "grad_norm": 0.8541506168890921, + "learning_rate": 4.993282066318182e-06, + "loss": 0.5111, + "step": 664 + }, + { + "epoch": 0.16444114737883284, + "grad_norm": 0.8193787625042165, + "learning_rate": 4.9932582334614124e-06, + "loss": 0.5036, + "step": 665 + }, + { + "epoch": 0.16468842729970326, + "grad_norm": 0.8507283745756844, + "learning_rate": 4.993234358461264e-06, + "loss": 0.5103, + "step": 666 + }, + { + "epoch": 0.16493570722057369, + "grad_norm": 0.8357763055370562, + "learning_rate": 4.9932104413181405e-06, + "loss": 0.5099, + "step": 667 + }, + { + "epoch": 0.1651829871414441, + "grad_norm": 0.7974423447277945, + "learning_rate": 4.9931864820324445e-06, + "loss": 0.526, + "step": 668 + }, + { + "epoch": 0.16543026706231453, + "grad_norm": 0.856566840869885, + "learning_rate": 4.993162480604584e-06, + "loss": 0.5296, + "step": 669 + }, + { + "epoch": 0.16567754698318496, + "grad_norm": 0.8202309677412768, + "learning_rate": 4.993138437034963e-06, + "loss": 0.5371, + "step": 670 + }, + { + "epoch": 0.16592482690405538, + "grad_norm": 0.8537092958810982, + "learning_rate": 4.993114351323987e-06, + "loss": 0.5363, + "step": 671 + }, + { + "epoch": 0.1661721068249258, + "grad_norm": 0.8941502744703057, + "learning_rate": 4.993090223472065e-06, + "loss": 0.5275, + "step": 672 + }, + { + "epoch": 0.16641938674579623, + "grad_norm": 0.8602589105442886, + "learning_rate": 4.9930660534796046e-06, + "loss": 0.5173, + "step": 673 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.8547239627917911, + "learning_rate": 4.993041841347012e-06, + "loss": 0.5322, + "step": 674 + }, + { + "epoch": 0.16691394658753708, + "grad_norm": 0.8148644041571959, + "learning_rate": 4.9930175870747e-06, + "loss": 0.544, + "step": 675 + }, + { + "epoch": 0.1671612265084075, + "grad_norm": 0.8546772113475981, + "learning_rate": 4.992993290663076e-06, + "loss": 0.4969, + "step": 676 + }, + { + "epoch": 0.16740850642927793, + "grad_norm": 0.9273768329874028, + "learning_rate": 4.9929689521125515e-06, + "loss": 0.5229, + "step": 677 + }, + { + "epoch": 0.16765578635014836, + "grad_norm": 0.8370005835873986, + "learning_rate": 4.992944571423538e-06, + "loss": 0.4871, + "step": 678 + }, + { + "epoch": 0.16790306627101878, + "grad_norm": 0.8525307890631186, + "learning_rate": 4.992920148596447e-06, + "loss": 0.5083, + "step": 679 + }, + { + "epoch": 0.1681503461918892, + "grad_norm": 0.8290265597255343, + "learning_rate": 4.9928956836316915e-06, + "loss": 0.5309, + "step": 680 + }, + { + "epoch": 0.16839762611275966, + "grad_norm": 0.8254586728696632, + "learning_rate": 4.992871176529686e-06, + "loss": 0.5231, + "step": 681 + }, + { + "epoch": 0.16864490603363008, + "grad_norm": 0.827176590783688, + "learning_rate": 4.992846627290844e-06, + "loss": 0.5417, + "step": 682 + }, + { + "epoch": 0.1688921859545005, + "grad_norm": 0.8415975897359966, + "learning_rate": 4.99282203591558e-06, + "loss": 0.507, + "step": 683 + }, + { + "epoch": 0.16913946587537093, + "grad_norm": 0.8563520086330875, + "learning_rate": 4.99279740240431e-06, + "loss": 0.4986, + "step": 684 + }, + { + "epoch": 0.16938674579624136, + "grad_norm": 0.8614893988702672, + "learning_rate": 4.992772726757451e-06, + "loss": 0.5088, + "step": 685 + }, + { + "epoch": 0.16963402571711178, + "grad_norm": 0.7756849470892396, + "learning_rate": 4.992748008975419e-06, + "loss": 0.5599, + "step": 686 + }, + { + "epoch": 0.1698813056379822, + "grad_norm": 0.8256108345562647, + "learning_rate": 4.992723249058633e-06, + "loss": 0.4938, + "step": 687 + }, + { + "epoch": 0.17012858555885263, + "grad_norm": 0.8319058935689565, + "learning_rate": 4.992698447007511e-06, + "loss": 0.5157, + "step": 688 + }, + { + "epoch": 0.17037586547972305, + "grad_norm": 0.8374831493414546, + "learning_rate": 4.992673602822472e-06, + "loss": 0.5417, + "step": 689 + }, + { + "epoch": 0.17062314540059348, + "grad_norm": 0.8135538110680443, + "learning_rate": 4.992648716503936e-06, + "loss": 0.5134, + "step": 690 + }, + { + "epoch": 0.1708704253214639, + "grad_norm": 0.9041030573320586, + "learning_rate": 4.9926237880523235e-06, + "loss": 0.5517, + "step": 691 + }, + { + "epoch": 0.17111770524233433, + "grad_norm": 0.895349923324367, + "learning_rate": 4.9925988174680565e-06, + "loss": 0.5376, + "step": 692 + }, + { + "epoch": 0.17136498516320475, + "grad_norm": 0.94778749124042, + "learning_rate": 4.992573804751557e-06, + "loss": 0.5212, + "step": 693 + }, + { + "epoch": 0.17161226508407518, + "grad_norm": 0.8730544070580147, + "learning_rate": 4.992548749903247e-06, + "loss": 0.5119, + "step": 694 + }, + { + "epoch": 0.1718595450049456, + "grad_norm": 0.8316096946499686, + "learning_rate": 4.9925236529235495e-06, + "loss": 0.5234, + "step": 695 + }, + { + "epoch": 0.17210682492581603, + "grad_norm": 0.8724157769080901, + "learning_rate": 4.992498513812891e-06, + "loss": 0.5287, + "step": 696 + }, + { + "epoch": 0.17235410484668645, + "grad_norm": 0.919022031040849, + "learning_rate": 4.992473332571696e-06, + "loss": 0.487, + "step": 697 + }, + { + "epoch": 0.17260138476755688, + "grad_norm": 0.8603603196815058, + "learning_rate": 4.9924481092003874e-06, + "loss": 0.5009, + "step": 698 + }, + { + "epoch": 0.1728486646884273, + "grad_norm": 0.8609657848997537, + "learning_rate": 4.992422843699394e-06, + "loss": 0.537, + "step": 699 + }, + { + "epoch": 0.17309594460929772, + "grad_norm": 0.9497038354277341, + "learning_rate": 4.992397536069143e-06, + "loss": 0.5157, + "step": 700 + }, + { + "epoch": 0.17334322453016815, + "grad_norm": 0.925901592259855, + "learning_rate": 4.99237218631006e-06, + "loss": 0.5256, + "step": 701 + }, + { + "epoch": 0.17359050445103857, + "grad_norm": 0.8059602370847663, + "learning_rate": 4.992346794422576e-06, + "loss": 0.509, + "step": 702 + }, + { + "epoch": 0.173837784371909, + "grad_norm": 0.8093787052289616, + "learning_rate": 4.992321360407119e-06, + "loss": 0.5125, + "step": 703 + }, + { + "epoch": 0.17408506429277942, + "grad_norm": 0.8354283265994193, + "learning_rate": 4.992295884264119e-06, + "loss": 0.5071, + "step": 704 + }, + { + "epoch": 0.17433234421364985, + "grad_norm": 0.8442582808343858, + "learning_rate": 4.992270365994006e-06, + "loss": 0.53, + "step": 705 + }, + { + "epoch": 0.17457962413452027, + "grad_norm": 0.7836454779510944, + "learning_rate": 4.9922448055972125e-06, + "loss": 0.5214, + "step": 706 + }, + { + "epoch": 0.1748269040553907, + "grad_norm": 0.840735881666944, + "learning_rate": 4.99221920307417e-06, + "loss": 0.5171, + "step": 707 + }, + { + "epoch": 0.17507418397626112, + "grad_norm": 0.8829052650768754, + "learning_rate": 4.992193558425311e-06, + "loss": 0.5168, + "step": 708 + }, + { + "epoch": 0.17532146389713155, + "grad_norm": 0.8729350937304439, + "learning_rate": 4.9921678716510705e-06, + "loss": 0.4853, + "step": 709 + }, + { + "epoch": 0.17556874381800197, + "grad_norm": 0.9103933430094363, + "learning_rate": 4.9921421427518804e-06, + "loss": 0.4959, + "step": 710 + }, + { + "epoch": 0.1758160237388724, + "grad_norm": 0.8035835757258305, + "learning_rate": 4.992116371728176e-06, + "loss": 0.4941, + "step": 711 + }, + { + "epoch": 0.17606330365974282, + "grad_norm": 0.8305929115660557, + "learning_rate": 4.9920905585803945e-06, + "loss": 0.4876, + "step": 712 + }, + { + "epoch": 0.17631058358061324, + "grad_norm": 0.7978374484195752, + "learning_rate": 4.992064703308971e-06, + "loss": 0.5257, + "step": 713 + }, + { + "epoch": 0.17655786350148367, + "grad_norm": 0.8418807640743572, + "learning_rate": 4.992038805914343e-06, + "loss": 0.513, + "step": 714 + }, + { + "epoch": 0.1768051434223541, + "grad_norm": 0.8330452494475705, + "learning_rate": 4.992012866396948e-06, + "loss": 0.4956, + "step": 715 + }, + { + "epoch": 0.17705242334322452, + "grad_norm": 0.8661744791086904, + "learning_rate": 4.991986884757224e-06, + "loss": 0.4939, + "step": 716 + }, + { + "epoch": 0.17729970326409494, + "grad_norm": 0.8486526830433232, + "learning_rate": 4.991960860995611e-06, + "loss": 0.4879, + "step": 717 + }, + { + "epoch": 0.17754698318496537, + "grad_norm": 0.8352424841247117, + "learning_rate": 4.991934795112548e-06, + "loss": 0.4961, + "step": 718 + }, + { + "epoch": 0.1777942631058358, + "grad_norm": 0.8789343341243142, + "learning_rate": 4.991908687108477e-06, + "loss": 0.4897, + "step": 719 + }, + { + "epoch": 0.17804154302670624, + "grad_norm": 0.823652151046163, + "learning_rate": 4.991882536983839e-06, + "loss": 0.512, + "step": 720 + }, + { + "epoch": 0.17828882294757667, + "grad_norm": 0.8134645433823909, + "learning_rate": 4.991856344739073e-06, + "loss": 0.5404, + "step": 721 + }, + { + "epoch": 0.1785361028684471, + "grad_norm": 0.8501968648399841, + "learning_rate": 4.991830110374626e-06, + "loss": 0.5137, + "step": 722 + }, + { + "epoch": 0.17878338278931752, + "grad_norm": 0.801152491159058, + "learning_rate": 4.991803833890939e-06, + "loss": 0.5255, + "step": 723 + }, + { + "epoch": 0.17903066271018794, + "grad_norm": 0.8249255075508455, + "learning_rate": 4.991777515288457e-06, + "loss": 0.518, + "step": 724 + }, + { + "epoch": 0.17927794263105837, + "grad_norm": 0.8299863504018892, + "learning_rate": 4.991751154567625e-06, + "loss": 0.4974, + "step": 725 + }, + { + "epoch": 0.1795252225519288, + "grad_norm": 0.902641066855043, + "learning_rate": 4.991724751728888e-06, + "loss": 0.4966, + "step": 726 + }, + { + "epoch": 0.17977250247279922, + "grad_norm": 0.8069318501808427, + "learning_rate": 4.991698306772692e-06, + "loss": 0.5069, + "step": 727 + }, + { + "epoch": 0.18001978239366964, + "grad_norm": 0.8545994506172374, + "learning_rate": 4.991671819699484e-06, + "loss": 0.4956, + "step": 728 + }, + { + "epoch": 0.18026706231454007, + "grad_norm": 0.8164400523644465, + "learning_rate": 4.9916452905097135e-06, + "loss": 0.5065, + "step": 729 + }, + { + "epoch": 0.1805143422354105, + "grad_norm": 0.8086218873858831, + "learning_rate": 4.991618719203827e-06, + "loss": 0.5011, + "step": 730 + }, + { + "epoch": 0.18076162215628092, + "grad_norm": 0.8552002311074453, + "learning_rate": 4.991592105782274e-06, + "loss": 0.5104, + "step": 731 + }, + { + "epoch": 0.18100890207715134, + "grad_norm": 0.8717792545878794, + "learning_rate": 4.9915654502455045e-06, + "loss": 0.534, + "step": 732 + }, + { + "epoch": 0.18125618199802176, + "grad_norm": 0.7849441865788286, + "learning_rate": 4.9915387525939695e-06, + "loss": 0.4982, + "step": 733 + }, + { + "epoch": 0.1815034619188922, + "grad_norm": 0.8340027119415854, + "learning_rate": 4.99151201282812e-06, + "loss": 0.5142, + "step": 734 + }, + { + "epoch": 0.18175074183976261, + "grad_norm": 0.8413817442374992, + "learning_rate": 4.991485230948407e-06, + "loss": 0.5332, + "step": 735 + }, + { + "epoch": 0.18199802176063304, + "grad_norm": 0.8452871469316092, + "learning_rate": 4.991458406955285e-06, + "loss": 0.5022, + "step": 736 + }, + { + "epoch": 0.18224530168150346, + "grad_norm": 0.872847298100631, + "learning_rate": 4.991431540849206e-06, + "loss": 0.4867, + "step": 737 + }, + { + "epoch": 0.1824925816023739, + "grad_norm": 0.8514132061403205, + "learning_rate": 4.991404632630625e-06, + "loss": 0.5106, + "step": 738 + }, + { + "epoch": 0.1827398615232443, + "grad_norm": 0.8949447429217282, + "learning_rate": 4.991377682299996e-06, + "loss": 0.4973, + "step": 739 + }, + { + "epoch": 0.18298714144411474, + "grad_norm": 0.8326410302920144, + "learning_rate": 4.991350689857775e-06, + "loss": 0.5125, + "step": 740 + }, + { + "epoch": 0.18323442136498516, + "grad_norm": 0.8675242195523758, + "learning_rate": 4.9913236553044185e-06, + "loss": 0.4978, + "step": 741 + }, + { + "epoch": 0.1834817012858556, + "grad_norm": 0.801469598537518, + "learning_rate": 4.991296578640383e-06, + "loss": 0.5135, + "step": 742 + }, + { + "epoch": 0.183728981206726, + "grad_norm": 0.8201428570448723, + "learning_rate": 4.991269459866126e-06, + "loss": 0.5144, + "step": 743 + }, + { + "epoch": 0.18397626112759644, + "grad_norm": 0.8623718853955257, + "learning_rate": 4.991242298982107e-06, + "loss": 0.4808, + "step": 744 + }, + { + "epoch": 0.18422354104846686, + "grad_norm": 0.7946767655860458, + "learning_rate": 4.991215095988784e-06, + "loss": 0.5226, + "step": 745 + }, + { + "epoch": 0.18447082096933728, + "grad_norm": 0.8439539102467273, + "learning_rate": 4.991187850886618e-06, + "loss": 0.4925, + "step": 746 + }, + { + "epoch": 0.1847181008902077, + "grad_norm": 0.7919859124879289, + "learning_rate": 4.991160563676067e-06, + "loss": 0.4916, + "step": 747 + }, + { + "epoch": 0.18496538081107813, + "grad_norm": 0.8643732765638406, + "learning_rate": 4.991133234357595e-06, + "loss": 0.4911, + "step": 748 + }, + { + "epoch": 0.18521266073194856, + "grad_norm": 0.8750805346885794, + "learning_rate": 4.9911058629316615e-06, + "loss": 0.4655, + "step": 749 + }, + { + "epoch": 0.18545994065281898, + "grad_norm": 0.8199097338629083, + "learning_rate": 4.991078449398732e-06, + "loss": 0.523, + "step": 750 + }, + { + "epoch": 0.1857072205736894, + "grad_norm": 0.8894643167179568, + "learning_rate": 4.991050993759268e-06, + "loss": 0.4973, + "step": 751 + }, + { + "epoch": 0.18595450049455983, + "grad_norm": 0.8224624160630544, + "learning_rate": 4.991023496013734e-06, + "loss": 0.4931, + "step": 752 + }, + { + "epoch": 0.18620178041543026, + "grad_norm": 0.8215579795094418, + "learning_rate": 4.990995956162593e-06, + "loss": 0.5023, + "step": 753 + }, + { + "epoch": 0.18644906033630068, + "grad_norm": 0.8083894480373339, + "learning_rate": 4.990968374206314e-06, + "loss": 0.5224, + "step": 754 + }, + { + "epoch": 0.1866963402571711, + "grad_norm": 0.8521252726155766, + "learning_rate": 4.9909407501453625e-06, + "loss": 0.511, + "step": 755 + }, + { + "epoch": 0.18694362017804153, + "grad_norm": 0.8703568142440704, + "learning_rate": 4.990913083980202e-06, + "loss": 0.5258, + "step": 756 + }, + { + "epoch": 0.18719090009891196, + "grad_norm": 0.84010044458961, + "learning_rate": 4.990885375711304e-06, + "loss": 0.5004, + "step": 757 + }, + { + "epoch": 0.18743818001978238, + "grad_norm": 0.8318706342323815, + "learning_rate": 4.990857625339135e-06, + "loss": 0.5028, + "step": 758 + }, + { + "epoch": 0.18768545994065283, + "grad_norm": 0.8416207130288796, + "learning_rate": 4.9908298328641645e-06, + "loss": 0.5053, + "step": 759 + }, + { + "epoch": 0.18793273986152326, + "grad_norm": 0.8888518172116866, + "learning_rate": 4.9908019982868625e-06, + "loss": 0.5172, + "step": 760 + }, + { + "epoch": 0.18818001978239368, + "grad_norm": 0.8502389509777805, + "learning_rate": 4.990774121607699e-06, + "loss": 0.5059, + "step": 761 + }, + { + "epoch": 0.1884272997032641, + "grad_norm": 0.8610956280514308, + "learning_rate": 4.990746202827145e-06, + "loss": 0.535, + "step": 762 + }, + { + "epoch": 0.18867457962413453, + "grad_norm": 0.7972219564385539, + "learning_rate": 4.990718241945673e-06, + "loss": 0.5218, + "step": 763 + }, + { + "epoch": 0.18892185954500496, + "grad_norm": 0.7805412892800699, + "learning_rate": 4.990690238963756e-06, + "loss": 0.509, + "step": 764 + }, + { + "epoch": 0.18916913946587538, + "grad_norm": 0.8236493383786434, + "learning_rate": 4.990662193881865e-06, + "loss": 0.5031, + "step": 765 + }, + { + "epoch": 0.1894164193867458, + "grad_norm": 0.8999257859616762, + "learning_rate": 4.9906341067004784e-06, + "loss": 0.488, + "step": 766 + }, + { + "epoch": 0.18966369930761623, + "grad_norm": 0.8708077868377747, + "learning_rate": 4.990605977420067e-06, + "loss": 0.5233, + "step": 767 + }, + { + "epoch": 0.18991097922848665, + "grad_norm": 0.8420116856277404, + "learning_rate": 4.990577806041108e-06, + "loss": 0.5135, + "step": 768 + }, + { + "epoch": 0.19015825914935708, + "grad_norm": 0.8548814632547316, + "learning_rate": 4.990549592564076e-06, + "loss": 0.4867, + "step": 769 + }, + { + "epoch": 0.1904055390702275, + "grad_norm": 0.8268530338638478, + "learning_rate": 4.99052133698945e-06, + "loss": 0.4688, + "step": 770 + }, + { + "epoch": 0.19065281899109793, + "grad_norm": 0.8320420070519919, + "learning_rate": 4.990493039317707e-06, + "loss": 0.5263, + "step": 771 + }, + { + "epoch": 0.19090009891196835, + "grad_norm": 0.8365840878509793, + "learning_rate": 4.990464699549325e-06, + "loss": 0.4931, + "step": 772 + }, + { + "epoch": 0.19114737883283878, + "grad_norm": 0.8101506431051484, + "learning_rate": 4.990436317684782e-06, + "loss": 0.5262, + "step": 773 + }, + { + "epoch": 0.1913946587537092, + "grad_norm": 0.8647254791244104, + "learning_rate": 4.990407893724561e-06, + "loss": 0.5095, + "step": 774 + }, + { + "epoch": 0.19164193867457963, + "grad_norm": 0.813120102249058, + "learning_rate": 4.990379427669138e-06, + "loss": 0.4984, + "step": 775 + }, + { + "epoch": 0.19188921859545005, + "grad_norm": 0.8042977000267384, + "learning_rate": 4.990350919518997e-06, + "loss": 0.4978, + "step": 776 + }, + { + "epoch": 0.19213649851632048, + "grad_norm": 0.8483867646954429, + "learning_rate": 4.9903223692746196e-06, + "loss": 0.4732, + "step": 777 + }, + { + "epoch": 0.1923837784371909, + "grad_norm": 0.8647994593068955, + "learning_rate": 4.990293776936488e-06, + "loss": 0.545, + "step": 778 + }, + { + "epoch": 0.19263105835806132, + "grad_norm": 0.8740789736428879, + "learning_rate": 4.990265142505085e-06, + "loss": 0.4954, + "step": 779 + }, + { + "epoch": 0.19287833827893175, + "grad_norm": 0.8174292157713233, + "learning_rate": 4.990236465980896e-06, + "loss": 0.4935, + "step": 780 + }, + { + "epoch": 0.19312561819980217, + "grad_norm": 0.8630949715708419, + "learning_rate": 4.990207747364404e-06, + "loss": 0.5115, + "step": 781 + }, + { + "epoch": 0.1933728981206726, + "grad_norm": 0.8452101849069688, + "learning_rate": 4.9901789866560955e-06, + "loss": 0.5075, + "step": 782 + }, + { + "epoch": 0.19362017804154302, + "grad_norm": 0.8222049299567881, + "learning_rate": 4.990150183856457e-06, + "loss": 0.4868, + "step": 783 + }, + { + "epoch": 0.19386745796241345, + "grad_norm": 0.8045405700762853, + "learning_rate": 4.990121338965975e-06, + "loss": 0.4996, + "step": 784 + }, + { + "epoch": 0.19411473788328387, + "grad_norm": 0.839462282900959, + "learning_rate": 4.9900924519851354e-06, + "loss": 0.513, + "step": 785 + }, + { + "epoch": 0.1943620178041543, + "grad_norm": 0.8129960974230438, + "learning_rate": 4.990063522914429e-06, + "loss": 0.5067, + "step": 786 + }, + { + "epoch": 0.19460929772502472, + "grad_norm": 0.8498358711521139, + "learning_rate": 4.990034551754344e-06, + "loss": 0.4971, + "step": 787 + }, + { + "epoch": 0.19485657764589515, + "grad_norm": 0.8342170766695094, + "learning_rate": 4.9900055385053696e-06, + "loss": 0.5785, + "step": 788 + }, + { + "epoch": 0.19510385756676557, + "grad_norm": 0.7720839055732265, + "learning_rate": 4.9899764831679954e-06, + "loss": 0.5091, + "step": 789 + }, + { + "epoch": 0.195351137487636, + "grad_norm": 0.8448261448148362, + "learning_rate": 4.989947385742715e-06, + "loss": 0.5128, + "step": 790 + }, + { + "epoch": 0.19559841740850642, + "grad_norm": 0.8355306276455249, + "learning_rate": 4.9899182462300175e-06, + "loss": 0.5334, + "step": 791 + }, + { + "epoch": 0.19584569732937684, + "grad_norm": 0.8108028837754382, + "learning_rate": 4.989889064630397e-06, + "loss": 0.5037, + "step": 792 + }, + { + "epoch": 0.19609297725024727, + "grad_norm": 0.8099327440790443, + "learning_rate": 4.989859840944346e-06, + "loss": 0.5074, + "step": 793 + }, + { + "epoch": 0.1963402571711177, + "grad_norm": 0.8540803863374226, + "learning_rate": 4.989830575172361e-06, + "loss": 0.4785, + "step": 794 + }, + { + "epoch": 0.19658753709198812, + "grad_norm": 0.8361945024114827, + "learning_rate": 4.9898012673149325e-06, + "loss": 0.4938, + "step": 795 + }, + { + "epoch": 0.19683481701285854, + "grad_norm": 0.8603101439414645, + "learning_rate": 4.989771917372559e-06, + "loss": 0.458, + "step": 796 + }, + { + "epoch": 0.19708209693372897, + "grad_norm": 0.8623322983540171, + "learning_rate": 4.989742525345736e-06, + "loss": 0.5032, + "step": 797 + }, + { + "epoch": 0.19732937685459942, + "grad_norm": 0.8473115195100572, + "learning_rate": 4.9897130912349585e-06, + "loss": 0.4936, + "step": 798 + }, + { + "epoch": 0.19757665677546984, + "grad_norm": 0.7940684143161678, + "learning_rate": 4.9896836150407256e-06, + "loss": 0.5473, + "step": 799 + }, + { + "epoch": 0.19782393669634027, + "grad_norm": 0.8272717457807083, + "learning_rate": 4.989654096763537e-06, + "loss": 0.5171, + "step": 800 + }, + { + "epoch": 0.1980712166172107, + "grad_norm": 0.8902992860806108, + "learning_rate": 4.989624536403888e-06, + "loss": 0.5375, + "step": 801 + }, + { + "epoch": 0.19831849653808112, + "grad_norm": 0.9101693315058536, + "learning_rate": 4.989594933962281e-06, + "loss": 0.4882, + "step": 802 + }, + { + "epoch": 0.19856577645895154, + "grad_norm": 0.823665912051704, + "learning_rate": 4.989565289439216e-06, + "loss": 0.5004, + "step": 803 + }, + { + "epoch": 0.19881305637982197, + "grad_norm": 0.8537141855296777, + "learning_rate": 4.9895356028351936e-06, + "loss": 0.5057, + "step": 804 + }, + { + "epoch": 0.1990603363006924, + "grad_norm": 0.8499363471459407, + "learning_rate": 4.989505874150716e-06, + "loss": 0.5051, + "step": 805 + }, + { + "epoch": 0.19930761622156282, + "grad_norm": 0.8713007651523925, + "learning_rate": 4.989476103386285e-06, + "loss": 0.5093, + "step": 806 + }, + { + "epoch": 0.19955489614243324, + "grad_norm": 0.8878831506094113, + "learning_rate": 4.9894462905424035e-06, + "loss": 0.5067, + "step": 807 + }, + { + "epoch": 0.19980217606330367, + "grad_norm": 0.8677379139060017, + "learning_rate": 4.989416435619577e-06, + "loss": 0.5013, + "step": 808 + }, + { + "epoch": 0.2000494559841741, + "grad_norm": 0.8442842813703827, + "learning_rate": 4.98938653861831e-06, + "loss": 0.5048, + "step": 809 + }, + { + "epoch": 0.20029673590504452, + "grad_norm": 0.8111674446939234, + "learning_rate": 4.989356599539106e-06, + "loss": 0.5167, + "step": 810 + }, + { + "epoch": 0.20054401582591494, + "grad_norm": 0.8954173542551098, + "learning_rate": 4.989326618382471e-06, + "loss": 0.5147, + "step": 811 + }, + { + "epoch": 0.20079129574678536, + "grad_norm": 0.9233165216979647, + "learning_rate": 4.9892965951489154e-06, + "loss": 0.5064, + "step": 812 + }, + { + "epoch": 0.2010385756676558, + "grad_norm": 0.9134666739113686, + "learning_rate": 4.989266529838943e-06, + "loss": 0.5009, + "step": 813 + }, + { + "epoch": 0.2012858555885262, + "grad_norm": 0.8492218856969446, + "learning_rate": 4.989236422453064e-06, + "loss": 0.5124, + "step": 814 + }, + { + "epoch": 0.20153313550939664, + "grad_norm": 0.8652036940944711, + "learning_rate": 4.989206272991785e-06, + "loss": 0.5366, + "step": 815 + }, + { + "epoch": 0.20178041543026706, + "grad_norm": 0.8418660565061913, + "learning_rate": 4.9891760814556186e-06, + "loss": 0.5105, + "step": 816 + }, + { + "epoch": 0.2020276953511375, + "grad_norm": 0.8878975245185788, + "learning_rate": 4.989145847845074e-06, + "loss": 0.5132, + "step": 817 + }, + { + "epoch": 0.2022749752720079, + "grad_norm": 0.8315141988567767, + "learning_rate": 4.989115572160661e-06, + "loss": 0.5008, + "step": 818 + }, + { + "epoch": 0.20252225519287834, + "grad_norm": 0.8376037839192603, + "learning_rate": 4.989085254402892e-06, + "loss": 0.5057, + "step": 819 + }, + { + "epoch": 0.20276953511374876, + "grad_norm": 0.8037174338863466, + "learning_rate": 4.98905489457228e-06, + "loss": 0.4975, + "step": 820 + }, + { + "epoch": 0.20301681503461919, + "grad_norm": 0.8766782392881515, + "learning_rate": 4.9890244926693385e-06, + "loss": 0.4667, + "step": 821 + }, + { + "epoch": 0.2032640949554896, + "grad_norm": 0.9181666145995278, + "learning_rate": 4.98899404869458e-06, + "loss": 0.5197, + "step": 822 + }, + { + "epoch": 0.20351137487636003, + "grad_norm": 0.8421985589488679, + "learning_rate": 4.98896356264852e-06, + "loss": 0.4693, + "step": 823 + }, + { + "epoch": 0.20375865479723046, + "grad_norm": 0.8414578412087721, + "learning_rate": 4.988933034531674e-06, + "loss": 0.4959, + "step": 824 + }, + { + "epoch": 0.20400593471810088, + "grad_norm": 0.8207895326319039, + "learning_rate": 4.988902464344557e-06, + "loss": 0.4968, + "step": 825 + }, + { + "epoch": 0.2042532146389713, + "grad_norm": 0.8258058754282276, + "learning_rate": 4.988871852087687e-06, + "loss": 0.4806, + "step": 826 + }, + { + "epoch": 0.20450049455984173, + "grad_norm": 0.7911811305702641, + "learning_rate": 4.988841197761581e-06, + "loss": 0.5105, + "step": 827 + }, + { + "epoch": 0.20474777448071216, + "grad_norm": 0.8067032711527126, + "learning_rate": 4.988810501366756e-06, + "loss": 0.4988, + "step": 828 + }, + { + "epoch": 0.20499505440158258, + "grad_norm": 0.8248411254954989, + "learning_rate": 4.988779762903733e-06, + "loss": 0.4679, + "step": 829 + }, + { + "epoch": 0.205242334322453, + "grad_norm": 0.8075962457898619, + "learning_rate": 4.98874898237303e-06, + "loss": 0.4802, + "step": 830 + }, + { + "epoch": 0.20548961424332343, + "grad_norm": 0.8694350300009082, + "learning_rate": 4.988718159775168e-06, + "loss": 0.512, + "step": 831 + }, + { + "epoch": 0.20573689416419386, + "grad_norm": 0.838526356063612, + "learning_rate": 4.988687295110667e-06, + "loss": 0.4772, + "step": 832 + }, + { + "epoch": 0.20598417408506428, + "grad_norm": 0.8581376012516917, + "learning_rate": 4.98865638838005e-06, + "loss": 0.4997, + "step": 833 + }, + { + "epoch": 0.2062314540059347, + "grad_norm": 0.8508868762945916, + "learning_rate": 4.988625439583838e-06, + "loss": 0.5016, + "step": 834 + }, + { + "epoch": 0.20647873392680513, + "grad_norm": 0.8649500124082152, + "learning_rate": 4.988594448722556e-06, + "loss": 0.4915, + "step": 835 + }, + { + "epoch": 0.20672601384767555, + "grad_norm": 0.8094411363162982, + "learning_rate": 4.988563415796726e-06, + "loss": 0.5196, + "step": 836 + }, + { + "epoch": 0.206973293768546, + "grad_norm": 0.8414623331186055, + "learning_rate": 4.988532340806873e-06, + "loss": 0.5158, + "step": 837 + }, + { + "epoch": 0.20722057368941643, + "grad_norm": 0.8243945840705411, + "learning_rate": 4.9885012237535235e-06, + "loss": 0.4897, + "step": 838 + }, + { + "epoch": 0.20746785361028686, + "grad_norm": 0.9140469955319724, + "learning_rate": 4.988470064637202e-06, + "loss": 0.4759, + "step": 839 + }, + { + "epoch": 0.20771513353115728, + "grad_norm": 0.8558925215879722, + "learning_rate": 4.988438863458436e-06, + "loss": 0.5119, + "step": 840 + }, + { + "epoch": 0.2079624134520277, + "grad_norm": 0.815440252440534, + "learning_rate": 4.988407620217752e-06, + "loss": 0.4945, + "step": 841 + }, + { + "epoch": 0.20820969337289813, + "grad_norm": 0.8232200098822217, + "learning_rate": 4.988376334915679e-06, + "loss": 0.4996, + "step": 842 + }, + { + "epoch": 0.20845697329376855, + "grad_norm": 0.8691564428161908, + "learning_rate": 4.988345007552746e-06, + "loss": 0.5097, + "step": 843 + }, + { + "epoch": 0.20870425321463898, + "grad_norm": 0.8264653567410514, + "learning_rate": 4.9883136381294816e-06, + "loss": 0.5119, + "step": 844 + }, + { + "epoch": 0.2089515331355094, + "grad_norm": 0.8785002227076522, + "learning_rate": 4.988282226646417e-06, + "loss": 0.514, + "step": 845 + }, + { + "epoch": 0.20919881305637983, + "grad_norm": 0.8023528924268092, + "learning_rate": 4.988250773104083e-06, + "loss": 0.5428, + "step": 846 + }, + { + "epoch": 0.20944609297725025, + "grad_norm": 0.8211524437917096, + "learning_rate": 4.98821927750301e-06, + "loss": 0.5256, + "step": 847 + }, + { + "epoch": 0.20969337289812068, + "grad_norm": 0.8010942784871227, + "learning_rate": 4.988187739843731e-06, + "loss": 0.5346, + "step": 848 + }, + { + "epoch": 0.2099406528189911, + "grad_norm": 0.9044984773545841, + "learning_rate": 4.988156160126781e-06, + "loss": 0.5038, + "step": 849 + }, + { + "epoch": 0.21018793273986153, + "grad_norm": 0.8706964927904601, + "learning_rate": 4.98812453835269e-06, + "loss": 0.507, + "step": 850 + }, + { + "epoch": 0.21043521266073195, + "grad_norm": 0.8736523995044558, + "learning_rate": 4.988092874521996e-06, + "loss": 0.4939, + "step": 851 + }, + { + "epoch": 0.21068249258160238, + "grad_norm": 0.8851801188270471, + "learning_rate": 4.988061168635232e-06, + "loss": 0.5165, + "step": 852 + }, + { + "epoch": 0.2109297725024728, + "grad_norm": 0.853684837000266, + "learning_rate": 4.9880294206929356e-06, + "loss": 0.5153, + "step": 853 + }, + { + "epoch": 0.21117705242334323, + "grad_norm": 0.911771572862549, + "learning_rate": 4.9879976306956415e-06, + "loss": 0.4859, + "step": 854 + }, + { + "epoch": 0.21142433234421365, + "grad_norm": 0.8626964859349916, + "learning_rate": 4.987965798643889e-06, + "loss": 0.4644, + "step": 855 + }, + { + "epoch": 0.21167161226508407, + "grad_norm": 0.8572993279837756, + "learning_rate": 4.987933924538215e-06, + "loss": 0.5022, + "step": 856 + }, + { + "epoch": 0.2119188921859545, + "grad_norm": 0.8963927749480562, + "learning_rate": 4.987902008379159e-06, + "loss": 0.4736, + "step": 857 + }, + { + "epoch": 0.21216617210682492, + "grad_norm": 0.8505265096635852, + "learning_rate": 4.987870050167259e-06, + "loss": 0.509, + "step": 858 + }, + { + "epoch": 0.21241345202769535, + "grad_norm": 0.8393583930696208, + "learning_rate": 4.987838049903058e-06, + "loss": 0.5031, + "step": 859 + }, + { + "epoch": 0.21266073194856577, + "grad_norm": 0.8669348205976165, + "learning_rate": 4.987806007587094e-06, + "loss": 0.4743, + "step": 860 + }, + { + "epoch": 0.2129080118694362, + "grad_norm": 0.8272724159339694, + "learning_rate": 4.9877739232199095e-06, + "loss": 0.5207, + "step": 861 + }, + { + "epoch": 0.21315529179030662, + "grad_norm": 0.8732070794723419, + "learning_rate": 4.987741796802047e-06, + "loss": 0.5009, + "step": 862 + }, + { + "epoch": 0.21340257171117705, + "grad_norm": 0.8813154447433397, + "learning_rate": 4.987709628334051e-06, + "loss": 0.5135, + "step": 863 + }, + { + "epoch": 0.21364985163204747, + "grad_norm": 0.8325507743445312, + "learning_rate": 4.987677417816462e-06, + "loss": 0.5058, + "step": 864 + }, + { + "epoch": 0.2138971315529179, + "grad_norm": 0.8747157253538062, + "learning_rate": 4.987645165249827e-06, + "loss": 0.5056, + "step": 865 + }, + { + "epoch": 0.21414441147378832, + "grad_norm": 0.90232302210331, + "learning_rate": 4.987612870634691e-06, + "loss": 0.5056, + "step": 866 + }, + { + "epoch": 0.21439169139465875, + "grad_norm": 0.84923644734889, + "learning_rate": 4.987580533971599e-06, + "loss": 0.4947, + "step": 867 + }, + { + "epoch": 0.21463897131552917, + "grad_norm": 0.8698770676663805, + "learning_rate": 4.9875481552610975e-06, + "loss": 0.5125, + "step": 868 + }, + { + "epoch": 0.2148862512363996, + "grad_norm": 0.817003058821012, + "learning_rate": 4.9875157345037345e-06, + "loss": 0.5422, + "step": 869 + }, + { + "epoch": 0.21513353115727002, + "grad_norm": 0.8417204204341288, + "learning_rate": 4.9874832717000576e-06, + "loss": 0.4953, + "step": 870 + }, + { + "epoch": 0.21538081107814044, + "grad_norm": 0.9572380936686788, + "learning_rate": 4.9874507668506155e-06, + "loss": 0.4844, + "step": 871 + }, + { + "epoch": 0.21562809099901087, + "grad_norm": 0.834643198655235, + "learning_rate": 4.987418219955958e-06, + "loss": 0.5242, + "step": 872 + }, + { + "epoch": 0.2158753709198813, + "grad_norm": 0.8559434479885895, + "learning_rate": 4.987385631016635e-06, + "loss": 0.5118, + "step": 873 + }, + { + "epoch": 0.21612265084075172, + "grad_norm": 0.8746611306575874, + "learning_rate": 4.987353000033197e-06, + "loss": 0.4964, + "step": 874 + }, + { + "epoch": 0.21636993076162217, + "grad_norm": 0.8810491800616369, + "learning_rate": 4.987320327006196e-06, + "loss": 0.4804, + "step": 875 + }, + { + "epoch": 0.2166172106824926, + "grad_norm": 0.857857539044612, + "learning_rate": 4.987287611936185e-06, + "loss": 0.5073, + "step": 876 + }, + { + "epoch": 0.21686449060336302, + "grad_norm": 0.8268996691952446, + "learning_rate": 4.987254854823715e-06, + "loss": 0.5007, + "step": 877 + }, + { + "epoch": 0.21711177052423344, + "grad_norm": 0.828244801524196, + "learning_rate": 4.987222055669342e-06, + "loss": 0.5075, + "step": 878 + }, + { + "epoch": 0.21735905044510387, + "grad_norm": 0.8662331481181559, + "learning_rate": 4.987189214473618e-06, + "loss": 0.5054, + "step": 879 + }, + { + "epoch": 0.2176063303659743, + "grad_norm": 0.840848336618183, + "learning_rate": 4.987156331237099e-06, + "loss": 0.4954, + "step": 880 + }, + { + "epoch": 0.21785361028684472, + "grad_norm": 0.8368772772807719, + "learning_rate": 4.987123405960343e-06, + "loss": 0.5116, + "step": 881 + }, + { + "epoch": 0.21810089020771514, + "grad_norm": 0.8593385012576953, + "learning_rate": 4.987090438643904e-06, + "loss": 0.5273, + "step": 882 + }, + { + "epoch": 0.21834817012858557, + "grad_norm": 0.8364107176881544, + "learning_rate": 4.98705742928834e-06, + "loss": 0.5014, + "step": 883 + }, + { + "epoch": 0.218595450049456, + "grad_norm": 0.8227427355602717, + "learning_rate": 4.987024377894208e-06, + "loss": 0.4951, + "step": 884 + }, + { + "epoch": 0.21884272997032642, + "grad_norm": 0.8150154334336953, + "learning_rate": 4.986991284462068e-06, + "loss": 0.49, + "step": 885 + }, + { + "epoch": 0.21909000989119684, + "grad_norm": 0.8449279651032495, + "learning_rate": 4.98695814899248e-06, + "loss": 0.5147, + "step": 886 + }, + { + "epoch": 0.21933728981206727, + "grad_norm": 0.8199419337132845, + "learning_rate": 4.986924971486001e-06, + "loss": 0.5203, + "step": 887 + }, + { + "epoch": 0.2195845697329377, + "grad_norm": 0.8211194462863136, + "learning_rate": 4.986891751943196e-06, + "loss": 0.5027, + "step": 888 + }, + { + "epoch": 0.21983184965380811, + "grad_norm": 0.8510718071011489, + "learning_rate": 4.986858490364624e-06, + "loss": 0.4842, + "step": 889 + }, + { + "epoch": 0.22007912957467854, + "grad_norm": 0.8122596001789849, + "learning_rate": 4.986825186750846e-06, + "loss": 0.4882, + "step": 890 + }, + { + "epoch": 0.22032640949554896, + "grad_norm": 0.8227992726845009, + "learning_rate": 4.986791841102427e-06, + "loss": 0.4894, + "step": 891 + }, + { + "epoch": 0.2205736894164194, + "grad_norm": 0.8330978830853362, + "learning_rate": 4.986758453419931e-06, + "loss": 0.5047, + "step": 892 + }, + { + "epoch": 0.2208209693372898, + "grad_norm": 0.8289885814322678, + "learning_rate": 4.986725023703921e-06, + "loss": 0.5211, + "step": 893 + }, + { + "epoch": 0.22106824925816024, + "grad_norm": 0.7967487040612072, + "learning_rate": 4.986691551954962e-06, + "loss": 0.4961, + "step": 894 + }, + { + "epoch": 0.22131552917903066, + "grad_norm": 0.8663375538553278, + "learning_rate": 4.986658038173621e-06, + "loss": 0.51, + "step": 895 + }, + { + "epoch": 0.2215628090999011, + "grad_norm": 0.7906515926897258, + "learning_rate": 4.986624482360464e-06, + "loss": 0.5029, + "step": 896 + }, + { + "epoch": 0.2218100890207715, + "grad_norm": 0.7744820254840943, + "learning_rate": 4.986590884516057e-06, + "loss": 0.5023, + "step": 897 + }, + { + "epoch": 0.22205736894164194, + "grad_norm": 0.883399410152621, + "learning_rate": 4.98655724464097e-06, + "loss": 0.4854, + "step": 898 + }, + { + "epoch": 0.22230464886251236, + "grad_norm": 0.8630333018932411, + "learning_rate": 4.98652356273577e-06, + "loss": 0.4881, + "step": 899 + }, + { + "epoch": 0.22255192878338279, + "grad_norm": 0.8277706025700802, + "learning_rate": 4.986489838801027e-06, + "loss": 0.5004, + "step": 900 + }, + { + "epoch": 0.2227992087042532, + "grad_norm": 0.8204017170552954, + "learning_rate": 4.98645607283731e-06, + "loss": 0.4963, + "step": 901 + }, + { + "epoch": 0.22304648862512363, + "grad_norm": 0.8265861428479575, + "learning_rate": 4.986422264845191e-06, + "loss": 0.4839, + "step": 902 + }, + { + "epoch": 0.22329376854599406, + "grad_norm": 0.8276540389714453, + "learning_rate": 4.986388414825242e-06, + "loss": 0.4946, + "step": 903 + }, + { + "epoch": 0.22354104846686448, + "grad_norm": 0.8699787108810072, + "learning_rate": 4.986354522778033e-06, + "loss": 0.5113, + "step": 904 + }, + { + "epoch": 0.2237883283877349, + "grad_norm": 0.7617126884375469, + "learning_rate": 4.986320588704139e-06, + "loss": 0.5106, + "step": 905 + }, + { + "epoch": 0.22403560830860533, + "grad_norm": 0.8589768854675831, + "learning_rate": 4.986286612604132e-06, + "loss": 0.4754, + "step": 906 + }, + { + "epoch": 0.22428288822947576, + "grad_norm": 0.8161695889367072, + "learning_rate": 4.986252594478588e-06, + "loss": 0.4865, + "step": 907 + }, + { + "epoch": 0.22453016815034618, + "grad_norm": 0.8425510497631642, + "learning_rate": 4.98621853432808e-06, + "loss": 0.4961, + "step": 908 + }, + { + "epoch": 0.2247774480712166, + "grad_norm": 0.8942256571041988, + "learning_rate": 4.986184432153185e-06, + "loss": 0.5114, + "step": 909 + }, + { + "epoch": 0.22502472799208703, + "grad_norm": 0.8569701182433922, + "learning_rate": 4.986150287954479e-06, + "loss": 0.4747, + "step": 910 + }, + { + "epoch": 0.22527200791295746, + "grad_norm": 0.8342740450002151, + "learning_rate": 4.986116101732539e-06, + "loss": 0.4965, + "step": 911 + }, + { + "epoch": 0.22551928783382788, + "grad_norm": 0.906710700659202, + "learning_rate": 4.986081873487944e-06, + "loss": 0.5079, + "step": 912 + }, + { + "epoch": 0.2257665677546983, + "grad_norm": 0.8145221155521087, + "learning_rate": 4.98604760322127e-06, + "loss": 0.4913, + "step": 913 + }, + { + "epoch": 0.22601384767556876, + "grad_norm": 0.8166511496949852, + "learning_rate": 4.986013290933099e-06, + "loss": 0.472, + "step": 914 + }, + { + "epoch": 0.22626112759643918, + "grad_norm": 0.914692426611638, + "learning_rate": 4.98597893662401e-06, + "loss": 0.4695, + "step": 915 + }, + { + "epoch": 0.2265084075173096, + "grad_norm": 0.7809607473935676, + "learning_rate": 4.985944540294584e-06, + "loss": 0.5388, + "step": 916 + }, + { + "epoch": 0.22675568743818003, + "grad_norm": 0.8231717409594672, + "learning_rate": 4.9859101019454015e-06, + "loss": 0.5249, + "step": 917 + }, + { + "epoch": 0.22700296735905046, + "grad_norm": 0.8361911011029732, + "learning_rate": 4.985875621577045e-06, + "loss": 0.5121, + "step": 918 + }, + { + "epoch": 0.22725024727992088, + "grad_norm": 0.8048366668863958, + "learning_rate": 4.985841099190098e-06, + "loss": 0.4892, + "step": 919 + }, + { + "epoch": 0.2274975272007913, + "grad_norm": 0.8660579797696568, + "learning_rate": 4.985806534785143e-06, + "loss": 0.4799, + "step": 920 + }, + { + "epoch": 0.22774480712166173, + "grad_norm": 0.8318167298034147, + "learning_rate": 4.9857719283627635e-06, + "loss": 0.498, + "step": 921 + }, + { + "epoch": 0.22799208704253215, + "grad_norm": 0.8605410511892199, + "learning_rate": 4.985737279923547e-06, + "loss": 0.5138, + "step": 922 + }, + { + "epoch": 0.22823936696340258, + "grad_norm": 0.8703326428337433, + "learning_rate": 4.9857025894680775e-06, + "loss": 0.4784, + "step": 923 + }, + { + "epoch": 0.228486646884273, + "grad_norm": 0.8370959017594702, + "learning_rate": 4.9856678569969415e-06, + "loss": 0.488, + "step": 924 + }, + { + "epoch": 0.22873392680514343, + "grad_norm": 0.8693400164771248, + "learning_rate": 4.985633082510727e-06, + "loss": 0.5124, + "step": 925 + }, + { + "epoch": 0.22898120672601385, + "grad_norm": 0.8636007951346818, + "learning_rate": 4.985598266010021e-06, + "loss": 0.5016, + "step": 926 + }, + { + "epoch": 0.22922848664688428, + "grad_norm": 0.8832259920894722, + "learning_rate": 4.985563407495411e-06, + "loss": 0.4954, + "step": 927 + }, + { + "epoch": 0.2294757665677547, + "grad_norm": 0.8191758445943106, + "learning_rate": 4.985528506967488e-06, + "loss": 0.507, + "step": 928 + }, + { + "epoch": 0.22972304648862513, + "grad_norm": 0.8789118908623763, + "learning_rate": 4.985493564426841e-06, + "loss": 0.4885, + "step": 929 + }, + { + "epoch": 0.22997032640949555, + "grad_norm": 0.8924491857440008, + "learning_rate": 4.985458579874061e-06, + "loss": 0.5033, + "step": 930 + }, + { + "epoch": 0.23021760633036598, + "grad_norm": 0.8383332268067973, + "learning_rate": 4.9854235533097396e-06, + "loss": 0.5156, + "step": 931 + }, + { + "epoch": 0.2304648862512364, + "grad_norm": 0.8767336119514518, + "learning_rate": 4.985388484734467e-06, + "loss": 0.4849, + "step": 932 + }, + { + "epoch": 0.23071216617210683, + "grad_norm": 0.8405975137037067, + "learning_rate": 4.985353374148838e-06, + "loss": 0.506, + "step": 933 + }, + { + "epoch": 0.23095944609297725, + "grad_norm": 0.8170821427085228, + "learning_rate": 4.9853182215534465e-06, + "loss": 0.4962, + "step": 934 + }, + { + "epoch": 0.23120672601384767, + "grad_norm": 0.8314334184214403, + "learning_rate": 4.985283026948885e-06, + "loss": 0.483, + "step": 935 + }, + { + "epoch": 0.2314540059347181, + "grad_norm": 0.9084995950646598, + "learning_rate": 4.985247790335748e-06, + "loss": 0.4794, + "step": 936 + }, + { + "epoch": 0.23170128585558852, + "grad_norm": 0.8483507056354166, + "learning_rate": 4.9852125117146335e-06, + "loss": 0.5065, + "step": 937 + }, + { + "epoch": 0.23194856577645895, + "grad_norm": 0.8402483495826693, + "learning_rate": 4.985177191086136e-06, + "loss": 0.4915, + "step": 938 + }, + { + "epoch": 0.23219584569732937, + "grad_norm": 0.823169932427246, + "learning_rate": 4.985141828450852e-06, + "loss": 0.4918, + "step": 939 + }, + { + "epoch": 0.2324431256181998, + "grad_norm": 0.8243831987168234, + "learning_rate": 4.985106423809381e-06, + "loss": 0.4755, + "step": 940 + }, + { + "epoch": 0.23269040553907022, + "grad_norm": 0.8252991162421606, + "learning_rate": 4.98507097716232e-06, + "loss": 0.5114, + "step": 941 + }, + { + "epoch": 0.23293768545994065, + "grad_norm": 0.836106206941783, + "learning_rate": 4.98503548851027e-06, + "loss": 0.483, + "step": 942 + }, + { + "epoch": 0.23318496538081107, + "grad_norm": 0.8506488758080649, + "learning_rate": 4.984999957853829e-06, + "loss": 0.4987, + "step": 943 + }, + { + "epoch": 0.2334322453016815, + "grad_norm": 0.8289096749494381, + "learning_rate": 4.984964385193598e-06, + "loss": 0.5003, + "step": 944 + }, + { + "epoch": 0.23367952522255192, + "grad_norm": 0.8768460912190904, + "learning_rate": 4.9849287705301786e-06, + "loss": 0.475, + "step": 945 + }, + { + "epoch": 0.23392680514342234, + "grad_norm": 0.8140839084687289, + "learning_rate": 4.984893113864173e-06, + "loss": 0.5001, + "step": 946 + }, + { + "epoch": 0.23417408506429277, + "grad_norm": 0.8527563886559387, + "learning_rate": 4.9848574151961835e-06, + "loss": 0.5011, + "step": 947 + }, + { + "epoch": 0.2344213649851632, + "grad_norm": 0.8497649216961695, + "learning_rate": 4.984821674526813e-06, + "loss": 0.5048, + "step": 948 + }, + { + "epoch": 0.23466864490603362, + "grad_norm": 0.8118793995529595, + "learning_rate": 4.984785891856667e-06, + "loss": 0.4742, + "step": 949 + }, + { + "epoch": 0.23491592482690404, + "grad_norm": 0.8791594821173937, + "learning_rate": 4.984750067186349e-06, + "loss": 0.4938, + "step": 950 + }, + { + "epoch": 0.23516320474777447, + "grad_norm": 0.8888145994376402, + "learning_rate": 4.984714200516465e-06, + "loss": 0.5096, + "step": 951 + }, + { + "epoch": 0.2354104846686449, + "grad_norm": 0.8348077340788664, + "learning_rate": 4.9846782918476225e-06, + "loss": 0.4902, + "step": 952 + }, + { + "epoch": 0.23565776458951534, + "grad_norm": 0.8778483850793724, + "learning_rate": 4.9846423411804255e-06, + "loss": 0.4926, + "step": 953 + }, + { + "epoch": 0.23590504451038577, + "grad_norm": 0.8187933596056294, + "learning_rate": 4.984606348515485e-06, + "loss": 0.4858, + "step": 954 + }, + { + "epoch": 0.2361523244312562, + "grad_norm": 0.7797555946495061, + "learning_rate": 4.984570313853408e-06, + "loss": 0.4931, + "step": 955 + }, + { + "epoch": 0.23639960435212662, + "grad_norm": 0.8552422210148904, + "learning_rate": 4.984534237194802e-06, + "loss": 0.5172, + "step": 956 + }, + { + "epoch": 0.23664688427299704, + "grad_norm": 0.8013218564963912, + "learning_rate": 4.984498118540279e-06, + "loss": 0.4941, + "step": 957 + }, + { + "epoch": 0.23689416419386747, + "grad_norm": 0.8354238632072138, + "learning_rate": 4.984461957890449e-06, + "loss": 0.4857, + "step": 958 + }, + { + "epoch": 0.2371414441147379, + "grad_norm": 0.8201898617265962, + "learning_rate": 4.984425755245923e-06, + "loss": 0.4968, + "step": 959 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 0.8470092115227233, + "learning_rate": 4.984389510607313e-06, + "loss": 0.4862, + "step": 960 + }, + { + "epoch": 0.23763600395647874, + "grad_norm": 0.845953728104242, + "learning_rate": 4.984353223975231e-06, + "loss": 0.486, + "step": 961 + }, + { + "epoch": 0.23788328387734917, + "grad_norm": 0.8270735966336179, + "learning_rate": 4.98431689535029e-06, + "loss": 0.5043, + "step": 962 + }, + { + "epoch": 0.2381305637982196, + "grad_norm": 0.8152648813245965, + "learning_rate": 4.984280524733107e-06, + "loss": 0.4591, + "step": 963 + }, + { + "epoch": 0.23837784371909002, + "grad_norm": 0.8230530257009434, + "learning_rate": 4.984244112124293e-06, + "loss": 0.4709, + "step": 964 + }, + { + "epoch": 0.23862512363996044, + "grad_norm": 0.8769324474154776, + "learning_rate": 4.9842076575244665e-06, + "loss": 0.4615, + "step": 965 + }, + { + "epoch": 0.23887240356083086, + "grad_norm": 0.8170880234645825, + "learning_rate": 4.984171160934243e-06, + "loss": 0.4801, + "step": 966 + }, + { + "epoch": 0.2391196834817013, + "grad_norm": 0.8473619382925522, + "learning_rate": 4.9841346223542375e-06, + "loss": 0.4687, + "step": 967 + }, + { + "epoch": 0.23936696340257171, + "grad_norm": 0.8226901640257309, + "learning_rate": 4.984098041785069e-06, + "loss": 0.4927, + "step": 968 + }, + { + "epoch": 0.23961424332344214, + "grad_norm": 0.9073747433995021, + "learning_rate": 4.9840614192273565e-06, + "loss": 0.4763, + "step": 969 + }, + { + "epoch": 0.23986152324431256, + "grad_norm": 0.8676535240669757, + "learning_rate": 4.984024754681717e-06, + "loss": 0.4699, + "step": 970 + }, + { + "epoch": 0.240108803165183, + "grad_norm": 0.8592342568796588, + "learning_rate": 4.983988048148773e-06, + "loss": 0.4589, + "step": 971 + }, + { + "epoch": 0.2403560830860534, + "grad_norm": 0.8106560924699778, + "learning_rate": 4.983951299629142e-06, + "loss": 0.4864, + "step": 972 + }, + { + "epoch": 0.24060336300692384, + "grad_norm": 0.8273722868885549, + "learning_rate": 4.983914509123447e-06, + "loss": 0.4741, + "step": 973 + }, + { + "epoch": 0.24085064292779426, + "grad_norm": 0.7817588752032723, + "learning_rate": 4.983877676632311e-06, + "loss": 0.4825, + "step": 974 + }, + { + "epoch": 0.2410979228486647, + "grad_norm": 0.7946571630913781, + "learning_rate": 4.983840802156353e-06, + "loss": 0.5078, + "step": 975 + }, + { + "epoch": 0.2413452027695351, + "grad_norm": 0.8172722258882839, + "learning_rate": 4.983803885696199e-06, + "loss": 0.4906, + "step": 976 + }, + { + "epoch": 0.24159248269040554, + "grad_norm": 0.858810786727647, + "learning_rate": 4.983766927252472e-06, + "loss": 0.473, + "step": 977 + }, + { + "epoch": 0.24183976261127596, + "grad_norm": 0.8672721626002375, + "learning_rate": 4.983729926825798e-06, + "loss": 0.4743, + "step": 978 + }, + { + "epoch": 0.24208704253214638, + "grad_norm": 0.9329297411602999, + "learning_rate": 4.983692884416801e-06, + "loss": 0.472, + "step": 979 + }, + { + "epoch": 0.2423343224530168, + "grad_norm": 0.9015184792086365, + "learning_rate": 4.983655800026108e-06, + "loss": 0.5101, + "step": 980 + }, + { + "epoch": 0.24258160237388723, + "grad_norm": 0.8477549099000955, + "learning_rate": 4.983618673654344e-06, + "loss": 0.5164, + "step": 981 + }, + { + "epoch": 0.24282888229475766, + "grad_norm": 0.8737374865512695, + "learning_rate": 4.983581505302139e-06, + "loss": 0.5048, + "step": 982 + }, + { + "epoch": 0.24307616221562808, + "grad_norm": 0.8492727947637201, + "learning_rate": 4.983544294970121e-06, + "loss": 0.4872, + "step": 983 + }, + { + "epoch": 0.2433234421364985, + "grad_norm": 0.8312941393834273, + "learning_rate": 4.983507042658917e-06, + "loss": 0.4921, + "step": 984 + }, + { + "epoch": 0.24357072205736893, + "grad_norm": 0.8457934782705832, + "learning_rate": 4.983469748369159e-06, + "loss": 0.5009, + "step": 985 + }, + { + "epoch": 0.24381800197823936, + "grad_norm": 0.889062562190951, + "learning_rate": 4.983432412101475e-06, + "loss": 0.5046, + "step": 986 + }, + { + "epoch": 0.24406528189910978, + "grad_norm": 0.9200936675121365, + "learning_rate": 4.983395033856498e-06, + "loss": 0.5063, + "step": 987 + }, + { + "epoch": 0.2443125618199802, + "grad_norm": 0.865579100161537, + "learning_rate": 4.9833576136348595e-06, + "loss": 0.4931, + "step": 988 + }, + { + "epoch": 0.24455984174085063, + "grad_norm": 0.8758672885998708, + "learning_rate": 4.983320151437191e-06, + "loss": 0.481, + "step": 989 + }, + { + "epoch": 0.24480712166172106, + "grad_norm": 0.8658318444737918, + "learning_rate": 4.983282647264126e-06, + "loss": 0.4712, + "step": 990 + }, + { + "epoch": 0.24505440158259148, + "grad_norm": 0.8382351361641673, + "learning_rate": 4.983245101116299e-06, + "loss": 0.4911, + "step": 991 + }, + { + "epoch": 0.24530168150346193, + "grad_norm": 0.8463983674050847, + "learning_rate": 4.983207512994345e-06, + "loss": 0.5312, + "step": 992 + }, + { + "epoch": 0.24554896142433236, + "grad_norm": 0.8829203582411207, + "learning_rate": 4.983169882898898e-06, + "loss": 0.459, + "step": 993 + }, + { + "epoch": 0.24579624134520278, + "grad_norm": 0.8208662802668374, + "learning_rate": 4.983132210830596e-06, + "loss": 0.4835, + "step": 994 + }, + { + "epoch": 0.2460435212660732, + "grad_norm": 0.868902743454076, + "learning_rate": 4.983094496790074e-06, + "loss": 0.4895, + "step": 995 + }, + { + "epoch": 0.24629080118694363, + "grad_norm": 0.8525195601900595, + "learning_rate": 4.98305674077797e-06, + "loss": 0.5008, + "step": 996 + }, + { + "epoch": 0.24653808110781406, + "grad_norm": 0.8486858338119296, + "learning_rate": 4.9830189427949225e-06, + "loss": 0.4767, + "step": 997 + }, + { + "epoch": 0.24678536102868448, + "grad_norm": 0.8399588457998348, + "learning_rate": 4.982981102841569e-06, + "loss": 0.4807, + "step": 998 + }, + { + "epoch": 0.2470326409495549, + "grad_norm": 0.8400317988123928, + "learning_rate": 4.982943220918552e-06, + "loss": 0.4472, + "step": 999 + }, + { + "epoch": 0.24727992087042533, + "grad_norm": 0.7933513082453183, + "learning_rate": 4.982905297026509e-06, + "loss": 0.4897, + "step": 1000 + }, + { + "epoch": 0.24752720079129575, + "grad_norm": 0.8533021805361142, + "learning_rate": 4.982867331166083e-06, + "loss": 0.4826, + "step": 1001 + }, + { + "epoch": 0.24777448071216618, + "grad_norm": 0.871571750396739, + "learning_rate": 4.982829323337914e-06, + "loss": 0.4822, + "step": 1002 + }, + { + "epoch": 0.2480217606330366, + "grad_norm": 0.8561307423314862, + "learning_rate": 4.982791273542646e-06, + "loss": 0.4928, + "step": 1003 + }, + { + "epoch": 0.24826904055390703, + "grad_norm": 0.8162243679119858, + "learning_rate": 4.9827531817809215e-06, + "loss": 0.4918, + "step": 1004 + }, + { + "epoch": 0.24851632047477745, + "grad_norm": 0.8091135061963646, + "learning_rate": 4.9827150480533835e-06, + "loss": 0.5059, + "step": 1005 + }, + { + "epoch": 0.24876360039564788, + "grad_norm": 0.839406106272644, + "learning_rate": 4.982676872360677e-06, + "loss": 0.5087, + "step": 1006 + }, + { + "epoch": 0.2490108803165183, + "grad_norm": 0.8313205468854626, + "learning_rate": 4.982638654703449e-06, + "loss": 0.4686, + "step": 1007 + }, + { + "epoch": 0.24925816023738873, + "grad_norm": 0.8271088700543145, + "learning_rate": 4.9826003950823445e-06, + "loss": 0.4938, + "step": 1008 + }, + { + "epoch": 0.24950544015825915, + "grad_norm": 0.8405321735427581, + "learning_rate": 4.982562093498009e-06, + "loss": 0.4876, + "step": 1009 + }, + { + "epoch": 0.24975272007912958, + "grad_norm": 0.8665303043556788, + "learning_rate": 4.982523749951091e-06, + "loss": 0.4805, + "step": 1010 + }, + { + "epoch": 0.25, + "grad_norm": 0.8787115918536075, + "learning_rate": 4.982485364442238e-06, + "loss": 0.487, + "step": 1011 + }, + { + "epoch": 0.2502472799208704, + "grad_norm": 0.8409882874323071, + "learning_rate": 4.982446936972099e-06, + "loss": 0.4678, + "step": 1012 + }, + { + "epoch": 0.25049455984174085, + "grad_norm": 0.8561426691383394, + "learning_rate": 4.982408467541325e-06, + "loss": 0.4897, + "step": 1013 + }, + { + "epoch": 0.2507418397626113, + "grad_norm": 0.8572923571758608, + "learning_rate": 4.982369956150563e-06, + "loss": 0.4852, + "step": 1014 + }, + { + "epoch": 0.2509891196834817, + "grad_norm": 0.8981118858205535, + "learning_rate": 4.982331402800468e-06, + "loss": 0.4807, + "step": 1015 + }, + { + "epoch": 0.2512363996043521, + "grad_norm": 0.9095037169315664, + "learning_rate": 4.982292807491688e-06, + "loss": 0.5035, + "step": 1016 + }, + { + "epoch": 0.25148367952522255, + "grad_norm": 0.8421819705790509, + "learning_rate": 4.982254170224878e-06, + "loss": 0.4421, + "step": 1017 + }, + { + "epoch": 0.251730959446093, + "grad_norm": 0.8399733420526356, + "learning_rate": 4.982215491000689e-06, + "loss": 0.4987, + "step": 1018 + }, + { + "epoch": 0.2519782393669634, + "grad_norm": 0.8546805880965436, + "learning_rate": 4.982176769819777e-06, + "loss": 0.494, + "step": 1019 + }, + { + "epoch": 0.2522255192878338, + "grad_norm": 0.891980552444231, + "learning_rate": 4.982138006682795e-06, + "loss": 0.4995, + "step": 1020 + }, + { + "epoch": 0.25247279920870425, + "grad_norm": 0.832920847499595, + "learning_rate": 4.982099201590399e-06, + "loss": 0.4675, + "step": 1021 + }, + { + "epoch": 0.25272007912957467, + "grad_norm": 0.8018275844329361, + "learning_rate": 4.982060354543244e-06, + "loss": 0.5018, + "step": 1022 + }, + { + "epoch": 0.2529673590504451, + "grad_norm": 0.8518048762893595, + "learning_rate": 4.982021465541988e-06, + "loss": 0.5046, + "step": 1023 + }, + { + "epoch": 0.2532146389713155, + "grad_norm": 0.830301537925145, + "learning_rate": 4.9819825345872855e-06, + "loss": 0.4926, + "step": 1024 + }, + { + "epoch": 0.25346191889218594, + "grad_norm": 0.8314313821169597, + "learning_rate": 4.981943561679799e-06, + "loss": 0.4857, + "step": 1025 + }, + { + "epoch": 0.25370919881305637, + "grad_norm": 0.8085067071753731, + "learning_rate": 4.981904546820183e-06, + "loss": 0.4997, + "step": 1026 + }, + { + "epoch": 0.2539564787339268, + "grad_norm": 0.8312408342022083, + "learning_rate": 4.981865490009099e-06, + "loss": 0.4873, + "step": 1027 + }, + { + "epoch": 0.2542037586547972, + "grad_norm": 0.8236446385494339, + "learning_rate": 4.9818263912472074e-06, + "loss": 0.4854, + "step": 1028 + }, + { + "epoch": 0.25445103857566764, + "grad_norm": 0.8279083883523636, + "learning_rate": 4.9817872505351686e-06, + "loss": 0.4848, + "step": 1029 + }, + { + "epoch": 0.25469831849653807, + "grad_norm": 0.8690599083185739, + "learning_rate": 4.9817480678736426e-06, + "loss": 0.4865, + "step": 1030 + }, + { + "epoch": 0.2549455984174085, + "grad_norm": 0.8421918735243161, + "learning_rate": 4.981708843263295e-06, + "loss": 0.4685, + "step": 1031 + }, + { + "epoch": 0.2551928783382789, + "grad_norm": 0.8407079219400131, + "learning_rate": 4.981669576704787e-06, + "loss": 0.4956, + "step": 1032 + }, + { + "epoch": 0.25544015825914934, + "grad_norm": 0.8773337094609159, + "learning_rate": 4.9816302681987825e-06, + "loss": 0.4724, + "step": 1033 + }, + { + "epoch": 0.25568743818001977, + "grad_norm": 0.868844911839338, + "learning_rate": 4.981590917745945e-06, + "loss": 0.4981, + "step": 1034 + }, + { + "epoch": 0.2559347181008902, + "grad_norm": 0.8666128355880677, + "learning_rate": 4.981551525346941e-06, + "loss": 0.5125, + "step": 1035 + }, + { + "epoch": 0.2561819980217606, + "grad_norm": 0.9213503864137835, + "learning_rate": 4.9815120910024365e-06, + "loss": 0.516, + "step": 1036 + }, + { + "epoch": 0.25642927794263104, + "grad_norm": 0.8307356659494541, + "learning_rate": 4.981472614713096e-06, + "loss": 0.5132, + "step": 1037 + }, + { + "epoch": 0.25667655786350146, + "grad_norm": 0.8297935826184679, + "learning_rate": 4.981433096479588e-06, + "loss": 0.4802, + "step": 1038 + }, + { + "epoch": 0.2569238377843719, + "grad_norm": 0.8327277791219414, + "learning_rate": 4.981393536302582e-06, + "loss": 0.4928, + "step": 1039 + }, + { + "epoch": 0.2571711177052423, + "grad_norm": 0.8707053568449094, + "learning_rate": 4.981353934182745e-06, + "loss": 0.4899, + "step": 1040 + }, + { + "epoch": 0.25741839762611274, + "grad_norm": 0.8652524290884858, + "learning_rate": 4.981314290120747e-06, + "loss": 0.4886, + "step": 1041 + }, + { + "epoch": 0.25766567754698316, + "grad_norm": 0.8466892303798033, + "learning_rate": 4.981274604117257e-06, + "loss": 0.5103, + "step": 1042 + }, + { + "epoch": 0.2579129574678536, + "grad_norm": 0.9783250538294788, + "learning_rate": 4.981234876172947e-06, + "loss": 0.4887, + "step": 1043 + }, + { + "epoch": 0.258160237388724, + "grad_norm": 0.9318808540890211, + "learning_rate": 4.981195106288488e-06, + "loss": 0.5011, + "step": 1044 + }, + { + "epoch": 0.25840751730959444, + "grad_norm": 0.8551466560368262, + "learning_rate": 4.981155294464552e-06, + "loss": 0.5029, + "step": 1045 + }, + { + "epoch": 0.25865479723046486, + "grad_norm": 0.8793301608280027, + "learning_rate": 4.981115440701814e-06, + "loss": 0.4742, + "step": 1046 + }, + { + "epoch": 0.2589020771513353, + "grad_norm": 0.8969182144318821, + "learning_rate": 4.981075545000944e-06, + "loss": 0.5168, + "step": 1047 + }, + { + "epoch": 0.2591493570722057, + "grad_norm": 0.8662813622902052, + "learning_rate": 4.981035607362619e-06, + "loss": 0.4981, + "step": 1048 + }, + { + "epoch": 0.25939663699307614, + "grad_norm": 0.8852575900920927, + "learning_rate": 4.980995627787513e-06, + "loss": 0.4845, + "step": 1049 + }, + { + "epoch": 0.2596439169139466, + "grad_norm": 0.8434609325170388, + "learning_rate": 4.980955606276303e-06, + "loss": 0.4663, + "step": 1050 + }, + { + "epoch": 0.25989119683481704, + "grad_norm": 0.8403758855291492, + "learning_rate": 4.980915542829664e-06, + "loss": 0.4831, + "step": 1051 + }, + { + "epoch": 0.26013847675568746, + "grad_norm": 0.8877325981859041, + "learning_rate": 4.980875437448274e-06, + "loss": 0.4785, + "step": 1052 + }, + { + "epoch": 0.2603857566765579, + "grad_norm": 0.8052008309585926, + "learning_rate": 4.98083529013281e-06, + "loss": 0.5048, + "step": 1053 + }, + { + "epoch": 0.2606330365974283, + "grad_norm": 0.8478864104168425, + "learning_rate": 4.980795100883953e-06, + "loss": 0.4704, + "step": 1054 + }, + { + "epoch": 0.26088031651829874, + "grad_norm": 0.8794267459522233, + "learning_rate": 4.9807548697023795e-06, + "loss": 0.4629, + "step": 1055 + }, + { + "epoch": 0.26112759643916916, + "grad_norm": 0.8058144806983949, + "learning_rate": 4.9807145965887705e-06, + "loss": 0.4852, + "step": 1056 + }, + { + "epoch": 0.2613748763600396, + "grad_norm": 0.8843411492674896, + "learning_rate": 4.980674281543807e-06, + "loss": 0.4644, + "step": 1057 + }, + { + "epoch": 0.26162215628091, + "grad_norm": 0.8493178218035321, + "learning_rate": 4.98063392456817e-06, + "loss": 0.4879, + "step": 1058 + }, + { + "epoch": 0.26186943620178044, + "grad_norm": 0.8356662278593041, + "learning_rate": 4.980593525662544e-06, + "loss": 0.4703, + "step": 1059 + }, + { + "epoch": 0.26211671612265086, + "grad_norm": 0.8475187900521053, + "learning_rate": 4.980553084827607e-06, + "loss": 0.4914, + "step": 1060 + }, + { + "epoch": 0.2623639960435213, + "grad_norm": 0.8450529868997396, + "learning_rate": 4.980512602064047e-06, + "loss": 0.4844, + "step": 1061 + }, + { + "epoch": 0.2626112759643917, + "grad_norm": 0.8079903048049786, + "learning_rate": 4.9804720773725465e-06, + "loss": 0.4752, + "step": 1062 + }, + { + "epoch": 0.26285855588526214, + "grad_norm": 0.8511900505503611, + "learning_rate": 4.980431510753791e-06, + "loss": 0.4774, + "step": 1063 + }, + { + "epoch": 0.26310583580613256, + "grad_norm": 0.8332435125227167, + "learning_rate": 4.980390902208465e-06, + "loss": 0.4751, + "step": 1064 + }, + { + "epoch": 0.263353115727003, + "grad_norm": 0.8536824402482716, + "learning_rate": 4.980350251737256e-06, + "loss": 0.5205, + "step": 1065 + }, + { + "epoch": 0.2636003956478734, + "grad_norm": 0.8553553797802288, + "learning_rate": 4.980309559340851e-06, + "loss": 0.4665, + "step": 1066 + }, + { + "epoch": 0.26384767556874383, + "grad_norm": 0.8286430821029018, + "learning_rate": 4.980268825019939e-06, + "loss": 0.4861, + "step": 1067 + }, + { + "epoch": 0.26409495548961426, + "grad_norm": 0.8349366249443254, + "learning_rate": 4.980228048775205e-06, + "loss": 0.4921, + "step": 1068 + }, + { + "epoch": 0.2643422354104847, + "grad_norm": 0.8327046354641331, + "learning_rate": 4.980187230607341e-06, + "loss": 0.4672, + "step": 1069 + }, + { + "epoch": 0.2645895153313551, + "grad_norm": 0.8665614695088318, + "learning_rate": 4.980146370517037e-06, + "loss": 0.4803, + "step": 1070 + }, + { + "epoch": 0.26483679525222553, + "grad_norm": 0.8410944228997952, + "learning_rate": 4.980105468504983e-06, + "loss": 0.4753, + "step": 1071 + }, + { + "epoch": 0.26508407517309596, + "grad_norm": 0.8731177178369249, + "learning_rate": 4.9800645245718705e-06, + "loss": 0.5105, + "step": 1072 + }, + { + "epoch": 0.2653313550939664, + "grad_norm": 0.87074377533807, + "learning_rate": 4.980023538718392e-06, + "loss": 0.4868, + "step": 1073 + }, + { + "epoch": 0.2655786350148368, + "grad_norm": 0.8497553336959501, + "learning_rate": 4.979982510945239e-06, + "loss": 0.46, + "step": 1074 + }, + { + "epoch": 0.26582591493570723, + "grad_norm": 0.8482593664870046, + "learning_rate": 4.9799414412531056e-06, + "loss": 0.5059, + "step": 1075 + }, + { + "epoch": 0.26607319485657766, + "grad_norm": 0.8734764426183708, + "learning_rate": 4.9799003296426864e-06, + "loss": 0.457, + "step": 1076 + }, + { + "epoch": 0.2663204747774481, + "grad_norm": 0.8762054435854876, + "learning_rate": 4.979859176114676e-06, + "loss": 0.4828, + "step": 1077 + }, + { + "epoch": 0.2665677546983185, + "grad_norm": 0.8386188836793864, + "learning_rate": 4.979817980669771e-06, + "loss": 0.4531, + "step": 1078 + }, + { + "epoch": 0.26681503461918893, + "grad_norm": 0.8303211743323535, + "learning_rate": 4.979776743308667e-06, + "loss": 0.4786, + "step": 1079 + }, + { + "epoch": 0.26706231454005935, + "grad_norm": 0.8763603274871483, + "learning_rate": 4.979735464032059e-06, + "loss": 0.4729, + "step": 1080 + }, + { + "epoch": 0.2673095944609298, + "grad_norm": 0.8482768577451538, + "learning_rate": 4.979694142840647e-06, + "loss": 0.4685, + "step": 1081 + }, + { + "epoch": 0.2675568743818002, + "grad_norm": 0.8181406667783483, + "learning_rate": 4.9796527797351304e-06, + "loss": 0.4883, + "step": 1082 + }, + { + "epoch": 0.2678041543026706, + "grad_norm": 0.820627311447771, + "learning_rate": 4.979611374716207e-06, + "loss": 0.4595, + "step": 1083 + }, + { + "epoch": 0.26805143422354105, + "grad_norm": 0.8494425109762445, + "learning_rate": 4.979569927784576e-06, + "loss": 0.5001, + "step": 1084 + }, + { + "epoch": 0.2682987141444115, + "grad_norm": 0.8347143974885791, + "learning_rate": 4.979528438940938e-06, + "loss": 0.4854, + "step": 1085 + }, + { + "epoch": 0.2685459940652819, + "grad_norm": 0.8293185823179237, + "learning_rate": 4.979486908185996e-06, + "loss": 0.491, + "step": 1086 + }, + { + "epoch": 0.2687932739861523, + "grad_norm": 0.8624687861606527, + "learning_rate": 4.97944533552045e-06, + "loss": 0.4668, + "step": 1087 + }, + { + "epoch": 0.26904055390702275, + "grad_norm": 0.8647425254764696, + "learning_rate": 4.979403720945004e-06, + "loss": 0.4785, + "step": 1088 + }, + { + "epoch": 0.2692878338278932, + "grad_norm": 0.8642500279889467, + "learning_rate": 4.979362064460361e-06, + "loss": 0.4906, + "step": 1089 + }, + { + "epoch": 0.2695351137487636, + "grad_norm": 0.8188279970742318, + "learning_rate": 4.979320366067226e-06, + "loss": 0.4922, + "step": 1090 + }, + { + "epoch": 0.269782393669634, + "grad_norm": 0.8494061066145452, + "learning_rate": 4.979278625766302e-06, + "loss": 0.4373, + "step": 1091 + }, + { + "epoch": 0.27002967359050445, + "grad_norm": 0.8541094055180173, + "learning_rate": 4.979236843558296e-06, + "loss": 0.4982, + "step": 1092 + }, + { + "epoch": 0.2702769535113749, + "grad_norm": 0.8901352777542862, + "learning_rate": 4.979195019443913e-06, + "loss": 0.4895, + "step": 1093 + }, + { + "epoch": 0.2705242334322453, + "grad_norm": 0.8723236124194512, + "learning_rate": 4.9791531534238615e-06, + "loss": 0.4876, + "step": 1094 + }, + { + "epoch": 0.2707715133531157, + "grad_norm": 0.8460275573328927, + "learning_rate": 4.9791112454988485e-06, + "loss": 0.4582, + "step": 1095 + }, + { + "epoch": 0.27101879327398615, + "grad_norm": 0.8032142309223071, + "learning_rate": 4.979069295669582e-06, + "loss": 0.4979, + "step": 1096 + }, + { + "epoch": 0.27126607319485657, + "grad_norm": 0.8984869165646926, + "learning_rate": 4.979027303936771e-06, + "loss": 0.4883, + "step": 1097 + }, + { + "epoch": 0.271513353115727, + "grad_norm": 0.823481716476794, + "learning_rate": 4.9789852703011255e-06, + "loss": 0.4748, + "step": 1098 + }, + { + "epoch": 0.2717606330365974, + "grad_norm": 0.8770522684481887, + "learning_rate": 4.978943194763356e-06, + "loss": 0.4761, + "step": 1099 + }, + { + "epoch": 0.27200791295746785, + "grad_norm": 0.8827483342198571, + "learning_rate": 4.978901077324174e-06, + "loss": 0.5047, + "step": 1100 + }, + { + "epoch": 0.27225519287833827, + "grad_norm": 0.8442680181370851, + "learning_rate": 4.978858917984292e-06, + "loss": 0.476, + "step": 1101 + }, + { + "epoch": 0.2725024727992087, + "grad_norm": 0.8093223452069177, + "learning_rate": 4.9788167167444206e-06, + "loss": 0.4974, + "step": 1102 + }, + { + "epoch": 0.2727497527200791, + "grad_norm": 0.8520597566674003, + "learning_rate": 4.978774473605274e-06, + "loss": 0.4953, + "step": 1103 + }, + { + "epoch": 0.27299703264094954, + "grad_norm": 0.8851794426709616, + "learning_rate": 4.978732188567568e-06, + "loss": 0.4748, + "step": 1104 + }, + { + "epoch": 0.27324431256181997, + "grad_norm": 0.8047095063413441, + "learning_rate": 4.978689861632016e-06, + "loss": 0.4799, + "step": 1105 + }, + { + "epoch": 0.2734915924826904, + "grad_norm": 0.8036515216440242, + "learning_rate": 4.978647492799332e-06, + "loss": 0.4623, + "step": 1106 + }, + { + "epoch": 0.2737388724035608, + "grad_norm": 0.8287473081378811, + "learning_rate": 4.978605082070234e-06, + "loss": 0.4808, + "step": 1107 + }, + { + "epoch": 0.27398615232443124, + "grad_norm": 0.7830078111911496, + "learning_rate": 4.9785626294454385e-06, + "loss": 0.4848, + "step": 1108 + }, + { + "epoch": 0.27423343224530167, + "grad_norm": 0.8796446581845684, + "learning_rate": 4.978520134925663e-06, + "loss": 0.4649, + "step": 1109 + }, + { + "epoch": 0.2744807121661721, + "grad_norm": 0.8207939048014206, + "learning_rate": 4.978477598511625e-06, + "loss": 0.4956, + "step": 1110 + }, + { + "epoch": 0.2747279920870425, + "grad_norm": 0.8206309916991434, + "learning_rate": 4.978435020204045e-06, + "loss": 0.5177, + "step": 1111 + }, + { + "epoch": 0.27497527200791294, + "grad_norm": 0.8349867209882694, + "learning_rate": 4.978392400003642e-06, + "loss": 0.4801, + "step": 1112 + }, + { + "epoch": 0.27522255192878337, + "grad_norm": 0.8467923995371689, + "learning_rate": 4.978349737911136e-06, + "loss": 0.4868, + "step": 1113 + }, + { + "epoch": 0.2754698318496538, + "grad_norm": 0.8578801603556364, + "learning_rate": 4.9783070339272485e-06, + "loss": 0.487, + "step": 1114 + }, + { + "epoch": 0.2757171117705242, + "grad_norm": 0.7810402137613652, + "learning_rate": 4.978264288052701e-06, + "loss": 0.4741, + "step": 1115 + }, + { + "epoch": 0.27596439169139464, + "grad_norm": 0.7896403749725643, + "learning_rate": 4.978221500288217e-06, + "loss": 0.5014, + "step": 1116 + }, + { + "epoch": 0.27621167161226506, + "grad_norm": 0.8440418279389584, + "learning_rate": 4.978178670634518e-06, + "loss": 0.4677, + "step": 1117 + }, + { + "epoch": 0.2764589515331355, + "grad_norm": 0.8645968361942973, + "learning_rate": 4.97813579909233e-06, + "loss": 0.4951, + "step": 1118 + }, + { + "epoch": 0.2767062314540059, + "grad_norm": 0.8850512904631548, + "learning_rate": 4.9780928856623765e-06, + "loss": 0.4813, + "step": 1119 + }, + { + "epoch": 0.27695351137487634, + "grad_norm": 0.8682217080189116, + "learning_rate": 4.978049930345382e-06, + "loss": 0.4832, + "step": 1120 + }, + { + "epoch": 0.27720079129574676, + "grad_norm": 0.8551141090731345, + "learning_rate": 4.978006933142075e-06, + "loss": 0.4796, + "step": 1121 + }, + { + "epoch": 0.2774480712166172, + "grad_norm": 0.8561104145175743, + "learning_rate": 4.97796389405318e-06, + "loss": 0.4945, + "step": 1122 + }, + { + "epoch": 0.2776953511374876, + "grad_norm": 0.8408560414663986, + "learning_rate": 4.977920813079426e-06, + "loss": 0.464, + "step": 1123 + }, + { + "epoch": 0.27794263105835804, + "grad_norm": 0.8654252795240646, + "learning_rate": 4.97787769022154e-06, + "loss": 0.4858, + "step": 1124 + }, + { + "epoch": 0.27818991097922846, + "grad_norm": 0.860275164094403, + "learning_rate": 4.9778345254802505e-06, + "loss": 0.4902, + "step": 1125 + }, + { + "epoch": 0.2784371909000989, + "grad_norm": 0.8232668434710275, + "learning_rate": 4.977791318856289e-06, + "loss": 0.4662, + "step": 1126 + }, + { + "epoch": 0.2786844708209693, + "grad_norm": 0.8368306472164873, + "learning_rate": 4.977748070350385e-06, + "loss": 0.4809, + "step": 1127 + }, + { + "epoch": 0.2789317507418398, + "grad_norm": 0.836130275495686, + "learning_rate": 4.977704779963269e-06, + "loss": 0.4929, + "step": 1128 + }, + { + "epoch": 0.2791790306627102, + "grad_norm": 0.9149228882397445, + "learning_rate": 4.9776614476956735e-06, + "loss": 0.4691, + "step": 1129 + }, + { + "epoch": 0.27942631058358064, + "grad_norm": 0.8068181975659364, + "learning_rate": 4.97761807354833e-06, + "loss": 0.4684, + "step": 1130 + }, + { + "epoch": 0.27967359050445106, + "grad_norm": 0.8422358197300707, + "learning_rate": 4.977574657521973e-06, + "loss": 0.4761, + "step": 1131 + }, + { + "epoch": 0.2799208704253215, + "grad_norm": 0.8502642960030118, + "learning_rate": 4.977531199617335e-06, + "loss": 0.4574, + "step": 1132 + }, + { + "epoch": 0.2801681503461919, + "grad_norm": 0.8596392419555559, + "learning_rate": 4.977487699835151e-06, + "loss": 0.4956, + "step": 1133 + }, + { + "epoch": 0.28041543026706234, + "grad_norm": 0.8457975155369798, + "learning_rate": 4.977444158176157e-06, + "loss": 0.5049, + "step": 1134 + }, + { + "epoch": 0.28066271018793276, + "grad_norm": 0.8237071413210715, + "learning_rate": 4.9774005746410885e-06, + "loss": 0.4795, + "step": 1135 + }, + { + "epoch": 0.2809099901088032, + "grad_norm": 0.8690081793064807, + "learning_rate": 4.977356949230681e-06, + "loss": 0.4831, + "step": 1136 + }, + { + "epoch": 0.2811572700296736, + "grad_norm": 0.8522428926389992, + "learning_rate": 4.977313281945674e-06, + "loss": 0.4606, + "step": 1137 + }, + { + "epoch": 0.28140454995054404, + "grad_norm": 0.8686896451812126, + "learning_rate": 4.977269572786804e-06, + "loss": 0.4681, + "step": 1138 + }, + { + "epoch": 0.28165182987141446, + "grad_norm": 0.8152365489529878, + "learning_rate": 4.9772258217548105e-06, + "loss": 0.4911, + "step": 1139 + }, + { + "epoch": 0.2818991097922849, + "grad_norm": 0.8264760785996421, + "learning_rate": 4.977182028850434e-06, + "loss": 0.4598, + "step": 1140 + }, + { + "epoch": 0.2821463897131553, + "grad_norm": 0.8211689788246055, + "learning_rate": 4.9771381940744114e-06, + "loss": 0.457, + "step": 1141 + }, + { + "epoch": 0.28239366963402573, + "grad_norm": 0.8623747209430598, + "learning_rate": 4.977094317427488e-06, + "loss": 0.4527, + "step": 1142 + }, + { + "epoch": 0.28264094955489616, + "grad_norm": 0.8244132721987166, + "learning_rate": 4.977050398910402e-06, + "loss": 0.4821, + "step": 1143 + }, + { + "epoch": 0.2828882294757666, + "grad_norm": 0.88533076955512, + "learning_rate": 4.977006438523898e-06, + "loss": 0.4614, + "step": 1144 + }, + { + "epoch": 0.283135509396637, + "grad_norm": 0.91782791472328, + "learning_rate": 4.9769624362687175e-06, + "loss": 0.5028, + "step": 1145 + }, + { + "epoch": 0.28338278931750743, + "grad_norm": 0.8483629265141885, + "learning_rate": 4.9769183921456045e-06, + "loss": 0.4735, + "step": 1146 + }, + { + "epoch": 0.28363006923837786, + "grad_norm": 0.8051321989492259, + "learning_rate": 4.976874306155305e-06, + "loss": 0.4827, + "step": 1147 + }, + { + "epoch": 0.2838773491592483, + "grad_norm": 0.9174584209766616, + "learning_rate": 4.9768301782985625e-06, + "loss": 0.4711, + "step": 1148 + }, + { + "epoch": 0.2841246290801187, + "grad_norm": 0.8715629939118485, + "learning_rate": 4.9767860085761234e-06, + "loss": 0.4716, + "step": 1149 + }, + { + "epoch": 0.28437190900098913, + "grad_norm": 0.8190866023796931, + "learning_rate": 4.9767417969887345e-06, + "loss": 0.4717, + "step": 1150 + }, + { + "epoch": 0.28461918892185956, + "grad_norm": 0.8436727789264102, + "learning_rate": 4.976697543537144e-06, + "loss": 0.4655, + "step": 1151 + }, + { + "epoch": 0.28486646884273, + "grad_norm": 0.9569028098073552, + "learning_rate": 4.976653248222097e-06, + "loss": 0.4459, + "step": 1152 + }, + { + "epoch": 0.2851137487636004, + "grad_norm": 0.9183949145964179, + "learning_rate": 4.976608911044345e-06, + "loss": 0.4595, + "step": 1153 + }, + { + "epoch": 0.28536102868447083, + "grad_norm": 0.8331716153606737, + "learning_rate": 4.976564532004636e-06, + "loss": 0.4654, + "step": 1154 + }, + { + "epoch": 0.28560830860534125, + "grad_norm": 0.818296150644725, + "learning_rate": 4.976520111103721e-06, + "loss": 0.4777, + "step": 1155 + }, + { + "epoch": 0.2858555885262117, + "grad_norm": 0.88625253346554, + "learning_rate": 4.976475648342351e-06, + "loss": 0.4807, + "step": 1156 + }, + { + "epoch": 0.2861028684470821, + "grad_norm": 0.8305715999786408, + "learning_rate": 4.976431143721277e-06, + "loss": 0.4482, + "step": 1157 + }, + { + "epoch": 0.28635014836795253, + "grad_norm": 0.8426926584136363, + "learning_rate": 4.976386597241251e-06, + "loss": 0.4872, + "step": 1158 + }, + { + "epoch": 0.28659742828882295, + "grad_norm": 0.8366689422958908, + "learning_rate": 4.976342008903025e-06, + "loss": 0.486, + "step": 1159 + }, + { + "epoch": 0.2868447082096934, + "grad_norm": 0.8956525760405114, + "learning_rate": 4.976297378707355e-06, + "loss": 0.4715, + "step": 1160 + }, + { + "epoch": 0.2870919881305638, + "grad_norm": 0.8272900765226568, + "learning_rate": 4.976252706654995e-06, + "loss": 0.4691, + "step": 1161 + }, + { + "epoch": 0.2873392680514342, + "grad_norm": 0.8050824860585051, + "learning_rate": 4.976207992746699e-06, + "loss": 0.4725, + "step": 1162 + }, + { + "epoch": 0.28758654797230465, + "grad_norm": 0.8407468844517442, + "learning_rate": 4.976163236983223e-06, + "loss": 0.4985, + "step": 1163 + }, + { + "epoch": 0.2878338278931751, + "grad_norm": 0.9432126176475985, + "learning_rate": 4.976118439365324e-06, + "loss": 0.4755, + "step": 1164 + }, + { + "epoch": 0.2880811078140455, + "grad_norm": 0.8343734797289791, + "learning_rate": 4.976073599893758e-06, + "loss": 0.4469, + "step": 1165 + }, + { + "epoch": 0.2883283877349159, + "grad_norm": 0.80957107924281, + "learning_rate": 4.976028718569285e-06, + "loss": 0.479, + "step": 1166 + }, + { + "epoch": 0.28857566765578635, + "grad_norm": 0.7838786090709212, + "learning_rate": 4.975983795392662e-06, + "loss": 0.4671, + "step": 1167 + }, + { + "epoch": 0.2888229475766568, + "grad_norm": 0.8892179393889282, + "learning_rate": 4.975938830364649e-06, + "loss": 0.4629, + "step": 1168 + }, + { + "epoch": 0.2890702274975272, + "grad_norm": 0.8885646660585704, + "learning_rate": 4.975893823486006e-06, + "loss": 0.5188, + "step": 1169 + }, + { + "epoch": 0.2893175074183976, + "grad_norm": 0.8950341542311611, + "learning_rate": 4.975848774757493e-06, + "loss": 0.4882, + "step": 1170 + }, + { + "epoch": 0.28956478733926805, + "grad_norm": 0.8153300668207971, + "learning_rate": 4.975803684179873e-06, + "loss": 0.4822, + "step": 1171 + }, + { + "epoch": 0.2898120672601385, + "grad_norm": 0.8661852061603399, + "learning_rate": 4.975758551753906e-06, + "loss": 0.4655, + "step": 1172 + }, + { + "epoch": 0.2900593471810089, + "grad_norm": 0.8741765156999356, + "learning_rate": 4.975713377480357e-06, + "loss": 0.4802, + "step": 1173 + }, + { + "epoch": 0.2903066271018793, + "grad_norm": 0.8232541934414569, + "learning_rate": 4.975668161359988e-06, + "loss": 0.4991, + "step": 1174 + }, + { + "epoch": 0.29055390702274975, + "grad_norm": 0.8204906769924353, + "learning_rate": 4.9756229033935646e-06, + "loss": 0.4593, + "step": 1175 + }, + { + "epoch": 0.29080118694362017, + "grad_norm": 0.8707746000436765, + "learning_rate": 4.97557760358185e-06, + "loss": 0.4607, + "step": 1176 + }, + { + "epoch": 0.2910484668644906, + "grad_norm": 0.9088228180075838, + "learning_rate": 4.975532261925612e-06, + "loss": 0.479, + "step": 1177 + }, + { + "epoch": 0.291295746785361, + "grad_norm": 0.8474947512406128, + "learning_rate": 4.975486878425616e-06, + "loss": 0.449, + "step": 1178 + }, + { + "epoch": 0.29154302670623145, + "grad_norm": 0.8214047939389825, + "learning_rate": 4.975441453082629e-06, + "loss": 0.5202, + "step": 1179 + }, + { + "epoch": 0.29179030662710187, + "grad_norm": 0.874242317071171, + "learning_rate": 4.9753959858974195e-06, + "loss": 0.5118, + "step": 1180 + }, + { + "epoch": 0.2920375865479723, + "grad_norm": 0.8628389428690536, + "learning_rate": 4.975350476870755e-06, + "loss": 0.4553, + "step": 1181 + }, + { + "epoch": 0.2922848664688427, + "grad_norm": 0.8522753264964066, + "learning_rate": 4.975304926003405e-06, + "loss": 0.4472, + "step": 1182 + }, + { + "epoch": 0.29253214638971314, + "grad_norm": 0.8249790271701399, + "learning_rate": 4.97525933329614e-06, + "loss": 0.4739, + "step": 1183 + }, + { + "epoch": 0.29277942631058357, + "grad_norm": 0.9003269024269399, + "learning_rate": 4.97521369874973e-06, + "loss": 0.4815, + "step": 1184 + }, + { + "epoch": 0.293026706231454, + "grad_norm": 0.920371626612895, + "learning_rate": 4.975168022364948e-06, + "loss": 0.4552, + "step": 1185 + }, + { + "epoch": 0.2932739861523244, + "grad_norm": 0.8453419054277556, + "learning_rate": 4.975122304142564e-06, + "loss": 0.4827, + "step": 1186 + }, + { + "epoch": 0.29352126607319484, + "grad_norm": 0.893824981194909, + "learning_rate": 4.97507654408335e-06, + "loss": 0.4593, + "step": 1187 + }, + { + "epoch": 0.29376854599406527, + "grad_norm": 0.8765416230068477, + "learning_rate": 4.9750307421880825e-06, + "loss": 0.468, + "step": 1188 + }, + { + "epoch": 0.2940158259149357, + "grad_norm": 0.7894978603788854, + "learning_rate": 4.974984898457534e-06, + "loss": 0.486, + "step": 1189 + }, + { + "epoch": 0.2942631058358061, + "grad_norm": 0.8576759645871942, + "learning_rate": 4.9749390128924806e-06, + "loss": 0.5149, + "step": 1190 + }, + { + "epoch": 0.29451038575667654, + "grad_norm": 0.8637297369732642, + "learning_rate": 4.9748930854936955e-06, + "loss": 0.4778, + "step": 1191 + }, + { + "epoch": 0.29475766567754697, + "grad_norm": 0.8527005901061284, + "learning_rate": 4.974847116261957e-06, + "loss": 0.5054, + "step": 1192 + }, + { + "epoch": 0.2950049455984174, + "grad_norm": 0.815567947874674, + "learning_rate": 4.974801105198042e-06, + "loss": 0.4802, + "step": 1193 + }, + { + "epoch": 0.2952522255192878, + "grad_norm": 0.8479219848611034, + "learning_rate": 4.974755052302726e-06, + "loss": 0.4702, + "step": 1194 + }, + { + "epoch": 0.29549950544015824, + "grad_norm": 0.8556137872597406, + "learning_rate": 4.974708957576791e-06, + "loss": 0.4704, + "step": 1195 + }, + { + "epoch": 0.29574678536102866, + "grad_norm": 0.8398783844649267, + "learning_rate": 4.974662821021014e-06, + "loss": 0.4854, + "step": 1196 + }, + { + "epoch": 0.2959940652818991, + "grad_norm": 0.8769661180300431, + "learning_rate": 4.974616642636174e-06, + "loss": 0.502, + "step": 1197 + }, + { + "epoch": 0.2962413452027695, + "grad_norm": 0.846002502686499, + "learning_rate": 4.974570422423053e-06, + "loss": 0.5028, + "step": 1198 + }, + { + "epoch": 0.29648862512363994, + "grad_norm": 0.8732437761139039, + "learning_rate": 4.974524160382433e-06, + "loss": 0.4487, + "step": 1199 + }, + { + "epoch": 0.29673590504451036, + "grad_norm": 0.8534970199568261, + "learning_rate": 4.974477856515094e-06, + "loss": 0.4772, + "step": 1200 + }, + { + "epoch": 0.2969831849653808, + "grad_norm": 0.8100385165833147, + "learning_rate": 4.97443151082182e-06, + "loss": 0.5059, + "step": 1201 + }, + { + "epoch": 0.2972304648862512, + "grad_norm": 0.8427846867481712, + "learning_rate": 4.974385123303394e-06, + "loss": 0.4697, + "step": 1202 + }, + { + "epoch": 0.29747774480712164, + "grad_norm": 0.8733326416501844, + "learning_rate": 4.974338693960599e-06, + "loss": 0.4638, + "step": 1203 + }, + { + "epoch": 0.29772502472799206, + "grad_norm": 0.8599299796559156, + "learning_rate": 4.974292222794223e-06, + "loss": 0.4563, + "step": 1204 + }, + { + "epoch": 0.2979723046488625, + "grad_norm": 0.8424539008316979, + "learning_rate": 4.9742457098050475e-06, + "loss": 0.5017, + "step": 1205 + }, + { + "epoch": 0.29821958456973297, + "grad_norm": 0.8474616781082716, + "learning_rate": 4.974199154993862e-06, + "loss": 0.4874, + "step": 1206 + }, + { + "epoch": 0.2984668644906034, + "grad_norm": 0.8432090258049286, + "learning_rate": 4.974152558361451e-06, + "loss": 0.4536, + "step": 1207 + }, + { + "epoch": 0.2987141444114738, + "grad_norm": 0.8365560807553833, + "learning_rate": 4.9741059199086024e-06, + "loss": 0.4971, + "step": 1208 + }, + { + "epoch": 0.29896142433234424, + "grad_norm": 0.8247988037749963, + "learning_rate": 4.974059239636106e-06, + "loss": 0.4564, + "step": 1209 + }, + { + "epoch": 0.29920870425321466, + "grad_norm": 0.8364405143056981, + "learning_rate": 4.97401251754475e-06, + "loss": 0.539, + "step": 1210 + }, + { + "epoch": 0.2994559841740851, + "grad_norm": 0.8749742019563281, + "learning_rate": 4.973965753635325e-06, + "loss": 0.4773, + "step": 1211 + }, + { + "epoch": 0.2997032640949555, + "grad_norm": 0.8643498007988748, + "learning_rate": 4.97391894790862e-06, + "loss": 0.4521, + "step": 1212 + }, + { + "epoch": 0.29995054401582594, + "grad_norm": 0.8051146754308409, + "learning_rate": 4.973872100365427e-06, + "loss": 0.4659, + "step": 1213 + }, + { + "epoch": 0.30019782393669636, + "grad_norm": 0.8483014608260354, + "learning_rate": 4.973825211006537e-06, + "loss": 0.4753, + "step": 1214 + }, + { + "epoch": 0.3004451038575668, + "grad_norm": 0.8784932670634085, + "learning_rate": 4.973778279832744e-06, + "loss": 0.4907, + "step": 1215 + }, + { + "epoch": 0.3006923837784372, + "grad_norm": 0.8310183524315704, + "learning_rate": 4.97373130684484e-06, + "loss": 0.4672, + "step": 1216 + }, + { + "epoch": 0.30093966369930764, + "grad_norm": 0.8636283603039391, + "learning_rate": 4.973684292043619e-06, + "loss": 0.435, + "step": 1217 + }, + { + "epoch": 0.30118694362017806, + "grad_norm": 0.8389114184976831, + "learning_rate": 4.973637235429877e-06, + "loss": 0.4993, + "step": 1218 + }, + { + "epoch": 0.3014342235410485, + "grad_norm": 0.8429199752469544, + "learning_rate": 4.973590137004408e-06, + "loss": 0.5084, + "step": 1219 + }, + { + "epoch": 0.3016815034619189, + "grad_norm": 0.8648994179752031, + "learning_rate": 4.9735429967680094e-06, + "loss": 0.4584, + "step": 1220 + }, + { + "epoch": 0.30192878338278933, + "grad_norm": 0.8092649880087687, + "learning_rate": 4.973495814721477e-06, + "loss": 0.4748, + "step": 1221 + }, + { + "epoch": 0.30217606330365976, + "grad_norm": 0.8045236519854421, + "learning_rate": 4.9734485908656075e-06, + "loss": 0.5183, + "step": 1222 + }, + { + "epoch": 0.3024233432245302, + "grad_norm": 0.8375848102184394, + "learning_rate": 4.973401325201202e-06, + "loss": 0.4846, + "step": 1223 + }, + { + "epoch": 0.3026706231454006, + "grad_norm": 0.8204333352305591, + "learning_rate": 4.9733540177290566e-06, + "loss": 0.4735, + "step": 1224 + }, + { + "epoch": 0.30291790306627103, + "grad_norm": 0.9320660470455538, + "learning_rate": 4.973306668449971e-06, + "loss": 0.487, + "step": 1225 + }, + { + "epoch": 0.30316518298714146, + "grad_norm": 0.8512691440015686, + "learning_rate": 4.973259277364748e-06, + "loss": 0.4564, + "step": 1226 + }, + { + "epoch": 0.3034124629080119, + "grad_norm": 0.8448716455291453, + "learning_rate": 4.973211844474187e-06, + "loss": 0.4607, + "step": 1227 + }, + { + "epoch": 0.3036597428288823, + "grad_norm": 0.9007565601848737, + "learning_rate": 4.973164369779089e-06, + "loss": 0.4714, + "step": 1228 + }, + { + "epoch": 0.30390702274975273, + "grad_norm": 0.8980928349718376, + "learning_rate": 4.9731168532802586e-06, + "loss": 0.4674, + "step": 1229 + }, + { + "epoch": 0.30415430267062316, + "grad_norm": 0.8184075705545406, + "learning_rate": 4.973069294978497e-06, + "loss": 0.4683, + "step": 1230 + }, + { + "epoch": 0.3044015825914936, + "grad_norm": 0.8307595261131485, + "learning_rate": 4.973021694874609e-06, + "loss": 0.4803, + "step": 1231 + }, + { + "epoch": 0.304648862512364, + "grad_norm": 0.8995162154565732, + "learning_rate": 4.972974052969399e-06, + "loss": 0.4909, + "step": 1232 + }, + { + "epoch": 0.30489614243323443, + "grad_norm": 0.8256622372929362, + "learning_rate": 4.972926369263672e-06, + "loss": 0.5004, + "step": 1233 + }, + { + "epoch": 0.30514342235410485, + "grad_norm": 0.7964903751490071, + "learning_rate": 4.972878643758234e-06, + "loss": 0.5006, + "step": 1234 + }, + { + "epoch": 0.3053907022749753, + "grad_norm": 0.9183004423758536, + "learning_rate": 4.972830876453893e-06, + "loss": 0.4817, + "step": 1235 + }, + { + "epoch": 0.3056379821958457, + "grad_norm": 0.8931482050228089, + "learning_rate": 4.972783067351455e-06, + "loss": 0.4799, + "step": 1236 + }, + { + "epoch": 0.30588526211671613, + "grad_norm": 0.9003322117775642, + "learning_rate": 4.972735216451728e-06, + "loss": 0.4832, + "step": 1237 + }, + { + "epoch": 0.30613254203758655, + "grad_norm": 0.8025764006110967, + "learning_rate": 4.972687323755522e-06, + "loss": 0.4695, + "step": 1238 + }, + { + "epoch": 0.306379821958457, + "grad_norm": 0.8313814899782755, + "learning_rate": 4.972639389263645e-06, + "loss": 0.5017, + "step": 1239 + }, + { + "epoch": 0.3066271018793274, + "grad_norm": 0.8085003588929675, + "learning_rate": 4.97259141297691e-06, + "loss": 0.484, + "step": 1240 + }, + { + "epoch": 0.3068743818001978, + "grad_norm": 0.8789937098823752, + "learning_rate": 4.9725433948961235e-06, + "loss": 0.4629, + "step": 1241 + }, + { + "epoch": 0.30712166172106825, + "grad_norm": 0.8417003920288945, + "learning_rate": 4.972495335022101e-06, + "loss": 0.4892, + "step": 1242 + }, + { + "epoch": 0.3073689416419387, + "grad_norm": 0.8296348241115892, + "learning_rate": 4.972447233355654e-06, + "loss": 0.471, + "step": 1243 + }, + { + "epoch": 0.3076162215628091, + "grad_norm": 0.8383843142341431, + "learning_rate": 4.972399089897594e-06, + "loss": 0.5055, + "step": 1244 + }, + { + "epoch": 0.3078635014836795, + "grad_norm": 0.8684099957350048, + "learning_rate": 4.972350904648736e-06, + "loss": 0.456, + "step": 1245 + }, + { + "epoch": 0.30811078140454995, + "grad_norm": 0.8774377280391846, + "learning_rate": 4.972302677609895e-06, + "loss": 0.4821, + "step": 1246 + }, + { + "epoch": 0.3083580613254204, + "grad_norm": 0.8757062818355338, + "learning_rate": 4.972254408781885e-06, + "loss": 0.4662, + "step": 1247 + }, + { + "epoch": 0.3086053412462908, + "grad_norm": 0.8623536682487267, + "learning_rate": 4.972206098165522e-06, + "loss": 0.4751, + "step": 1248 + }, + { + "epoch": 0.3088526211671612, + "grad_norm": 0.8482930842606974, + "learning_rate": 4.972157745761624e-06, + "loss": 0.4963, + "step": 1249 + }, + { + "epoch": 0.30909990108803165, + "grad_norm": 0.8841543541026415, + "learning_rate": 4.972109351571006e-06, + "loss": 0.4845, + "step": 1250 + }, + { + "epoch": 0.3093471810089021, + "grad_norm": 0.8180021244000164, + "learning_rate": 4.972060915594488e-06, + "loss": 0.5033, + "step": 1251 + }, + { + "epoch": 0.3095944609297725, + "grad_norm": 0.8183146518355443, + "learning_rate": 4.9720124378328885e-06, + "loss": 0.4673, + "step": 1252 + }, + { + "epoch": 0.3098417408506429, + "grad_norm": 0.9126517291897078, + "learning_rate": 4.971963918287026e-06, + "loss": 0.4785, + "step": 1253 + }, + { + "epoch": 0.31008902077151335, + "grad_norm": 0.8475981845701279, + "learning_rate": 4.971915356957721e-06, + "loss": 0.4515, + "step": 1254 + }, + { + "epoch": 0.31033630069238377, + "grad_norm": 0.829251256081981, + "learning_rate": 4.971866753845794e-06, + "loss": 0.4841, + "step": 1255 + }, + { + "epoch": 0.3105835806132542, + "grad_norm": 0.810109516370709, + "learning_rate": 4.971818108952066e-06, + "loss": 0.4957, + "step": 1256 + }, + { + "epoch": 0.3108308605341246, + "grad_norm": 0.8424982626224964, + "learning_rate": 4.9717694222773624e-06, + "loss": 0.4903, + "step": 1257 + }, + { + "epoch": 0.31107814045499504, + "grad_norm": 0.8273938509757517, + "learning_rate": 4.971720693822503e-06, + "loss": 0.4696, + "step": 1258 + }, + { + "epoch": 0.31132542037586547, + "grad_norm": 0.8584209103624083, + "learning_rate": 4.971671923588312e-06, + "loss": 0.4694, + "step": 1259 + }, + { + "epoch": 0.3115727002967359, + "grad_norm": 0.8635606173746708, + "learning_rate": 4.971623111575614e-06, + "loss": 0.4716, + "step": 1260 + }, + { + "epoch": 0.3118199802176063, + "grad_norm": 0.8772316694757156, + "learning_rate": 4.971574257785234e-06, + "loss": 0.4748, + "step": 1261 + }, + { + "epoch": 0.31206726013847674, + "grad_norm": 0.9346025894181195, + "learning_rate": 4.971525362217998e-06, + "loss": 0.4853, + "step": 1262 + }, + { + "epoch": 0.31231454005934717, + "grad_norm": 0.8196238081047258, + "learning_rate": 4.971476424874733e-06, + "loss": 0.4775, + "step": 1263 + }, + { + "epoch": 0.3125618199802176, + "grad_norm": 0.8501168249933853, + "learning_rate": 4.971427445756265e-06, + "loss": 0.485, + "step": 1264 + }, + { + "epoch": 0.312809099901088, + "grad_norm": 0.8453782824523821, + "learning_rate": 4.971378424863423e-06, + "loss": 0.4672, + "step": 1265 + }, + { + "epoch": 0.31305637982195844, + "grad_norm": 0.8316282468393369, + "learning_rate": 4.971329362197035e-06, + "loss": 0.4592, + "step": 1266 + }, + { + "epoch": 0.31330365974282887, + "grad_norm": 0.8913623594613578, + "learning_rate": 4.97128025775793e-06, + "loss": 0.4562, + "step": 1267 + }, + { + "epoch": 0.3135509396636993, + "grad_norm": 0.8805133912519674, + "learning_rate": 4.971231111546939e-06, + "loss": 0.4554, + "step": 1268 + }, + { + "epoch": 0.3137982195845697, + "grad_norm": 0.8378082421317632, + "learning_rate": 4.971181923564892e-06, + "loss": 0.4898, + "step": 1269 + }, + { + "epoch": 0.31404549950544014, + "grad_norm": 0.9098239301913463, + "learning_rate": 4.97113269381262e-06, + "loss": 0.507, + "step": 1270 + }, + { + "epoch": 0.31429277942631056, + "grad_norm": 0.8805091022613049, + "learning_rate": 4.971083422290956e-06, + "loss": 0.459, + "step": 1271 + }, + { + "epoch": 0.314540059347181, + "grad_norm": 0.8558333892198907, + "learning_rate": 4.971034109000732e-06, + "loss": 0.4786, + "step": 1272 + }, + { + "epoch": 0.3147873392680514, + "grad_norm": 0.830002999251238, + "learning_rate": 4.970984753942783e-06, + "loss": 0.4449, + "step": 1273 + }, + { + "epoch": 0.31503461918892184, + "grad_norm": 0.8401177086898065, + "learning_rate": 4.970935357117941e-06, + "loss": 0.5094, + "step": 1274 + }, + { + "epoch": 0.31528189910979226, + "grad_norm": 0.8369130683042246, + "learning_rate": 4.9708859185270435e-06, + "loss": 0.4823, + "step": 1275 + }, + { + "epoch": 0.3155291790306627, + "grad_norm": 0.8319862899317088, + "learning_rate": 4.970836438170924e-06, + "loss": 0.4855, + "step": 1276 + }, + { + "epoch": 0.3157764589515331, + "grad_norm": 0.8244796578240013, + "learning_rate": 4.97078691605042e-06, + "loss": 0.4527, + "step": 1277 + }, + { + "epoch": 0.31602373887240354, + "grad_norm": 0.8239294602758832, + "learning_rate": 4.970737352166368e-06, + "loss": 0.4307, + "step": 1278 + }, + { + "epoch": 0.31627101879327396, + "grad_norm": 0.812065916663278, + "learning_rate": 4.970687746519607e-06, + "loss": 0.4816, + "step": 1279 + }, + { + "epoch": 0.3165182987141444, + "grad_norm": 0.8330917454579297, + "learning_rate": 4.970638099110974e-06, + "loss": 0.5176, + "step": 1280 + }, + { + "epoch": 0.3167655786350148, + "grad_norm": 0.8260932984000681, + "learning_rate": 4.970588409941308e-06, + "loss": 0.4897, + "step": 1281 + }, + { + "epoch": 0.31701285855588524, + "grad_norm": 0.8279566577283952, + "learning_rate": 4.9705386790114505e-06, + "loss": 0.4774, + "step": 1282 + }, + { + "epoch": 0.31726013847675566, + "grad_norm": 0.8146960068506958, + "learning_rate": 4.970488906322241e-06, + "loss": 0.4885, + "step": 1283 + }, + { + "epoch": 0.31750741839762614, + "grad_norm": 0.855983788309996, + "learning_rate": 4.970439091874521e-06, + "loss": 0.4864, + "step": 1284 + }, + { + "epoch": 0.31775469831849656, + "grad_norm": 0.8477308612881375, + "learning_rate": 4.970389235669133e-06, + "loss": 0.4884, + "step": 1285 + }, + { + "epoch": 0.318001978239367, + "grad_norm": 0.8379968825304376, + "learning_rate": 4.97033933770692e-06, + "loss": 0.4715, + "step": 1286 + }, + { + "epoch": 0.3182492581602374, + "grad_norm": 0.8551487364632888, + "learning_rate": 4.970289397988724e-06, + "loss": 0.4888, + "step": 1287 + }, + { + "epoch": 0.31849653808110784, + "grad_norm": 0.8350034977215137, + "learning_rate": 4.970239416515389e-06, + "loss": 0.4842, + "step": 1288 + }, + { + "epoch": 0.31874381800197826, + "grad_norm": 0.8237669521560471, + "learning_rate": 4.970189393287761e-06, + "loss": 0.4764, + "step": 1289 + }, + { + "epoch": 0.3189910979228487, + "grad_norm": 0.8858090914826243, + "learning_rate": 4.970139328306686e-06, + "loss": 0.4789, + "step": 1290 + }, + { + "epoch": 0.3192383778437191, + "grad_norm": 0.8106320226692121, + "learning_rate": 4.970089221573008e-06, + "loss": 0.4689, + "step": 1291 + }, + { + "epoch": 0.31948565776458954, + "grad_norm": 0.8175678750319051, + "learning_rate": 4.970039073087577e-06, + "loss": 0.4512, + "step": 1292 + }, + { + "epoch": 0.31973293768545996, + "grad_norm": 0.7995128955825734, + "learning_rate": 4.969988882851238e-06, + "loss": 0.4832, + "step": 1293 + }, + { + "epoch": 0.3199802176063304, + "grad_norm": 0.799837226959758, + "learning_rate": 4.969938650864841e-06, + "loss": 0.4528, + "step": 1294 + }, + { + "epoch": 0.3202274975272008, + "grad_norm": 0.834121167692778, + "learning_rate": 4.969888377129234e-06, + "loss": 0.4597, + "step": 1295 + }, + { + "epoch": 0.32047477744807124, + "grad_norm": 0.8653081086264667, + "learning_rate": 4.969838061645268e-06, + "loss": 0.4595, + "step": 1296 + }, + { + "epoch": 0.32072205736894166, + "grad_norm": 0.8673155533135949, + "learning_rate": 4.969787704413792e-06, + "loss": 0.4812, + "step": 1297 + }, + { + "epoch": 0.3209693372898121, + "grad_norm": 0.8122719817063542, + "learning_rate": 4.969737305435658e-06, + "loss": 0.4781, + "step": 1298 + }, + { + "epoch": 0.3212166172106825, + "grad_norm": 0.8047716005193889, + "learning_rate": 4.969686864711718e-06, + "loss": 0.4999, + "step": 1299 + }, + { + "epoch": 0.32146389713155293, + "grad_norm": 0.8407561134099769, + "learning_rate": 4.969636382242825e-06, + "loss": 0.4627, + "step": 1300 + }, + { + "epoch": 0.32171117705242336, + "grad_norm": 0.8419429745734757, + "learning_rate": 4.969585858029831e-06, + "loss": 0.4729, + "step": 1301 + }, + { + "epoch": 0.3219584569732938, + "grad_norm": 0.8166830456200092, + "learning_rate": 4.96953529207359e-06, + "loss": 0.4871, + "step": 1302 + }, + { + "epoch": 0.3222057368941642, + "grad_norm": 0.7917900543444013, + "learning_rate": 4.969484684374959e-06, + "loss": 0.4935, + "step": 1303 + }, + { + "epoch": 0.32245301681503463, + "grad_norm": 0.9590690166755755, + "learning_rate": 4.9694340349347904e-06, + "loss": 0.429, + "step": 1304 + }, + { + "epoch": 0.32270029673590506, + "grad_norm": 0.8500944513695028, + "learning_rate": 4.969383343753943e-06, + "loss": 0.456, + "step": 1305 + }, + { + "epoch": 0.3229475766567755, + "grad_norm": 0.9050469051293906, + "learning_rate": 4.9693326108332716e-06, + "loss": 0.4792, + "step": 1306 + }, + { + "epoch": 0.3231948565776459, + "grad_norm": 0.8340987878049144, + "learning_rate": 4.969281836173635e-06, + "loss": 0.4722, + "step": 1307 + }, + { + "epoch": 0.32344213649851633, + "grad_norm": 0.8987401808532778, + "learning_rate": 4.969231019775891e-06, + "loss": 0.4293, + "step": 1308 + }, + { + "epoch": 0.32368941641938676, + "grad_norm": 0.8581280019324882, + "learning_rate": 4.969180161640898e-06, + "loss": 0.4519, + "step": 1309 + }, + { + "epoch": 0.3239366963402572, + "grad_norm": 0.8465852233831679, + "learning_rate": 4.9691292617695165e-06, + "loss": 0.4632, + "step": 1310 + }, + { + "epoch": 0.3241839762611276, + "grad_norm": 0.8182499069557968, + "learning_rate": 4.969078320162607e-06, + "loss": 0.4673, + "step": 1311 + }, + { + "epoch": 0.32443125618199803, + "grad_norm": 0.8583292489781121, + "learning_rate": 4.969027336821029e-06, + "loss": 0.4787, + "step": 1312 + }, + { + "epoch": 0.32467853610286845, + "grad_norm": 0.9144514645243177, + "learning_rate": 4.968976311745647e-06, + "loss": 0.4611, + "step": 1313 + }, + { + "epoch": 0.3249258160237389, + "grad_norm": 0.8542093569498418, + "learning_rate": 4.96892524493732e-06, + "loss": 0.4807, + "step": 1314 + }, + { + "epoch": 0.3251730959446093, + "grad_norm": 0.8531682089560951, + "learning_rate": 4.968874136396914e-06, + "loss": 0.463, + "step": 1315 + }, + { + "epoch": 0.3254203758654797, + "grad_norm": 0.8740204340015105, + "learning_rate": 4.968822986125292e-06, + "loss": 0.4723, + "step": 1316 + }, + { + "epoch": 0.32566765578635015, + "grad_norm": 0.7890203994099924, + "learning_rate": 4.968771794123318e-06, + "loss": 0.4811, + "step": 1317 + }, + { + "epoch": 0.3259149357072206, + "grad_norm": 0.8646734697790014, + "learning_rate": 4.968720560391859e-06, + "loss": 0.4412, + "step": 1318 + }, + { + "epoch": 0.326162215628091, + "grad_norm": 0.8417898419858085, + "learning_rate": 4.968669284931779e-06, + "loss": 0.4992, + "step": 1319 + }, + { + "epoch": 0.3264094955489614, + "grad_norm": 0.8403085258475721, + "learning_rate": 4.968617967743945e-06, + "loss": 0.4671, + "step": 1320 + }, + { + "epoch": 0.32665677546983185, + "grad_norm": 0.8161124612225035, + "learning_rate": 4.968566608829225e-06, + "loss": 0.4612, + "step": 1321 + }, + { + "epoch": 0.3269040553907023, + "grad_norm": 0.8176140395301841, + "learning_rate": 4.968515208188487e-06, + "loss": 0.4773, + "step": 1322 + }, + { + "epoch": 0.3271513353115727, + "grad_norm": 0.8612121577584373, + "learning_rate": 4.9684637658226e-06, + "loss": 0.4636, + "step": 1323 + }, + { + "epoch": 0.3273986152324431, + "grad_norm": 0.7636475891146703, + "learning_rate": 4.968412281732433e-06, + "loss": 0.4591, + "step": 1324 + }, + { + "epoch": 0.32764589515331355, + "grad_norm": 0.8356407912568218, + "learning_rate": 4.968360755918858e-06, + "loss": 0.4654, + "step": 1325 + }, + { + "epoch": 0.327893175074184, + "grad_norm": 0.8130038980473393, + "learning_rate": 4.968309188382743e-06, + "loss": 0.493, + "step": 1326 + }, + { + "epoch": 0.3281404549950544, + "grad_norm": 0.8073046510001293, + "learning_rate": 4.968257579124962e-06, + "loss": 0.4621, + "step": 1327 + }, + { + "epoch": 0.3283877349159248, + "grad_norm": 0.8016870786489134, + "learning_rate": 4.968205928146386e-06, + "loss": 0.4643, + "step": 1328 + }, + { + "epoch": 0.32863501483679525, + "grad_norm": 0.8448895040799353, + "learning_rate": 4.968154235447889e-06, + "loss": 0.4859, + "step": 1329 + }, + { + "epoch": 0.32888229475766567, + "grad_norm": 0.8191604076948825, + "learning_rate": 4.9681025010303445e-06, + "loss": 0.4691, + "step": 1330 + }, + { + "epoch": 0.3291295746785361, + "grad_norm": 0.8128752296677418, + "learning_rate": 4.968050724894626e-06, + "loss": 0.4621, + "step": 1331 + }, + { + "epoch": 0.3293768545994065, + "grad_norm": 0.843228610985233, + "learning_rate": 4.9679989070416106e-06, + "loss": 0.442, + "step": 1332 + }, + { + "epoch": 0.32962413452027695, + "grad_norm": 0.7963756355605212, + "learning_rate": 4.967947047472172e-06, + "loss": 0.4616, + "step": 1333 + }, + { + "epoch": 0.32987141444114737, + "grad_norm": 0.839654855859208, + "learning_rate": 4.967895146187189e-06, + "loss": 0.4643, + "step": 1334 + }, + { + "epoch": 0.3301186943620178, + "grad_norm": 0.8274671903075769, + "learning_rate": 4.967843203187537e-06, + "loss": 0.4426, + "step": 1335 + }, + { + "epoch": 0.3303659742828882, + "grad_norm": 0.8799303601806644, + "learning_rate": 4.967791218474095e-06, + "loss": 0.4644, + "step": 1336 + }, + { + "epoch": 0.33061325420375864, + "grad_norm": 0.8179823637800756, + "learning_rate": 4.967739192047741e-06, + "loss": 0.4659, + "step": 1337 + }, + { + "epoch": 0.33086053412462907, + "grad_norm": 0.8195725524565542, + "learning_rate": 4.967687123909355e-06, + "loss": 0.5096, + "step": 1338 + }, + { + "epoch": 0.3311078140454995, + "grad_norm": 0.84197720805029, + "learning_rate": 4.9676350140598165e-06, + "loss": 0.4727, + "step": 1339 + }, + { + "epoch": 0.3313550939663699, + "grad_norm": 0.8260250113198276, + "learning_rate": 4.9675828625000065e-06, + "loss": 0.4779, + "step": 1340 + }, + { + "epoch": 0.33160237388724034, + "grad_norm": 0.8880207738324564, + "learning_rate": 4.967530669230808e-06, + "loss": 0.4368, + "step": 1341 + }, + { + "epoch": 0.33184965380811077, + "grad_norm": 0.8222910498101001, + "learning_rate": 4.967478434253101e-06, + "loss": 0.4881, + "step": 1342 + }, + { + "epoch": 0.3320969337289812, + "grad_norm": 0.9239814524826327, + "learning_rate": 4.9674261575677696e-06, + "loss": 0.4394, + "step": 1343 + }, + { + "epoch": 0.3323442136498516, + "grad_norm": 0.8162367375969642, + "learning_rate": 4.967373839175696e-06, + "loss": 0.43, + "step": 1344 + }, + { + "epoch": 0.33259149357072204, + "grad_norm": 0.8420930848992175, + "learning_rate": 4.967321479077768e-06, + "loss": 0.4708, + "step": 1345 + }, + { + "epoch": 0.33283877349159247, + "grad_norm": 0.8299200362774546, + "learning_rate": 4.967269077274867e-06, + "loss": 0.502, + "step": 1346 + }, + { + "epoch": 0.3330860534124629, + "grad_norm": 0.8903658765485658, + "learning_rate": 4.96721663376788e-06, + "loss": 0.4873, + "step": 1347 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.842432576192086, + "learning_rate": 4.967164148557694e-06, + "loss": 0.4523, + "step": 1348 + }, + { + "epoch": 0.33358061325420374, + "grad_norm": 0.8784188816986657, + "learning_rate": 4.967111621645195e-06, + "loss": 0.4648, + "step": 1349 + }, + { + "epoch": 0.33382789317507416, + "grad_norm": 0.8890735207690358, + "learning_rate": 4.967059053031272e-06, + "loss": 0.4735, + "step": 1350 + }, + { + "epoch": 0.3340751730959446, + "grad_norm": 0.9139484525999664, + "learning_rate": 4.967006442716814e-06, + "loss": 0.4811, + "step": 1351 + }, + { + "epoch": 0.334322453016815, + "grad_norm": 0.8432725929225718, + "learning_rate": 4.966953790702709e-06, + "loss": 0.4814, + "step": 1352 + }, + { + "epoch": 0.33456973293768544, + "grad_norm": 0.7968350339816171, + "learning_rate": 4.9669010969898465e-06, + "loss": 0.4945, + "step": 1353 + }, + { + "epoch": 0.33481701285855586, + "grad_norm": 0.8448439028983895, + "learning_rate": 4.966848361579119e-06, + "loss": 0.471, + "step": 1354 + }, + { + "epoch": 0.3350642927794263, + "grad_norm": 0.8700792089452078, + "learning_rate": 4.966795584471417e-06, + "loss": 0.4778, + "step": 1355 + }, + { + "epoch": 0.3353115727002967, + "grad_norm": 0.8576573597500617, + "learning_rate": 4.966742765667632e-06, + "loss": 0.439, + "step": 1356 + }, + { + "epoch": 0.33555885262116714, + "grad_norm": 0.8412617464584977, + "learning_rate": 4.9666899051686565e-06, + "loss": 0.4579, + "step": 1357 + }, + { + "epoch": 0.33580613254203756, + "grad_norm": 0.8660641241003527, + "learning_rate": 4.966637002975387e-06, + "loss": 0.4947, + "step": 1358 + }, + { + "epoch": 0.336053412462908, + "grad_norm": 0.8784136045891929, + "learning_rate": 4.966584059088714e-06, + "loss": 0.4509, + "step": 1359 + }, + { + "epoch": 0.3363006923837784, + "grad_norm": 0.8575154859392102, + "learning_rate": 4.966531073509534e-06, + "loss": 0.4628, + "step": 1360 + }, + { + "epoch": 0.3365479723046489, + "grad_norm": 0.8558968082320106, + "learning_rate": 4.966478046238742e-06, + "loss": 0.4539, + "step": 1361 + }, + { + "epoch": 0.3367952522255193, + "grad_norm": 0.8463118657623281, + "learning_rate": 4.966424977277236e-06, + "loss": 0.4874, + "step": 1362 + }, + { + "epoch": 0.33704253214638974, + "grad_norm": 0.8582147987913828, + "learning_rate": 4.966371866625912e-06, + "loss": 0.4827, + "step": 1363 + }, + { + "epoch": 0.33728981206726016, + "grad_norm": 0.8425557042931806, + "learning_rate": 4.966318714285667e-06, + "loss": 0.5158, + "step": 1364 + }, + { + "epoch": 0.3375370919881306, + "grad_norm": 0.8198957499784623, + "learning_rate": 4.966265520257399e-06, + "loss": 0.4602, + "step": 1365 + }, + { + "epoch": 0.337784371909001, + "grad_norm": 0.901675824405454, + "learning_rate": 4.9662122845420105e-06, + "loss": 0.4638, + "step": 1366 + }, + { + "epoch": 0.33803165182987144, + "grad_norm": 0.8421039706556158, + "learning_rate": 4.9661590071403975e-06, + "loss": 0.4773, + "step": 1367 + }, + { + "epoch": 0.33827893175074186, + "grad_norm": 0.8881502365059725, + "learning_rate": 4.966105688053462e-06, + "loss": 0.4977, + "step": 1368 + }, + { + "epoch": 0.3385262116716123, + "grad_norm": 0.8286924700026043, + "learning_rate": 4.966052327282106e-06, + "loss": 0.4528, + "step": 1369 + }, + { + "epoch": 0.3387734915924827, + "grad_norm": 0.815530177047265, + "learning_rate": 4.96599892482723e-06, + "loss": 0.4678, + "step": 1370 + }, + { + "epoch": 0.33902077151335314, + "grad_norm": 0.8498159029250026, + "learning_rate": 4.965945480689738e-06, + "loss": 0.4797, + "step": 1371 + }, + { + "epoch": 0.33926805143422356, + "grad_norm": 0.837998064753534, + "learning_rate": 4.965891994870533e-06, + "loss": 0.4518, + "step": 1372 + }, + { + "epoch": 0.339515331355094, + "grad_norm": 0.8571090602934665, + "learning_rate": 4.965838467370518e-06, + "loss": 0.4516, + "step": 1373 + }, + { + "epoch": 0.3397626112759644, + "grad_norm": 0.9212861435307308, + "learning_rate": 4.9657848981905985e-06, + "loss": 0.4589, + "step": 1374 + }, + { + "epoch": 0.34000989119683483, + "grad_norm": 0.8646273616764547, + "learning_rate": 4.9657312873316806e-06, + "loss": 0.4656, + "step": 1375 + }, + { + "epoch": 0.34025717111770526, + "grad_norm": 0.8649249854703567, + "learning_rate": 4.965677634794671e-06, + "loss": 0.4678, + "step": 1376 + }, + { + "epoch": 0.3405044510385757, + "grad_norm": 0.8280715209748639, + "learning_rate": 4.965623940580474e-06, + "loss": 0.4806, + "step": 1377 + }, + { + "epoch": 0.3407517309594461, + "grad_norm": 0.8540637518211892, + "learning_rate": 4.965570204689999e-06, + "loss": 0.466, + "step": 1378 + }, + { + "epoch": 0.34099901088031653, + "grad_norm": 0.8722297658167012, + "learning_rate": 4.965516427124155e-06, + "loss": 0.4912, + "step": 1379 + }, + { + "epoch": 0.34124629080118696, + "grad_norm": 0.8324087882341675, + "learning_rate": 4.965462607883849e-06, + "loss": 0.4282, + "step": 1380 + }, + { + "epoch": 0.3414935707220574, + "grad_norm": 0.8542616511213038, + "learning_rate": 4.965408746969993e-06, + "loss": 0.4565, + "step": 1381 + }, + { + "epoch": 0.3417408506429278, + "grad_norm": 0.8897532286442891, + "learning_rate": 4.965354844383494e-06, + "loss": 0.4476, + "step": 1382 + }, + { + "epoch": 0.34198813056379823, + "grad_norm": 0.900818254020432, + "learning_rate": 4.965300900125267e-06, + "loss": 0.4626, + "step": 1383 + }, + { + "epoch": 0.34223541048466866, + "grad_norm": 0.8121891811895046, + "learning_rate": 4.965246914196222e-06, + "loss": 0.4723, + "step": 1384 + }, + { + "epoch": 0.3424826904055391, + "grad_norm": 0.8085099498992192, + "learning_rate": 4.965192886597271e-06, + "loss": 0.4969, + "step": 1385 + }, + { + "epoch": 0.3427299703264095, + "grad_norm": 0.8169789083197132, + "learning_rate": 4.965138817329328e-06, + "loss": 0.4983, + "step": 1386 + }, + { + "epoch": 0.34297725024727993, + "grad_norm": 0.8413101020719048, + "learning_rate": 4.965084706393307e-06, + "loss": 0.4624, + "step": 1387 + }, + { + "epoch": 0.34322453016815035, + "grad_norm": 0.8554114796569622, + "learning_rate": 4.965030553790123e-06, + "loss": 0.4847, + "step": 1388 + }, + { + "epoch": 0.3434718100890208, + "grad_norm": 0.883414822387632, + "learning_rate": 4.964976359520689e-06, + "loss": 0.4873, + "step": 1389 + }, + { + "epoch": 0.3437190900098912, + "grad_norm": 0.9225841299701298, + "learning_rate": 4.964922123585924e-06, + "loss": 0.4517, + "step": 1390 + }, + { + "epoch": 0.34396636993076163, + "grad_norm": 0.836069523426395, + "learning_rate": 4.964867845986742e-06, + "loss": 0.508, + "step": 1391 + }, + { + "epoch": 0.34421364985163205, + "grad_norm": 0.8230356770161646, + "learning_rate": 4.964813526724064e-06, + "loss": 0.4811, + "step": 1392 + }, + { + "epoch": 0.3444609297725025, + "grad_norm": 0.8144205084918253, + "learning_rate": 4.964759165798806e-06, + "loss": 0.4746, + "step": 1393 + }, + { + "epoch": 0.3447082096933729, + "grad_norm": 0.848243953448798, + "learning_rate": 4.964704763211886e-06, + "loss": 0.4605, + "step": 1394 + }, + { + "epoch": 0.3449554896142433, + "grad_norm": 0.862441294591035, + "learning_rate": 4.964650318964224e-06, + "loss": 0.4261, + "step": 1395 + }, + { + "epoch": 0.34520276953511375, + "grad_norm": 0.823727505626556, + "learning_rate": 4.964595833056742e-06, + "loss": 0.4542, + "step": 1396 + }, + { + "epoch": 0.3454500494559842, + "grad_norm": 0.8461579909245877, + "learning_rate": 4.964541305490359e-06, + "loss": 0.4642, + "step": 1397 + }, + { + "epoch": 0.3456973293768546, + "grad_norm": 0.8682318696565303, + "learning_rate": 4.964486736265998e-06, + "loss": 0.4619, + "step": 1398 + }, + { + "epoch": 0.345944609297725, + "grad_norm": 0.8154599041496469, + "learning_rate": 4.964432125384581e-06, + "loss": 0.49, + "step": 1399 + }, + { + "epoch": 0.34619188921859545, + "grad_norm": 0.8521804504909071, + "learning_rate": 4.96437747284703e-06, + "loss": 0.4339, + "step": 1400 + }, + { + "epoch": 0.3464391691394659, + "grad_norm": 0.8556600517889699, + "learning_rate": 4.964322778654271e-06, + "loss": 0.4447, + "step": 1401 + }, + { + "epoch": 0.3466864490603363, + "grad_norm": 0.8688194992174506, + "learning_rate": 4.964268042807227e-06, + "loss": 0.4644, + "step": 1402 + }, + { + "epoch": 0.3469337289812067, + "grad_norm": 0.8017975295806874, + "learning_rate": 4.9642132653068224e-06, + "loss": 0.4561, + "step": 1403 + }, + { + "epoch": 0.34718100890207715, + "grad_norm": 0.8536874452765987, + "learning_rate": 4.964158446153985e-06, + "loss": 0.4648, + "step": 1404 + }, + { + "epoch": 0.3474282888229476, + "grad_norm": 0.8189127192711046, + "learning_rate": 4.964103585349639e-06, + "loss": 0.4388, + "step": 1405 + }, + { + "epoch": 0.347675568743818, + "grad_norm": 0.8705939326174833, + "learning_rate": 4.9640486828947146e-06, + "loss": 0.4596, + "step": 1406 + }, + { + "epoch": 0.3479228486646884, + "grad_norm": 0.885106043728194, + "learning_rate": 4.963993738790138e-06, + "loss": 0.4743, + "step": 1407 + }, + { + "epoch": 0.34817012858555885, + "grad_norm": 0.8332350836081958, + "learning_rate": 4.963938753036839e-06, + "loss": 0.4795, + "step": 1408 + }, + { + "epoch": 0.34841740850642927, + "grad_norm": 0.8151625206936463, + "learning_rate": 4.963883725635746e-06, + "loss": 0.4588, + "step": 1409 + }, + { + "epoch": 0.3486646884272997, + "grad_norm": 0.8584186317187182, + "learning_rate": 4.963828656587789e-06, + "loss": 0.4609, + "step": 1410 + }, + { + "epoch": 0.3489119683481701, + "grad_norm": 0.8374239839439461, + "learning_rate": 4.9637735458939e-06, + "loss": 0.4663, + "step": 1411 + }, + { + "epoch": 0.34915924826904055, + "grad_norm": 0.8335071629302205, + "learning_rate": 4.96371839355501e-06, + "loss": 0.4976, + "step": 1412 + }, + { + "epoch": 0.34940652818991097, + "grad_norm": 0.8566093656507006, + "learning_rate": 4.96366319957205e-06, + "loss": 0.4427, + "step": 1413 + }, + { + "epoch": 0.3496538081107814, + "grad_norm": 0.8797486028317246, + "learning_rate": 4.963607963945954e-06, + "loss": 0.4638, + "step": 1414 + }, + { + "epoch": 0.3499010880316518, + "grad_norm": 0.7973883458360015, + "learning_rate": 4.963552686677656e-06, + "loss": 0.4756, + "step": 1415 + }, + { + "epoch": 0.35014836795252224, + "grad_norm": 0.8605355070813538, + "learning_rate": 4.963497367768091e-06, + "loss": 0.4937, + "step": 1416 + }, + { + "epoch": 0.35039564787339267, + "grad_norm": 0.8087052810593318, + "learning_rate": 4.9634420072181925e-06, + "loss": 0.5043, + "step": 1417 + }, + { + "epoch": 0.3506429277942631, + "grad_norm": 0.8042980991613009, + "learning_rate": 4.963386605028897e-06, + "loss": 0.4866, + "step": 1418 + }, + { + "epoch": 0.3508902077151335, + "grad_norm": 0.8397881265192508, + "learning_rate": 4.96333116120114e-06, + "loss": 0.4775, + "step": 1419 + }, + { + "epoch": 0.35113748763600394, + "grad_norm": 0.8229969968530699, + "learning_rate": 4.963275675735859e-06, + "loss": 0.4909, + "step": 1420 + }, + { + "epoch": 0.35138476755687437, + "grad_norm": 0.8582458934680285, + "learning_rate": 4.963220148633994e-06, + "loss": 0.4483, + "step": 1421 + }, + { + "epoch": 0.3516320474777448, + "grad_norm": 0.8509944642538084, + "learning_rate": 4.963164579896481e-06, + "loss": 0.4931, + "step": 1422 + }, + { + "epoch": 0.3518793273986152, + "grad_norm": 0.7778259466986965, + "learning_rate": 4.963108969524261e-06, + "loss": 0.4506, + "step": 1423 + }, + { + "epoch": 0.35212660731948564, + "grad_norm": 0.806063787046905, + "learning_rate": 4.9630533175182714e-06, + "loss": 0.4828, + "step": 1424 + }, + { + "epoch": 0.35237388724035607, + "grad_norm": 0.8595506336846482, + "learning_rate": 4.962997623879456e-06, + "loss": 0.4606, + "step": 1425 + }, + { + "epoch": 0.3526211671612265, + "grad_norm": 0.8324046367039577, + "learning_rate": 4.962941888608754e-06, + "loss": 0.4489, + "step": 1426 + }, + { + "epoch": 0.3528684470820969, + "grad_norm": 0.8519133065926907, + "learning_rate": 4.9628861117071095e-06, + "loss": 0.4664, + "step": 1427 + }, + { + "epoch": 0.35311572700296734, + "grad_norm": 0.7904079696015989, + "learning_rate": 4.962830293175463e-06, + "loss": 0.4657, + "step": 1428 + }, + { + "epoch": 0.35336300692383776, + "grad_norm": 0.8232281020750102, + "learning_rate": 4.96277443301476e-06, + "loss": 0.444, + "step": 1429 + }, + { + "epoch": 0.3536102868447082, + "grad_norm": 0.8535424230625128, + "learning_rate": 4.962718531225942e-06, + "loss": 0.465, + "step": 1430 + }, + { + "epoch": 0.3538575667655786, + "grad_norm": 0.8615311397260789, + "learning_rate": 4.962662587809957e-06, + "loss": 0.4482, + "step": 1431 + }, + { + "epoch": 0.35410484668644904, + "grad_norm": 0.8179287695934544, + "learning_rate": 4.9626066027677496e-06, + "loss": 0.4796, + "step": 1432 + }, + { + "epoch": 0.35435212660731946, + "grad_norm": 0.8435396498957267, + "learning_rate": 4.962550576100265e-06, + "loss": 0.4504, + "step": 1433 + }, + { + "epoch": 0.3545994065281899, + "grad_norm": 0.8045830988187997, + "learning_rate": 4.962494507808452e-06, + "loss": 0.4718, + "step": 1434 + }, + { + "epoch": 0.3548466864490603, + "grad_norm": 0.7973282650780453, + "learning_rate": 4.962438397893256e-06, + "loss": 0.5084, + "step": 1435 + }, + { + "epoch": 0.35509396636993074, + "grad_norm": 0.81788673220588, + "learning_rate": 4.962382246355628e-06, + "loss": 0.4762, + "step": 1436 + }, + { + "epoch": 0.35534124629080116, + "grad_norm": 0.8191220728313339, + "learning_rate": 4.962326053196515e-06, + "loss": 0.459, + "step": 1437 + }, + { + "epoch": 0.3555885262116716, + "grad_norm": 0.8418837776836511, + "learning_rate": 4.9622698184168684e-06, + "loss": 0.4832, + "step": 1438 + }, + { + "epoch": 0.35583580613254207, + "grad_norm": 0.822644289062169, + "learning_rate": 4.962213542017638e-06, + "loss": 0.4891, + "step": 1439 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 0.8055518425958926, + "learning_rate": 4.962157223999774e-06, + "loss": 0.4805, + "step": 1440 + }, + { + "epoch": 0.3563303659742829, + "grad_norm": 0.8663900660198538, + "learning_rate": 4.962100864364231e-06, + "loss": 0.4414, + "step": 1441 + }, + { + "epoch": 0.35657764589515334, + "grad_norm": 0.8065354382371167, + "learning_rate": 4.962044463111959e-06, + "loss": 0.472, + "step": 1442 + }, + { + "epoch": 0.35682492581602376, + "grad_norm": 0.8429417885865965, + "learning_rate": 4.961988020243913e-06, + "loss": 0.4802, + "step": 1443 + }, + { + "epoch": 0.3570722057368942, + "grad_norm": 0.8628035030884751, + "learning_rate": 4.961931535761046e-06, + "loss": 0.4278, + "step": 1444 + }, + { + "epoch": 0.3573194856577646, + "grad_norm": 0.8255801822657475, + "learning_rate": 4.961875009664313e-06, + "loss": 0.4637, + "step": 1445 + }, + { + "epoch": 0.35756676557863504, + "grad_norm": 0.8435723559389733, + "learning_rate": 4.9618184419546705e-06, + "loss": 0.4711, + "step": 1446 + }, + { + "epoch": 0.35781404549950546, + "grad_norm": 0.8287755155885044, + "learning_rate": 4.961761832633073e-06, + "loss": 0.4893, + "step": 1447 + }, + { + "epoch": 0.3580613254203759, + "grad_norm": 0.8113313485945742, + "learning_rate": 4.961705181700479e-06, + "loss": 0.4758, + "step": 1448 + }, + { + "epoch": 0.3583086053412463, + "grad_norm": 0.8541198849431507, + "learning_rate": 4.9616484891578455e-06, + "loss": 0.447, + "step": 1449 + }, + { + "epoch": 0.35855588526211674, + "grad_norm": 0.8277973505342479, + "learning_rate": 4.96159175500613e-06, + "loss": 0.4556, + "step": 1450 + }, + { + "epoch": 0.35880316518298716, + "grad_norm": 0.8217363815126292, + "learning_rate": 4.9615349792462916e-06, + "loss": 0.4682, + "step": 1451 + }, + { + "epoch": 0.3590504451038576, + "grad_norm": 0.8235924475409444, + "learning_rate": 4.961478161879291e-06, + "loss": 0.4577, + "step": 1452 + }, + { + "epoch": 0.359297725024728, + "grad_norm": 0.8366896026328494, + "learning_rate": 4.961421302906087e-06, + "loss": 0.4536, + "step": 1453 + }, + { + "epoch": 0.35954500494559843, + "grad_norm": 0.8480005034889219, + "learning_rate": 4.961364402327643e-06, + "loss": 0.467, + "step": 1454 + }, + { + "epoch": 0.35979228486646886, + "grad_norm": 0.8056929237512162, + "learning_rate": 4.961307460144919e-06, + "loss": 0.4733, + "step": 1455 + }, + { + "epoch": 0.3600395647873393, + "grad_norm": 0.8221098249663793, + "learning_rate": 4.9612504763588774e-06, + "loss": 0.4598, + "step": 1456 + }, + { + "epoch": 0.3602868447082097, + "grad_norm": 0.8347778715831121, + "learning_rate": 4.961193450970483e-06, + "loss": 0.4568, + "step": 1457 + }, + { + "epoch": 0.36053412462908013, + "grad_norm": 0.8772150797089943, + "learning_rate": 4.961136383980697e-06, + "loss": 0.4588, + "step": 1458 + }, + { + "epoch": 0.36078140454995056, + "grad_norm": 0.8530561104086893, + "learning_rate": 4.9610792753904866e-06, + "loss": 0.4616, + "step": 1459 + }, + { + "epoch": 0.361028684470821, + "grad_norm": 0.8823335046697693, + "learning_rate": 4.961022125200816e-06, + "loss": 0.4699, + "step": 1460 + }, + { + "epoch": 0.3612759643916914, + "grad_norm": 0.8485109471759061, + "learning_rate": 4.960964933412652e-06, + "loss": 0.4378, + "step": 1461 + }, + { + "epoch": 0.36152324431256183, + "grad_norm": 0.8898917632495861, + "learning_rate": 4.96090770002696e-06, + "loss": 0.4654, + "step": 1462 + }, + { + "epoch": 0.36177052423343226, + "grad_norm": 0.8470917053499859, + "learning_rate": 4.9608504250447075e-06, + "loss": 0.4553, + "step": 1463 + }, + { + "epoch": 0.3620178041543027, + "grad_norm": 0.8348048460177506, + "learning_rate": 4.960793108466863e-06, + "loss": 0.4423, + "step": 1464 + }, + { + "epoch": 0.3622650840751731, + "grad_norm": 0.8837887760254436, + "learning_rate": 4.960735750294397e-06, + "loss": 0.437, + "step": 1465 + }, + { + "epoch": 0.36251236399604353, + "grad_norm": 0.8521679859494443, + "learning_rate": 4.960678350528277e-06, + "loss": 0.4577, + "step": 1466 + }, + { + "epoch": 0.36275964391691395, + "grad_norm": 0.7953899887086049, + "learning_rate": 4.9606209091694734e-06, + "loss": 0.4822, + "step": 1467 + }, + { + "epoch": 0.3630069238377844, + "grad_norm": 0.7982731717026136, + "learning_rate": 4.960563426218957e-06, + "loss": 0.4584, + "step": 1468 + }, + { + "epoch": 0.3632542037586548, + "grad_norm": 0.9145430537767979, + "learning_rate": 4.960505901677701e-06, + "loss": 0.5103, + "step": 1469 + }, + { + "epoch": 0.36350148367952523, + "grad_norm": 0.8606249759635389, + "learning_rate": 4.9604483355466756e-06, + "loss": 0.4532, + "step": 1470 + }, + { + "epoch": 0.36374876360039565, + "grad_norm": 0.8640284738109312, + "learning_rate": 4.960390727826856e-06, + "loss": 0.4562, + "step": 1471 + }, + { + "epoch": 0.3639960435212661, + "grad_norm": 0.8927421670919629, + "learning_rate": 4.960333078519214e-06, + "loss": 0.4427, + "step": 1472 + }, + { + "epoch": 0.3642433234421365, + "grad_norm": 0.8209190272202821, + "learning_rate": 4.9602753876247244e-06, + "loss": 0.4658, + "step": 1473 + }, + { + "epoch": 0.3644906033630069, + "grad_norm": 0.8506090980497512, + "learning_rate": 4.960217655144364e-06, + "loss": 0.4903, + "step": 1474 + }, + { + "epoch": 0.36473788328387735, + "grad_norm": 0.8304617275337773, + "learning_rate": 4.960159881079106e-06, + "loss": 0.4371, + "step": 1475 + }, + { + "epoch": 0.3649851632047478, + "grad_norm": 0.8074225851216803, + "learning_rate": 4.960102065429929e-06, + "loss": 0.4615, + "step": 1476 + }, + { + "epoch": 0.3652324431256182, + "grad_norm": 0.8003088453641801, + "learning_rate": 4.96004420819781e-06, + "loss": 0.4602, + "step": 1477 + }, + { + "epoch": 0.3654797230464886, + "grad_norm": 0.8470939389079107, + "learning_rate": 4.959986309383726e-06, + "loss": 0.4713, + "step": 1478 + }, + { + "epoch": 0.36572700296735905, + "grad_norm": 0.830521949089574, + "learning_rate": 4.959928368988657e-06, + "loss": 0.4811, + "step": 1479 + }, + { + "epoch": 0.3659742828882295, + "grad_norm": 0.8745328214858118, + "learning_rate": 4.959870387013581e-06, + "loss": 0.4557, + "step": 1480 + }, + { + "epoch": 0.3662215628090999, + "grad_norm": 0.847094148012273, + "learning_rate": 4.959812363459479e-06, + "loss": 0.4655, + "step": 1481 + }, + { + "epoch": 0.3664688427299703, + "grad_norm": 0.8491449367553962, + "learning_rate": 4.959754298327332e-06, + "loss": 0.4597, + "step": 1482 + }, + { + "epoch": 0.36671612265084075, + "grad_norm": 0.8229085286999269, + "learning_rate": 4.959696191618119e-06, + "loss": 0.4854, + "step": 1483 + }, + { + "epoch": 0.3669634025717112, + "grad_norm": 0.8702802741898958, + "learning_rate": 4.959638043332826e-06, + "loss": 0.455, + "step": 1484 + }, + { + "epoch": 0.3672106824925816, + "grad_norm": 0.8459937215244333, + "learning_rate": 4.959579853472434e-06, + "loss": 0.4758, + "step": 1485 + }, + { + "epoch": 0.367457962413452, + "grad_norm": 0.8881342218230955, + "learning_rate": 4.959521622037925e-06, + "loss": 0.468, + "step": 1486 + }, + { + "epoch": 0.36770524233432245, + "grad_norm": 0.7847108194328761, + "learning_rate": 4.959463349030285e-06, + "loss": 0.4896, + "step": 1487 + }, + { + "epoch": 0.36795252225519287, + "grad_norm": 0.8417601497534091, + "learning_rate": 4.959405034450501e-06, + "loss": 0.4686, + "step": 1488 + }, + { + "epoch": 0.3681998021760633, + "grad_norm": 0.8122499845921317, + "learning_rate": 4.959346678299555e-06, + "loss": 0.4692, + "step": 1489 + }, + { + "epoch": 0.3684470820969337, + "grad_norm": 0.8580283394958202, + "learning_rate": 4.9592882805784345e-06, + "loss": 0.4742, + "step": 1490 + }, + { + "epoch": 0.36869436201780414, + "grad_norm": 0.8428257373547411, + "learning_rate": 4.959229841288128e-06, + "loss": 0.4675, + "step": 1491 + }, + { + "epoch": 0.36894164193867457, + "grad_norm": 0.8259183229805209, + "learning_rate": 4.959171360429621e-06, + "loss": 0.4404, + "step": 1492 + }, + { + "epoch": 0.369188921859545, + "grad_norm": 0.8126446310683214, + "learning_rate": 4.959112838003905e-06, + "loss": 0.4687, + "step": 1493 + }, + { + "epoch": 0.3694362017804154, + "grad_norm": 0.7830986243714438, + "learning_rate": 4.959054274011966e-06, + "loss": 0.4625, + "step": 1494 + }, + { + "epoch": 0.36968348170128584, + "grad_norm": 0.8237721672583825, + "learning_rate": 4.958995668454796e-06, + "loss": 0.4679, + "step": 1495 + }, + { + "epoch": 0.36993076162215627, + "grad_norm": 0.842776685828888, + "learning_rate": 4.958937021333384e-06, + "loss": 0.46, + "step": 1496 + }, + { + "epoch": 0.3701780415430267, + "grad_norm": 0.8364831387221224, + "learning_rate": 4.958878332648724e-06, + "loss": 0.4899, + "step": 1497 + }, + { + "epoch": 0.3704253214638971, + "grad_norm": 0.7940034967601184, + "learning_rate": 4.958819602401806e-06, + "loss": 0.4595, + "step": 1498 + }, + { + "epoch": 0.37067260138476754, + "grad_norm": 0.8153152604939218, + "learning_rate": 4.958760830593621e-06, + "loss": 0.4808, + "step": 1499 + }, + { + "epoch": 0.37091988130563797, + "grad_norm": 0.8603126838223264, + "learning_rate": 4.958702017225166e-06, + "loss": 0.4643, + "step": 1500 + }, + { + "epoch": 0.3711671612265084, + "grad_norm": 0.832304898978481, + "learning_rate": 4.958643162297434e-06, + "loss": 0.4609, + "step": 1501 + }, + { + "epoch": 0.3714144411473788, + "grad_norm": 0.8437671915588434, + "learning_rate": 4.958584265811419e-06, + "loss": 0.4607, + "step": 1502 + }, + { + "epoch": 0.37166172106824924, + "grad_norm": 0.8270741301993446, + "learning_rate": 4.958525327768117e-06, + "loss": 0.4866, + "step": 1503 + }, + { + "epoch": 0.37190900098911966, + "grad_norm": 0.8458032983321746, + "learning_rate": 4.9584663481685235e-06, + "loss": 0.4842, + "step": 1504 + }, + { + "epoch": 0.3721562809099901, + "grad_norm": 0.8240486908432811, + "learning_rate": 4.958407327013637e-06, + "loss": 0.4836, + "step": 1505 + }, + { + "epoch": 0.3724035608308605, + "grad_norm": 0.8103231743244376, + "learning_rate": 4.9583482643044535e-06, + "loss": 0.4607, + "step": 1506 + }, + { + "epoch": 0.37265084075173094, + "grad_norm": 0.8309144783040975, + "learning_rate": 4.9582891600419714e-06, + "loss": 0.4767, + "step": 1507 + }, + { + "epoch": 0.37289812067260136, + "grad_norm": 0.812854985431406, + "learning_rate": 4.958230014227191e-06, + "loss": 0.5015, + "step": 1508 + }, + { + "epoch": 0.3731454005934718, + "grad_norm": 0.8389457059849967, + "learning_rate": 4.9581708268611116e-06, + "loss": 0.4895, + "step": 1509 + }, + { + "epoch": 0.3733926805143422, + "grad_norm": 0.8324431688711037, + "learning_rate": 4.958111597944734e-06, + "loss": 0.4802, + "step": 1510 + }, + { + "epoch": 0.37363996043521264, + "grad_norm": 0.8858341769964009, + "learning_rate": 4.9580523274790585e-06, + "loss": 0.4677, + "step": 1511 + }, + { + "epoch": 0.37388724035608306, + "grad_norm": 0.8346574872673881, + "learning_rate": 4.957993015465086e-06, + "loss": 0.4434, + "step": 1512 + }, + { + "epoch": 0.3741345202769535, + "grad_norm": 0.8159836769145766, + "learning_rate": 4.957933661903822e-06, + "loss": 0.49, + "step": 1513 + }, + { + "epoch": 0.3743818001978239, + "grad_norm": 0.903045314748104, + "learning_rate": 4.957874266796267e-06, + "loss": 0.4861, + "step": 1514 + }, + { + "epoch": 0.37462908011869434, + "grad_norm": 0.8234870808009122, + "learning_rate": 4.9578148301434255e-06, + "loss": 0.4452, + "step": 1515 + }, + { + "epoch": 0.37487636003956476, + "grad_norm": 0.8153560688366689, + "learning_rate": 4.957755351946303e-06, + "loss": 0.4734, + "step": 1516 + }, + { + "epoch": 0.37512363996043524, + "grad_norm": 0.8400329349725666, + "learning_rate": 4.957695832205905e-06, + "loss": 0.497, + "step": 1517 + }, + { + "epoch": 0.37537091988130566, + "grad_norm": 0.8358924451402449, + "learning_rate": 4.957636270923237e-06, + "loss": 0.4562, + "step": 1518 + }, + { + "epoch": 0.3756181998021761, + "grad_norm": 0.8432421412883345, + "learning_rate": 4.9575766680993056e-06, + "loss": 0.4428, + "step": 1519 + }, + { + "epoch": 0.3758654797230465, + "grad_norm": 0.8160910599610325, + "learning_rate": 4.957517023735119e-06, + "loss": 0.4627, + "step": 1520 + }, + { + "epoch": 0.37611275964391694, + "grad_norm": 0.8711532573510746, + "learning_rate": 4.957457337831684e-06, + "loss": 0.4717, + "step": 1521 + }, + { + "epoch": 0.37636003956478736, + "grad_norm": 0.8242546795865497, + "learning_rate": 4.95739761039001e-06, + "loss": 0.4819, + "step": 1522 + }, + { + "epoch": 0.3766073194856578, + "grad_norm": 0.8613610896652953, + "learning_rate": 4.957337841411107e-06, + "loss": 0.446, + "step": 1523 + }, + { + "epoch": 0.3768545994065282, + "grad_norm": 0.864261076328205, + "learning_rate": 4.9572780308959865e-06, + "loss": 0.4698, + "step": 1524 + }, + { + "epoch": 0.37710187932739864, + "grad_norm": 0.8614226944088311, + "learning_rate": 4.957218178845657e-06, + "loss": 0.4808, + "step": 1525 + }, + { + "epoch": 0.37734915924826906, + "grad_norm": 0.8224765428766275, + "learning_rate": 4.957158285261131e-06, + "loss": 0.4403, + "step": 1526 + }, + { + "epoch": 0.3775964391691395, + "grad_norm": 0.8181585369424585, + "learning_rate": 4.957098350143422e-06, + "loss": 0.4766, + "step": 1527 + }, + { + "epoch": 0.3778437190900099, + "grad_norm": 0.8096774878681889, + "learning_rate": 4.957038373493541e-06, + "loss": 0.46, + "step": 1528 + }, + { + "epoch": 0.37809099901088034, + "grad_norm": 0.8490900193792322, + "learning_rate": 4.956978355312505e-06, + "loss": 0.4781, + "step": 1529 + }, + { + "epoch": 0.37833827893175076, + "grad_norm": 0.8322232014452549, + "learning_rate": 4.956918295601325e-06, + "loss": 0.5009, + "step": 1530 + }, + { + "epoch": 0.3785855588526212, + "grad_norm": 0.7735197922262507, + "learning_rate": 4.956858194361018e-06, + "loss": 0.478, + "step": 1531 + }, + { + "epoch": 0.3788328387734916, + "grad_norm": 0.8585921318091921, + "learning_rate": 4.9567980515926e-06, + "loss": 0.4856, + "step": 1532 + }, + { + "epoch": 0.37908011869436203, + "grad_norm": 0.8446307436491305, + "learning_rate": 4.956737867297086e-06, + "loss": 0.4472, + "step": 1533 + }, + { + "epoch": 0.37932739861523246, + "grad_norm": 0.8357409701620357, + "learning_rate": 4.9566776414754955e-06, + "loss": 0.4704, + "step": 1534 + }, + { + "epoch": 0.3795746785361029, + "grad_norm": 0.883193450189684, + "learning_rate": 4.9566173741288445e-06, + "loss": 0.4445, + "step": 1535 + }, + { + "epoch": 0.3798219584569733, + "grad_norm": 0.8362706611107679, + "learning_rate": 4.956557065258154e-06, + "loss": 0.4763, + "step": 1536 + }, + { + "epoch": 0.38006923837784373, + "grad_norm": 0.8445029539445396, + "learning_rate": 4.956496714864442e-06, + "loss": 0.4363, + "step": 1537 + }, + { + "epoch": 0.38031651829871416, + "grad_norm": 0.9005044139041423, + "learning_rate": 4.956436322948728e-06, + "loss": 0.4257, + "step": 1538 + }, + { + "epoch": 0.3805637982195846, + "grad_norm": 0.803480549136306, + "learning_rate": 4.956375889512033e-06, + "loss": 0.4643, + "step": 1539 + }, + { + "epoch": 0.380811078140455, + "grad_norm": 0.8944145630210211, + "learning_rate": 4.95631541455538e-06, + "loss": 0.4649, + "step": 1540 + }, + { + "epoch": 0.38105835806132543, + "grad_norm": 0.8431926814490958, + "learning_rate": 4.956254898079789e-06, + "loss": 0.4634, + "step": 1541 + }, + { + "epoch": 0.38130563798219586, + "grad_norm": 0.8623971408995822, + "learning_rate": 4.956194340086284e-06, + "loss": 0.473, + "step": 1542 + }, + { + "epoch": 0.3815529179030663, + "grad_norm": 0.855497209762524, + "learning_rate": 4.956133740575889e-06, + "loss": 0.4384, + "step": 1543 + }, + { + "epoch": 0.3818001978239367, + "grad_norm": 0.8154439659482268, + "learning_rate": 4.9560730995496285e-06, + "loss": 0.4714, + "step": 1544 + }, + { + "epoch": 0.38204747774480713, + "grad_norm": 0.8117948295753515, + "learning_rate": 4.956012417008526e-06, + "loss": 0.4573, + "step": 1545 + }, + { + "epoch": 0.38229475766567755, + "grad_norm": 0.8411563068995113, + "learning_rate": 4.95595169295361e-06, + "loss": 0.447, + "step": 1546 + }, + { + "epoch": 0.382542037586548, + "grad_norm": 0.840886345860403, + "learning_rate": 4.955890927385903e-06, + "loss": 0.4373, + "step": 1547 + }, + { + "epoch": 0.3827893175074184, + "grad_norm": 0.8749065735486805, + "learning_rate": 4.955830120306436e-06, + "loss": 0.466, + "step": 1548 + }, + { + "epoch": 0.3830365974282888, + "grad_norm": 0.8266971574028511, + "learning_rate": 4.955769271716234e-06, + "loss": 0.4524, + "step": 1549 + }, + { + "epoch": 0.38328387734915925, + "grad_norm": 0.7856800140279594, + "learning_rate": 4.955708381616327e-06, + "loss": 0.4661, + "step": 1550 + }, + { + "epoch": 0.3835311572700297, + "grad_norm": 0.8227838319773383, + "learning_rate": 4.955647450007743e-06, + "loss": 0.4728, + "step": 1551 + }, + { + "epoch": 0.3837784371909001, + "grad_norm": 0.855400428650959, + "learning_rate": 4.955586476891514e-06, + "loss": 0.4523, + "step": 1552 + }, + { + "epoch": 0.3840257171117705, + "grad_norm": 0.854109040398388, + "learning_rate": 4.955525462268669e-06, + "loss": 0.4733, + "step": 1553 + }, + { + "epoch": 0.38427299703264095, + "grad_norm": 0.8476868409597142, + "learning_rate": 4.955464406140239e-06, + "loss": 0.4342, + "step": 1554 + }, + { + "epoch": 0.3845202769535114, + "grad_norm": 0.8221087986524699, + "learning_rate": 4.955403308507257e-06, + "loss": 0.4741, + "step": 1555 + }, + { + "epoch": 0.3847675568743818, + "grad_norm": 0.8985974432339402, + "learning_rate": 4.955342169370755e-06, + "loss": 0.4764, + "step": 1556 + }, + { + "epoch": 0.3850148367952522, + "grad_norm": 0.840966394271443, + "learning_rate": 4.955280988731768e-06, + "loss": 0.4677, + "step": 1557 + }, + { + "epoch": 0.38526211671612265, + "grad_norm": 0.855372122677701, + "learning_rate": 4.9552197665913284e-06, + "loss": 0.4412, + "step": 1558 + }, + { + "epoch": 0.3855093966369931, + "grad_norm": 0.8491148581437344, + "learning_rate": 4.955158502950471e-06, + "loss": 0.4804, + "step": 1559 + }, + { + "epoch": 0.3857566765578635, + "grad_norm": 0.8354735780843734, + "learning_rate": 4.955097197810233e-06, + "loss": 0.461, + "step": 1560 + }, + { + "epoch": 0.3860039564787339, + "grad_norm": 0.8421198371822611, + "learning_rate": 4.955035851171648e-06, + "loss": 0.4553, + "step": 1561 + }, + { + "epoch": 0.38625123639960435, + "grad_norm": 0.8561721132938402, + "learning_rate": 4.954974463035756e-06, + "loss": 0.4647, + "step": 1562 + }, + { + "epoch": 0.38649851632047477, + "grad_norm": 0.8326602684541324, + "learning_rate": 4.9549130334035925e-06, + "loss": 0.4217, + "step": 1563 + }, + { + "epoch": 0.3867457962413452, + "grad_norm": 0.8318953344187765, + "learning_rate": 4.954851562276196e-06, + "loss": 0.476, + "step": 1564 + }, + { + "epoch": 0.3869930761622156, + "grad_norm": 0.8104714755106442, + "learning_rate": 4.954790049654608e-06, + "loss": 0.4516, + "step": 1565 + }, + { + "epoch": 0.38724035608308605, + "grad_norm": 0.8723051321964577, + "learning_rate": 4.954728495539865e-06, + "loss": 0.483, + "step": 1566 + }, + { + "epoch": 0.38748763600395647, + "grad_norm": 0.8735820265601282, + "learning_rate": 4.954666899933008e-06, + "loss": 0.5133, + "step": 1567 + }, + { + "epoch": 0.3877349159248269, + "grad_norm": 0.8858755298043255, + "learning_rate": 4.954605262835079e-06, + "loss": 0.4557, + "step": 1568 + }, + { + "epoch": 0.3879821958456973, + "grad_norm": 0.875902503739144, + "learning_rate": 4.954543584247121e-06, + "loss": 0.4242, + "step": 1569 + }, + { + "epoch": 0.38822947576656774, + "grad_norm": 0.7924017320572292, + "learning_rate": 4.954481864170175e-06, + "loss": 0.4822, + "step": 1570 + }, + { + "epoch": 0.38847675568743817, + "grad_norm": 0.8362697886438909, + "learning_rate": 4.9544201026052845e-06, + "loss": 0.4602, + "step": 1571 + }, + { + "epoch": 0.3887240356083086, + "grad_norm": 0.8916382978285358, + "learning_rate": 4.954358299553492e-06, + "loss": 0.4405, + "step": 1572 + }, + { + "epoch": 0.388971315529179, + "grad_norm": 0.8634107397727967, + "learning_rate": 4.954296455015846e-06, + "loss": 0.456, + "step": 1573 + }, + { + "epoch": 0.38921859545004944, + "grad_norm": 0.928281717353563, + "learning_rate": 4.9542345689933875e-06, + "loss": 0.466, + "step": 1574 + }, + { + "epoch": 0.38946587537091987, + "grad_norm": 0.9131256535394252, + "learning_rate": 4.954172641487165e-06, + "loss": 0.4358, + "step": 1575 + }, + { + "epoch": 0.3897131552917903, + "grad_norm": 0.8553374784847576, + "learning_rate": 4.954110672498226e-06, + "loss": 0.447, + "step": 1576 + }, + { + "epoch": 0.3899604352126607, + "grad_norm": 0.8617713943665484, + "learning_rate": 4.954048662027615e-06, + "loss": 0.454, + "step": 1577 + }, + { + "epoch": 0.39020771513353114, + "grad_norm": 0.8378470097296922, + "learning_rate": 4.953986610076383e-06, + "loss": 0.457, + "step": 1578 + }, + { + "epoch": 0.39045499505440157, + "grad_norm": 0.890341315715719, + "learning_rate": 4.953924516645578e-06, + "loss": 0.4542, + "step": 1579 + }, + { + "epoch": 0.390702274975272, + "grad_norm": 0.867350939044516, + "learning_rate": 4.953862381736249e-06, + "loss": 0.4343, + "step": 1580 + }, + { + "epoch": 0.3909495548961424, + "grad_norm": 0.8877957331446623, + "learning_rate": 4.953800205349446e-06, + "loss": 0.488, + "step": 1581 + }, + { + "epoch": 0.39119683481701284, + "grad_norm": 0.8807877558443802, + "learning_rate": 4.953737987486221e-06, + "loss": 0.4735, + "step": 1582 + }, + { + "epoch": 0.39144411473788326, + "grad_norm": 0.8782111156976654, + "learning_rate": 4.953675728147625e-06, + "loss": 0.448, + "step": 1583 + }, + { + "epoch": 0.3916913946587537, + "grad_norm": 0.8500467289046308, + "learning_rate": 4.953613427334711e-06, + "loss": 0.4617, + "step": 1584 + }, + { + "epoch": 0.3919386745796241, + "grad_norm": 0.8409368715056174, + "learning_rate": 4.953551085048531e-06, + "loss": 0.4684, + "step": 1585 + }, + { + "epoch": 0.39218595450049454, + "grad_norm": 0.8522470314278595, + "learning_rate": 4.95348870129014e-06, + "loss": 0.4592, + "step": 1586 + }, + { + "epoch": 0.39243323442136496, + "grad_norm": 0.8168922995846802, + "learning_rate": 4.953426276060592e-06, + "loss": 0.4258, + "step": 1587 + }, + { + "epoch": 0.3926805143422354, + "grad_norm": 0.8256035919532246, + "learning_rate": 4.953363809360942e-06, + "loss": 0.4665, + "step": 1588 + }, + { + "epoch": 0.3929277942631058, + "grad_norm": 0.8541007709621282, + "learning_rate": 4.953301301192246e-06, + "loss": 0.4363, + "step": 1589 + }, + { + "epoch": 0.39317507418397624, + "grad_norm": 0.8882561921472727, + "learning_rate": 4.95323875155556e-06, + "loss": 0.4635, + "step": 1590 + }, + { + "epoch": 0.39342235410484666, + "grad_norm": 0.8501720096698456, + "learning_rate": 4.953176160451942e-06, + "loss": 0.4653, + "step": 1591 + }, + { + "epoch": 0.3936696340257171, + "grad_norm": 0.8721229757098223, + "learning_rate": 4.95311352788245e-06, + "loss": 0.4806, + "step": 1592 + }, + { + "epoch": 0.3939169139465875, + "grad_norm": 0.8979202349196487, + "learning_rate": 4.953050853848143e-06, + "loss": 0.454, + "step": 1593 + }, + { + "epoch": 0.39416419386745793, + "grad_norm": 0.8760747889780947, + "learning_rate": 4.9529881383500785e-06, + "loss": 0.4763, + "step": 1594 + }, + { + "epoch": 0.3944114737883284, + "grad_norm": 0.8452437439381595, + "learning_rate": 4.9529253813893185e-06, + "loss": 0.4282, + "step": 1595 + }, + { + "epoch": 0.39465875370919884, + "grad_norm": 0.8396178678462056, + "learning_rate": 4.952862582966923e-06, + "loss": 0.4531, + "step": 1596 + }, + { + "epoch": 0.39490603363006926, + "grad_norm": 0.8401292338194442, + "learning_rate": 4.9527997430839535e-06, + "loss": 0.4677, + "step": 1597 + }, + { + "epoch": 0.3951533135509397, + "grad_norm": 0.8078584240478519, + "learning_rate": 4.952736861741473e-06, + "loss": 0.4612, + "step": 1598 + }, + { + "epoch": 0.3954005934718101, + "grad_norm": 0.8580871690645315, + "learning_rate": 4.952673938940543e-06, + "loss": 0.4561, + "step": 1599 + }, + { + "epoch": 0.39564787339268054, + "grad_norm": 0.8309114317557393, + "learning_rate": 4.952610974682228e-06, + "loss": 0.4587, + "step": 1600 + }, + { + "epoch": 0.39589515331355096, + "grad_norm": 0.8018195354020252, + "learning_rate": 4.952547968967592e-06, + "loss": 0.4764, + "step": 1601 + }, + { + "epoch": 0.3961424332344214, + "grad_norm": 0.8227914040250693, + "learning_rate": 4.9524849217977e-06, + "loss": 0.5004, + "step": 1602 + }, + { + "epoch": 0.3963897131552918, + "grad_norm": 0.9344301223745453, + "learning_rate": 4.952421833173618e-06, + "loss": 0.4284, + "step": 1603 + }, + { + "epoch": 0.39663699307616224, + "grad_norm": 0.8125773670413535, + "learning_rate": 4.952358703096412e-06, + "loss": 0.4878, + "step": 1604 + }, + { + "epoch": 0.39688427299703266, + "grad_norm": 0.8794216357490461, + "learning_rate": 4.952295531567149e-06, + "loss": 0.4417, + "step": 1605 + }, + { + "epoch": 0.3971315529179031, + "grad_norm": 0.8785534278955001, + "learning_rate": 4.952232318586897e-06, + "loss": 0.4572, + "step": 1606 + }, + { + "epoch": 0.3973788328387735, + "grad_norm": 0.8732103433976532, + "learning_rate": 4.952169064156724e-06, + "loss": 0.4628, + "step": 1607 + }, + { + "epoch": 0.39762611275964393, + "grad_norm": 0.8388512265824324, + "learning_rate": 4.952105768277701e-06, + "loss": 0.4819, + "step": 1608 + }, + { + "epoch": 0.39787339268051436, + "grad_norm": 0.8531960312588226, + "learning_rate": 4.9520424309508954e-06, + "loss": 0.4596, + "step": 1609 + }, + { + "epoch": 0.3981206726013848, + "grad_norm": 0.8990481924298742, + "learning_rate": 4.951979052177379e-06, + "loss": 0.4679, + "step": 1610 + }, + { + "epoch": 0.3983679525222552, + "grad_norm": 0.9241672808577421, + "learning_rate": 4.9519156319582226e-06, + "loss": 0.4409, + "step": 1611 + }, + { + "epoch": 0.39861523244312563, + "grad_norm": 0.8075821047905285, + "learning_rate": 4.9518521702945e-06, + "loss": 0.4304, + "step": 1612 + }, + { + "epoch": 0.39886251236399606, + "grad_norm": 0.8439405328812108, + "learning_rate": 4.951788667187281e-06, + "loss": 0.4412, + "step": 1613 + }, + { + "epoch": 0.3991097922848665, + "grad_norm": 0.8936493881653371, + "learning_rate": 4.95172512263764e-06, + "loss": 0.446, + "step": 1614 + }, + { + "epoch": 0.3993570722057369, + "grad_norm": 0.8718520805400518, + "learning_rate": 4.9516615366466535e-06, + "loss": 0.4565, + "step": 1615 + }, + { + "epoch": 0.39960435212660733, + "grad_norm": 0.8336286918355128, + "learning_rate": 4.951597909215393e-06, + "loss": 0.4722, + "step": 1616 + }, + { + "epoch": 0.39985163204747776, + "grad_norm": 0.8757209786566342, + "learning_rate": 4.951534240344936e-06, + "loss": 0.4569, + "step": 1617 + }, + { + "epoch": 0.4000989119683482, + "grad_norm": 0.8705182757213266, + "learning_rate": 4.951470530036358e-06, + "loss": 0.4605, + "step": 1618 + }, + { + "epoch": 0.4003461918892186, + "grad_norm": 0.8640519203759794, + "learning_rate": 4.951406778290735e-06, + "loss": 0.4554, + "step": 1619 + }, + { + "epoch": 0.40059347181008903, + "grad_norm": 0.8362305667086866, + "learning_rate": 4.951342985109147e-06, + "loss": 0.4673, + "step": 1620 + }, + { + "epoch": 0.40084075173095945, + "grad_norm": 0.855065016510783, + "learning_rate": 4.951279150492669e-06, + "loss": 0.4581, + "step": 1621 + }, + { + "epoch": 0.4010880316518299, + "grad_norm": 0.9169850417037374, + "learning_rate": 4.9512152744423836e-06, + "loss": 0.4765, + "step": 1622 + }, + { + "epoch": 0.4013353115727003, + "grad_norm": 0.8578287913225426, + "learning_rate": 4.951151356959368e-06, + "loss": 0.4479, + "step": 1623 + }, + { + "epoch": 0.40158259149357073, + "grad_norm": 0.9055998939207615, + "learning_rate": 4.951087398044702e-06, + "loss": 0.4566, + "step": 1624 + }, + { + "epoch": 0.40182987141444115, + "grad_norm": 0.8854013010161614, + "learning_rate": 4.951023397699469e-06, + "loss": 0.4654, + "step": 1625 + }, + { + "epoch": 0.4020771513353116, + "grad_norm": 0.8676151411605866, + "learning_rate": 4.9509593559247505e-06, + "loss": 0.4685, + "step": 1626 + }, + { + "epoch": 0.402324431256182, + "grad_norm": 0.8113407989272491, + "learning_rate": 4.950895272721627e-06, + "loss": 0.4685, + "step": 1627 + }, + { + "epoch": 0.4025717111770524, + "grad_norm": 0.8210151970043335, + "learning_rate": 4.950831148091184e-06, + "loss": 0.4517, + "step": 1628 + }, + { + "epoch": 0.40281899109792285, + "grad_norm": 0.8853716567053723, + "learning_rate": 4.950766982034504e-06, + "loss": 0.4227, + "step": 1629 + }, + { + "epoch": 0.4030662710187933, + "grad_norm": 0.9285034946776956, + "learning_rate": 4.950702774552671e-06, + "loss": 0.4095, + "step": 1630 + }, + { + "epoch": 0.4033135509396637, + "grad_norm": 0.8081586648223669, + "learning_rate": 4.950638525646773e-06, + "loss": 0.4483, + "step": 1631 + }, + { + "epoch": 0.4035608308605341, + "grad_norm": 0.814407403090029, + "learning_rate": 4.9505742353178935e-06, + "loss": 0.4395, + "step": 1632 + }, + { + "epoch": 0.40380811078140455, + "grad_norm": 0.9318384755859314, + "learning_rate": 4.9505099035671185e-06, + "loss": 0.4528, + "step": 1633 + }, + { + "epoch": 0.404055390702275, + "grad_norm": 0.8556784555850027, + "learning_rate": 4.950445530395539e-06, + "loss": 0.4551, + "step": 1634 + }, + { + "epoch": 0.4043026706231454, + "grad_norm": 0.8236028720202845, + "learning_rate": 4.9503811158042394e-06, + "loss": 0.4969, + "step": 1635 + }, + { + "epoch": 0.4045499505440158, + "grad_norm": 0.7955100118692611, + "learning_rate": 4.9503166597943105e-06, + "loss": 0.4911, + "step": 1636 + }, + { + "epoch": 0.40479723046488625, + "grad_norm": 0.8321762635331581, + "learning_rate": 4.950252162366841e-06, + "loss": 0.4743, + "step": 1637 + }, + { + "epoch": 0.4050445103857567, + "grad_norm": 0.892782367227542, + "learning_rate": 4.950187623522922e-06, + "loss": 0.4635, + "step": 1638 + }, + { + "epoch": 0.4052917903066271, + "grad_norm": 0.8290384221114429, + "learning_rate": 4.950123043263644e-06, + "loss": 0.4701, + "step": 1639 + }, + { + "epoch": 0.4055390702274975, + "grad_norm": 0.8635462718575356, + "learning_rate": 4.9500584215900975e-06, + "loss": 0.4315, + "step": 1640 + }, + { + "epoch": 0.40578635014836795, + "grad_norm": 0.8303892565178683, + "learning_rate": 4.949993758503376e-06, + "loss": 0.4925, + "step": 1641 + }, + { + "epoch": 0.40603363006923837, + "grad_norm": 0.840727633311021, + "learning_rate": 4.949929054004572e-06, + "loss": 0.4629, + "step": 1642 + }, + { + "epoch": 0.4062809099901088, + "grad_norm": 0.8634706522730431, + "learning_rate": 4.949864308094779e-06, + "loss": 0.4796, + "step": 1643 + }, + { + "epoch": 0.4065281899109792, + "grad_norm": 0.8361394254501443, + "learning_rate": 4.949799520775092e-06, + "loss": 0.4568, + "step": 1644 + }, + { + "epoch": 0.40677546983184965, + "grad_norm": 0.8139921355363459, + "learning_rate": 4.9497346920466074e-06, + "loss": 0.4557, + "step": 1645 + }, + { + "epoch": 0.40702274975272007, + "grad_norm": 0.812250032386861, + "learning_rate": 4.949669821910418e-06, + "loss": 0.4566, + "step": 1646 + }, + { + "epoch": 0.4072700296735905, + "grad_norm": 0.828761594585985, + "learning_rate": 4.949604910367623e-06, + "loss": 0.4862, + "step": 1647 + }, + { + "epoch": 0.4075173095944609, + "grad_norm": 0.8177756231917882, + "learning_rate": 4.949539957419317e-06, + "loss": 0.4586, + "step": 1648 + }, + { + "epoch": 0.40776458951533134, + "grad_norm": 0.8214768220432287, + "learning_rate": 4.949474963066599e-06, + "loss": 0.449, + "step": 1649 + }, + { + "epoch": 0.40801186943620177, + "grad_norm": 0.8194371482269276, + "learning_rate": 4.9494099273105686e-06, + "loss": 0.457, + "step": 1650 + }, + { + "epoch": 0.4082591493570722, + "grad_norm": 0.8244018158911955, + "learning_rate": 4.9493448501523245e-06, + "loss": 0.4345, + "step": 1651 + }, + { + "epoch": 0.4085064292779426, + "grad_norm": 0.8009139765863647, + "learning_rate": 4.949279731592967e-06, + "loss": 0.4791, + "step": 1652 + }, + { + "epoch": 0.40875370919881304, + "grad_norm": 0.8116159409968832, + "learning_rate": 4.949214571633595e-06, + "loss": 0.4369, + "step": 1653 + }, + { + "epoch": 0.40900098911968347, + "grad_norm": 0.8562285011172734, + "learning_rate": 4.949149370275311e-06, + "loss": 0.4557, + "step": 1654 + }, + { + "epoch": 0.4092482690405539, + "grad_norm": 0.8191314049207302, + "learning_rate": 4.949084127519219e-06, + "loss": 0.4683, + "step": 1655 + }, + { + "epoch": 0.4094955489614243, + "grad_norm": 0.7778969873901285, + "learning_rate": 4.949018843366419e-06, + "loss": 0.4639, + "step": 1656 + }, + { + "epoch": 0.40974282888229474, + "grad_norm": 0.806816984603995, + "learning_rate": 4.9489535178180155e-06, + "loss": 0.4335, + "step": 1657 + }, + { + "epoch": 0.40999010880316517, + "grad_norm": 0.8215100542029089, + "learning_rate": 4.9488881508751135e-06, + "loss": 0.4436, + "step": 1658 + }, + { + "epoch": 0.4102373887240356, + "grad_norm": 0.8281492795541459, + "learning_rate": 4.948822742538817e-06, + "loss": 0.4521, + "step": 1659 + }, + { + "epoch": 0.410484668644906, + "grad_norm": 0.897412440002854, + "learning_rate": 4.9487572928102315e-06, + "loss": 0.4777, + "step": 1660 + }, + { + "epoch": 0.41073194856577644, + "grad_norm": 0.8231014853022581, + "learning_rate": 4.948691801690464e-06, + "loss": 0.4668, + "step": 1661 + }, + { + "epoch": 0.41097922848664686, + "grad_norm": 0.8309257711014859, + "learning_rate": 4.948626269180621e-06, + "loss": 0.423, + "step": 1662 + }, + { + "epoch": 0.4112265084075173, + "grad_norm": 0.788186095486044, + "learning_rate": 4.94856069528181e-06, + "loss": 0.4461, + "step": 1663 + }, + { + "epoch": 0.4114737883283877, + "grad_norm": 0.8031013505892846, + "learning_rate": 4.948495079995139e-06, + "loss": 0.4581, + "step": 1664 + }, + { + "epoch": 0.41172106824925814, + "grad_norm": 0.8214990535999098, + "learning_rate": 4.948429423321719e-06, + "loss": 0.4386, + "step": 1665 + }, + { + "epoch": 0.41196834817012856, + "grad_norm": 0.8697067040191242, + "learning_rate": 4.9483637252626585e-06, + "loss": 0.4274, + "step": 1666 + }, + { + "epoch": 0.412215628090999, + "grad_norm": 0.8208094818047257, + "learning_rate": 4.948297985819067e-06, + "loss": 0.4561, + "step": 1667 + }, + { + "epoch": 0.4124629080118694, + "grad_norm": 0.8199301154122366, + "learning_rate": 4.9482322049920575e-06, + "loss": 0.4606, + "step": 1668 + }, + { + "epoch": 0.41271018793273984, + "grad_norm": 0.7865329706718255, + "learning_rate": 4.948166382782741e-06, + "loss": 0.4564, + "step": 1669 + }, + { + "epoch": 0.41295746785361026, + "grad_norm": 0.8352567987592875, + "learning_rate": 4.948100519192229e-06, + "loss": 0.4558, + "step": 1670 + }, + { + "epoch": 0.4132047477744807, + "grad_norm": 0.8861005482099316, + "learning_rate": 4.9480346142216375e-06, + "loss": 0.4658, + "step": 1671 + }, + { + "epoch": 0.4134520276953511, + "grad_norm": 0.852460995747833, + "learning_rate": 4.947968667872079e-06, + "loss": 0.4487, + "step": 1672 + }, + { + "epoch": 0.4136993076162216, + "grad_norm": 0.8173953757463533, + "learning_rate": 4.947902680144667e-06, + "loss": 0.4554, + "step": 1673 + }, + { + "epoch": 0.413946587537092, + "grad_norm": 0.8668203863267794, + "learning_rate": 4.947836651040519e-06, + "loss": 0.4846, + "step": 1674 + }, + { + "epoch": 0.41419386745796244, + "grad_norm": 0.7885997646188458, + "learning_rate": 4.94777058056075e-06, + "loss": 0.4815, + "step": 1675 + }, + { + "epoch": 0.41444114737883286, + "grad_norm": 0.8506062284072559, + "learning_rate": 4.947704468706477e-06, + "loss": 0.4362, + "step": 1676 + }, + { + "epoch": 0.4146884272997033, + "grad_norm": 0.8786876540859795, + "learning_rate": 4.947638315478817e-06, + "loss": 0.4119, + "step": 1677 + }, + { + "epoch": 0.4149357072205737, + "grad_norm": 0.782177966395444, + "learning_rate": 4.9475721208788885e-06, + "loss": 0.478, + "step": 1678 + }, + { + "epoch": 0.41518298714144414, + "grad_norm": 0.8478757384596047, + "learning_rate": 4.94750588490781e-06, + "loss": 0.4391, + "step": 1679 + }, + { + "epoch": 0.41543026706231456, + "grad_norm": 0.811351078735783, + "learning_rate": 4.947439607566703e-06, + "loss": 0.4447, + "step": 1680 + }, + { + "epoch": 0.415677546983185, + "grad_norm": 0.7843424493602333, + "learning_rate": 4.947373288856685e-06, + "loss": 0.4617, + "step": 1681 + }, + { + "epoch": 0.4159248269040554, + "grad_norm": 0.8235806060021909, + "learning_rate": 4.947306928778879e-06, + "loss": 0.4864, + "step": 1682 + }, + { + "epoch": 0.41617210682492584, + "grad_norm": 0.847868232417344, + "learning_rate": 4.947240527334406e-06, + "loss": 0.4773, + "step": 1683 + }, + { + "epoch": 0.41641938674579626, + "grad_norm": 0.8697925069615653, + "learning_rate": 4.947174084524387e-06, + "loss": 0.4453, + "step": 1684 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.8142283134114524, + "learning_rate": 4.947107600349948e-06, + "loss": 0.4721, + "step": 1685 + }, + { + "epoch": 0.4169139465875371, + "grad_norm": 0.8511529611862372, + "learning_rate": 4.947041074812211e-06, + "loss": 0.4462, + "step": 1686 + }, + { + "epoch": 0.41716122650840753, + "grad_norm": 0.8329691094863567, + "learning_rate": 4.946974507912301e-06, + "loss": 0.4389, + "step": 1687 + }, + { + "epoch": 0.41740850642927796, + "grad_norm": 0.8428860586768815, + "learning_rate": 4.946907899651342e-06, + "loss": 0.4514, + "step": 1688 + }, + { + "epoch": 0.4176557863501484, + "grad_norm": 0.81243219915672, + "learning_rate": 4.946841250030461e-06, + "loss": 0.4397, + "step": 1689 + }, + { + "epoch": 0.4179030662710188, + "grad_norm": 0.8532101717804385, + "learning_rate": 4.946774559050785e-06, + "loss": 0.5014, + "step": 1690 + }, + { + "epoch": 0.41815034619188923, + "grad_norm": 0.8488612058084708, + "learning_rate": 4.9467078267134396e-06, + "loss": 0.4745, + "step": 1691 + }, + { + "epoch": 0.41839762611275966, + "grad_norm": 0.8429030041129258, + "learning_rate": 4.946641053019554e-06, + "loss": 0.4668, + "step": 1692 + }, + { + "epoch": 0.4186449060336301, + "grad_norm": 0.8126448430228969, + "learning_rate": 4.9465742379702574e-06, + "loss": 0.4771, + "step": 1693 + }, + { + "epoch": 0.4188921859545005, + "grad_norm": 0.8732774473739731, + "learning_rate": 4.946507381566677e-06, + "loss": 0.4761, + "step": 1694 + }, + { + "epoch": 0.41913946587537093, + "grad_norm": 0.8717402341975771, + "learning_rate": 4.946440483809946e-06, + "loss": 0.4822, + "step": 1695 + }, + { + "epoch": 0.41938674579624136, + "grad_norm": 0.8502511813109619, + "learning_rate": 4.946373544701193e-06, + "loss": 0.4152, + "step": 1696 + }, + { + "epoch": 0.4196340257171118, + "grad_norm": 0.8596676366588083, + "learning_rate": 4.9463065642415485e-06, + "loss": 0.4362, + "step": 1697 + }, + { + "epoch": 0.4198813056379822, + "grad_norm": 0.8901457121778148, + "learning_rate": 4.9462395424321476e-06, + "loss": 0.4418, + "step": 1698 + }, + { + "epoch": 0.42012858555885263, + "grad_norm": 0.9208361238486875, + "learning_rate": 4.946172479274121e-06, + "loss": 0.4329, + "step": 1699 + }, + { + "epoch": 0.42037586547972305, + "grad_norm": 0.8547536386643835, + "learning_rate": 4.946105374768603e-06, + "loss": 0.4873, + "step": 1700 + }, + { + "epoch": 0.4206231454005935, + "grad_norm": 0.8508039548865888, + "learning_rate": 4.9460382289167284e-06, + "loss": 0.4251, + "step": 1701 + }, + { + "epoch": 0.4208704253214639, + "grad_norm": 0.7970088950959086, + "learning_rate": 4.945971041719631e-06, + "loss": 0.4671, + "step": 1702 + }, + { + "epoch": 0.42111770524233433, + "grad_norm": 0.7841689731189674, + "learning_rate": 4.945903813178447e-06, + "loss": 0.467, + "step": 1703 + }, + { + "epoch": 0.42136498516320475, + "grad_norm": 0.8173705703188501, + "learning_rate": 4.945836543294312e-06, + "loss": 0.4546, + "step": 1704 + }, + { + "epoch": 0.4216122650840752, + "grad_norm": 0.8161859578647371, + "learning_rate": 4.945769232068364e-06, + "loss": 0.4916, + "step": 1705 + }, + { + "epoch": 0.4218595450049456, + "grad_norm": 0.8198882926959018, + "learning_rate": 4.945701879501742e-06, + "loss": 0.4505, + "step": 1706 + }, + { + "epoch": 0.422106824925816, + "grad_norm": 0.8118644016805705, + "learning_rate": 4.945634485595582e-06, + "loss": 0.4554, + "step": 1707 + }, + { + "epoch": 0.42235410484668645, + "grad_norm": 0.8913670827889651, + "learning_rate": 4.945567050351024e-06, + "loss": 0.4465, + "step": 1708 + }, + { + "epoch": 0.4226013847675569, + "grad_norm": 0.8459259044723303, + "learning_rate": 4.945499573769209e-06, + "loss": 0.4573, + "step": 1709 + }, + { + "epoch": 0.4228486646884273, + "grad_norm": 0.826742105916524, + "learning_rate": 4.945432055851276e-06, + "loss": 0.4777, + "step": 1710 + }, + { + "epoch": 0.4230959446092977, + "grad_norm": 0.7946204416088612, + "learning_rate": 4.945364496598366e-06, + "loss": 0.4572, + "step": 1711 + }, + { + "epoch": 0.42334322453016815, + "grad_norm": 0.8168683419578028, + "learning_rate": 4.9452968960116235e-06, + "loss": 0.448, + "step": 1712 + }, + { + "epoch": 0.4235905044510386, + "grad_norm": 0.8297167888254489, + "learning_rate": 4.945229254092188e-06, + "loss": 0.4519, + "step": 1713 + }, + { + "epoch": 0.423837784371909, + "grad_norm": 0.8970476493506931, + "learning_rate": 4.945161570841205e-06, + "loss": 0.4432, + "step": 1714 + }, + { + "epoch": 0.4240850642927794, + "grad_norm": 0.8465002130673689, + "learning_rate": 4.945093846259817e-06, + "loss": 0.4476, + "step": 1715 + }, + { + "epoch": 0.42433234421364985, + "grad_norm": 0.8347667071381571, + "learning_rate": 4.9450260803491705e-06, + "loss": 0.4394, + "step": 1716 + }, + { + "epoch": 0.4245796241345203, + "grad_norm": 0.814417603831734, + "learning_rate": 4.94495827311041e-06, + "loss": 0.4542, + "step": 1717 + }, + { + "epoch": 0.4248269040553907, + "grad_norm": 0.9018559702844308, + "learning_rate": 4.944890424544681e-06, + "loss": 0.4449, + "step": 1718 + }, + { + "epoch": 0.4250741839762611, + "grad_norm": 0.7992025415966338, + "learning_rate": 4.944822534653131e-06, + "loss": 0.462, + "step": 1719 + }, + { + "epoch": 0.42532146389713155, + "grad_norm": 0.8037703415889722, + "learning_rate": 4.944754603436908e-06, + "loss": 0.4583, + "step": 1720 + }, + { + "epoch": 0.42556874381800197, + "grad_norm": 0.8439242492521162, + "learning_rate": 4.94468663089716e-06, + "loss": 0.4465, + "step": 1721 + }, + { + "epoch": 0.4258160237388724, + "grad_norm": 0.8326257136253545, + "learning_rate": 4.944618617035035e-06, + "loss": 0.4599, + "step": 1722 + }, + { + "epoch": 0.4260633036597428, + "grad_norm": 0.8225161900870498, + "learning_rate": 4.944550561851685e-06, + "loss": 0.4424, + "step": 1723 + }, + { + "epoch": 0.42631058358061324, + "grad_norm": 0.8117853626022742, + "learning_rate": 4.944482465348257e-06, + "loss": 0.4518, + "step": 1724 + }, + { + "epoch": 0.42655786350148367, + "grad_norm": 0.8618327982125517, + "learning_rate": 4.944414327525904e-06, + "loss": 0.4339, + "step": 1725 + }, + { + "epoch": 0.4268051434223541, + "grad_norm": 0.8386378955346201, + "learning_rate": 4.944346148385777e-06, + "loss": 0.4841, + "step": 1726 + }, + { + "epoch": 0.4270524233432245, + "grad_norm": 0.8772973032751197, + "learning_rate": 4.9442779279290295e-06, + "loss": 0.4892, + "step": 1727 + }, + { + "epoch": 0.42729970326409494, + "grad_norm": 0.908072104090296, + "learning_rate": 4.944209666156814e-06, + "loss": 0.457, + "step": 1728 + }, + { + "epoch": 0.42754698318496537, + "grad_norm": 0.821449218350982, + "learning_rate": 4.944141363070284e-06, + "loss": 0.4392, + "step": 1729 + }, + { + "epoch": 0.4277942631058358, + "grad_norm": 0.8316991236817805, + "learning_rate": 4.944073018670594e-06, + "loss": 0.4764, + "step": 1730 + }, + { + "epoch": 0.4280415430267062, + "grad_norm": 0.8694517096681799, + "learning_rate": 4.9440046329589e-06, + "loss": 0.4539, + "step": 1731 + }, + { + "epoch": 0.42828882294757664, + "grad_norm": 0.8471764393664999, + "learning_rate": 4.943936205936359e-06, + "loss": 0.4114, + "step": 1732 + }, + { + "epoch": 0.42853610286844707, + "grad_norm": 0.8183886076647181, + "learning_rate": 4.943867737604123e-06, + "loss": 0.4322, + "step": 1733 + }, + { + "epoch": 0.4287833827893175, + "grad_norm": 0.8579245008449513, + "learning_rate": 4.943799227963354e-06, + "loss": 0.4497, + "step": 1734 + }, + { + "epoch": 0.4290306627101879, + "grad_norm": 0.8792696834992108, + "learning_rate": 4.943730677015209e-06, + "loss": 0.4437, + "step": 1735 + }, + { + "epoch": 0.42927794263105834, + "grad_norm": 0.8196225113446304, + "learning_rate": 4.9436620847608455e-06, + "loss": 0.4486, + "step": 1736 + }, + { + "epoch": 0.42952522255192876, + "grad_norm": 0.792556437218192, + "learning_rate": 4.943593451201424e-06, + "loss": 0.4464, + "step": 1737 + }, + { + "epoch": 0.4297725024727992, + "grad_norm": 0.7962854260626189, + "learning_rate": 4.943524776338104e-06, + "loss": 0.4723, + "step": 1738 + }, + { + "epoch": 0.4300197823936696, + "grad_norm": 0.8352264957077538, + "learning_rate": 4.943456060172046e-06, + "loss": 0.4501, + "step": 1739 + }, + { + "epoch": 0.43026706231454004, + "grad_norm": 0.8400190502171467, + "learning_rate": 4.943387302704412e-06, + "loss": 0.454, + "step": 1740 + }, + { + "epoch": 0.43051434223541046, + "grad_norm": 0.8276334232747109, + "learning_rate": 4.943318503936364e-06, + "loss": 0.4144, + "step": 1741 + }, + { + "epoch": 0.4307616221562809, + "grad_norm": 0.8302199804463065, + "learning_rate": 4.943249663869066e-06, + "loss": 0.4686, + "step": 1742 + }, + { + "epoch": 0.4310089020771513, + "grad_norm": 0.8207197842738865, + "learning_rate": 4.94318078250368e-06, + "loss": 0.4644, + "step": 1743 + }, + { + "epoch": 0.43125618199802174, + "grad_norm": 0.834730998310763, + "learning_rate": 4.943111859841371e-06, + "loss": 0.4695, + "step": 1744 + }, + { + "epoch": 0.43150346191889216, + "grad_norm": 0.8070569226814367, + "learning_rate": 4.943042895883304e-06, + "loss": 0.466, + "step": 1745 + }, + { + "epoch": 0.4317507418397626, + "grad_norm": 0.8163444224681524, + "learning_rate": 4.942973890630645e-06, + "loss": 0.4782, + "step": 1746 + }, + { + "epoch": 0.431998021760633, + "grad_norm": 0.83208535669435, + "learning_rate": 4.942904844084559e-06, + "loss": 0.4534, + "step": 1747 + }, + { + "epoch": 0.43224530168150344, + "grad_norm": 0.7887636687926843, + "learning_rate": 4.942835756246215e-06, + "loss": 0.4407, + "step": 1748 + }, + { + "epoch": 0.43249258160237386, + "grad_norm": 0.8521733444979064, + "learning_rate": 4.942766627116779e-06, + "loss": 0.459, + "step": 1749 + }, + { + "epoch": 0.43273986152324434, + "grad_norm": 0.8518317628217601, + "learning_rate": 4.942697456697422e-06, + "loss": 0.4615, + "step": 1750 + }, + { + "epoch": 0.43298714144411476, + "grad_norm": 0.9185604966693071, + "learning_rate": 4.94262824498931e-06, + "loss": 0.4503, + "step": 1751 + }, + { + "epoch": 0.4332344213649852, + "grad_norm": 0.8103056740307707, + "learning_rate": 4.942558991993615e-06, + "loss": 0.4554, + "step": 1752 + }, + { + "epoch": 0.4334817012858556, + "grad_norm": 0.799261239542613, + "learning_rate": 4.942489697711508e-06, + "loss": 0.4603, + "step": 1753 + }, + { + "epoch": 0.43372898120672604, + "grad_norm": 0.8425868034525744, + "learning_rate": 4.9424203621441585e-06, + "loss": 0.4639, + "step": 1754 + }, + { + "epoch": 0.43397626112759646, + "grad_norm": 0.813736917430641, + "learning_rate": 4.9423509852927395e-06, + "loss": 0.4504, + "step": 1755 + }, + { + "epoch": 0.4342235410484669, + "grad_norm": 0.8325032700998303, + "learning_rate": 4.942281567158424e-06, + "loss": 0.4486, + "step": 1756 + }, + { + "epoch": 0.4344708209693373, + "grad_norm": 0.8578441243657652, + "learning_rate": 4.942212107742384e-06, + "loss": 0.4324, + "step": 1757 + }, + { + "epoch": 0.43471810089020774, + "grad_norm": 0.7950101986986995, + "learning_rate": 4.9421426070457946e-06, + "loss": 0.455, + "step": 1758 + }, + { + "epoch": 0.43496538081107816, + "grad_norm": 0.8464725428496769, + "learning_rate": 4.94207306506983e-06, + "loss": 0.4338, + "step": 1759 + }, + { + "epoch": 0.4352126607319486, + "grad_norm": 0.8469706639469825, + "learning_rate": 4.942003481815666e-06, + "loss": 0.4288, + "step": 1760 + }, + { + "epoch": 0.435459940652819, + "grad_norm": 0.8775115383798611, + "learning_rate": 4.94193385728448e-06, + "loss": 0.4337, + "step": 1761 + }, + { + "epoch": 0.43570722057368944, + "grad_norm": 0.7880837358851591, + "learning_rate": 4.9418641914774465e-06, + "loss": 0.4471, + "step": 1762 + }, + { + "epoch": 0.43595450049455986, + "grad_norm": 0.8742884553070869, + "learning_rate": 4.9417944843957445e-06, + "loss": 0.4342, + "step": 1763 + }, + { + "epoch": 0.4362017804154303, + "grad_norm": 0.8660240978853062, + "learning_rate": 4.941724736040552e-06, + "loss": 0.4734, + "step": 1764 + }, + { + "epoch": 0.4364490603363007, + "grad_norm": 0.855246178964046, + "learning_rate": 4.941654946413048e-06, + "loss": 0.4821, + "step": 1765 + }, + { + "epoch": 0.43669634025717113, + "grad_norm": 0.8049402509434078, + "learning_rate": 4.941585115514412e-06, + "loss": 0.4572, + "step": 1766 + }, + { + "epoch": 0.43694362017804156, + "grad_norm": 0.8102549094132898, + "learning_rate": 4.9415152433458245e-06, + "loss": 0.4627, + "step": 1767 + }, + { + "epoch": 0.437190900098912, + "grad_norm": 0.855260779736409, + "learning_rate": 4.941445329908466e-06, + "loss": 0.4486, + "step": 1768 + }, + { + "epoch": 0.4374381800197824, + "grad_norm": 0.8398694779387051, + "learning_rate": 4.94137537520352e-06, + "loss": 0.4443, + "step": 1769 + }, + { + "epoch": 0.43768545994065283, + "grad_norm": 0.8495857810077256, + "learning_rate": 4.941305379232166e-06, + "loss": 0.4494, + "step": 1770 + }, + { + "epoch": 0.43793273986152326, + "grad_norm": 0.8457483281660204, + "learning_rate": 4.941235341995589e-06, + "loss": 0.4213, + "step": 1771 + }, + { + "epoch": 0.4381800197823937, + "grad_norm": 0.8416695102723704, + "learning_rate": 4.941165263494974e-06, + "loss": 0.4329, + "step": 1772 + }, + { + "epoch": 0.4384272997032641, + "grad_norm": 0.8077854346661958, + "learning_rate": 4.9410951437315034e-06, + "loss": 0.4563, + "step": 1773 + }, + { + "epoch": 0.43867457962413453, + "grad_norm": 0.8591862542056117, + "learning_rate": 4.941024982706363e-06, + "loss": 0.4395, + "step": 1774 + }, + { + "epoch": 0.43892185954500496, + "grad_norm": 0.8865112531995624, + "learning_rate": 4.9409547804207396e-06, + "loss": 0.4369, + "step": 1775 + }, + { + "epoch": 0.4391691394658754, + "grad_norm": 0.79342992597239, + "learning_rate": 4.940884536875817e-06, + "loss": 0.45, + "step": 1776 + }, + { + "epoch": 0.4394164193867458, + "grad_norm": 0.7881270666911689, + "learning_rate": 4.940814252072787e-06, + "loss": 0.4523, + "step": 1777 + }, + { + "epoch": 0.43966369930761623, + "grad_norm": 0.855046856544146, + "learning_rate": 4.9407439260128345e-06, + "loss": 0.4397, + "step": 1778 + }, + { + "epoch": 0.43991097922848665, + "grad_norm": 0.8388877807693309, + "learning_rate": 4.940673558697149e-06, + "loss": 0.4475, + "step": 1779 + }, + { + "epoch": 0.4401582591493571, + "grad_norm": 0.8512455561172892, + "learning_rate": 4.940603150126919e-06, + "loss": 0.4456, + "step": 1780 + }, + { + "epoch": 0.4404055390702275, + "grad_norm": 0.8861587872050732, + "learning_rate": 4.940532700303337e-06, + "loss": 0.4676, + "step": 1781 + }, + { + "epoch": 0.4406528189910979, + "grad_norm": 0.8159176864311103, + "learning_rate": 4.940462209227592e-06, + "loss": 0.4761, + "step": 1782 + }, + { + "epoch": 0.44090009891196835, + "grad_norm": 0.8589329700847905, + "learning_rate": 4.9403916769008755e-06, + "loss": 0.4238, + "step": 1783 + }, + { + "epoch": 0.4411473788328388, + "grad_norm": 0.8120103864684454, + "learning_rate": 4.940321103324379e-06, + "loss": 0.4533, + "step": 1784 + }, + { + "epoch": 0.4413946587537092, + "grad_norm": 0.830021957146338, + "learning_rate": 4.940250488499298e-06, + "loss": 0.4708, + "step": 1785 + }, + { + "epoch": 0.4416419386745796, + "grad_norm": 0.8784720779654032, + "learning_rate": 4.9401798324268236e-06, + "loss": 0.4853, + "step": 1786 + }, + { + "epoch": 0.44188921859545005, + "grad_norm": 0.8300399974772742, + "learning_rate": 4.940109135108152e-06, + "loss": 0.4547, + "step": 1787 + }, + { + "epoch": 0.4421364985163205, + "grad_norm": 0.8155400116963316, + "learning_rate": 4.940038396544476e-06, + "loss": 0.4464, + "step": 1788 + }, + { + "epoch": 0.4423837784371909, + "grad_norm": 0.827571249140311, + "learning_rate": 4.939967616736994e-06, + "loss": 0.4337, + "step": 1789 + }, + { + "epoch": 0.4426310583580613, + "grad_norm": 0.8167550515040979, + "learning_rate": 4.939896795686899e-06, + "loss": 0.4486, + "step": 1790 + }, + { + "epoch": 0.44287833827893175, + "grad_norm": 0.8494832396966039, + "learning_rate": 4.939825933395391e-06, + "loss": 0.4496, + "step": 1791 + }, + { + "epoch": 0.4431256181998022, + "grad_norm": 0.8003204538621603, + "learning_rate": 4.939755029863667e-06, + "loss": 0.4445, + "step": 1792 + }, + { + "epoch": 0.4433728981206726, + "grad_norm": 0.7824474758817086, + "learning_rate": 4.939684085092925e-06, + "loss": 0.4465, + "step": 1793 + }, + { + "epoch": 0.443620178041543, + "grad_norm": 0.8081718945058649, + "learning_rate": 4.939613099084365e-06, + "loss": 0.4528, + "step": 1794 + }, + { + "epoch": 0.44386745796241345, + "grad_norm": 0.8495881551915857, + "learning_rate": 4.939542071839185e-06, + "loss": 0.4403, + "step": 1795 + }, + { + "epoch": 0.44411473788328387, + "grad_norm": 0.7924639025505061, + "learning_rate": 4.939471003358587e-06, + "loss": 0.4559, + "step": 1796 + }, + { + "epoch": 0.4443620178041543, + "grad_norm": 0.8065926677878719, + "learning_rate": 4.939399893643773e-06, + "loss": 0.4489, + "step": 1797 + }, + { + "epoch": 0.4446092977250247, + "grad_norm": 0.8231537032809033, + "learning_rate": 4.939328742695943e-06, + "loss": 0.4511, + "step": 1798 + }, + { + "epoch": 0.44485657764589515, + "grad_norm": 0.8629947617643108, + "learning_rate": 4.939257550516302e-06, + "loss": 0.452, + "step": 1799 + }, + { + "epoch": 0.44510385756676557, + "grad_norm": 0.8452213524607881, + "learning_rate": 4.939186317106051e-06, + "loss": 0.4724, + "step": 1800 + }, + { + "epoch": 0.445351137487636, + "grad_norm": 0.8406637491043357, + "learning_rate": 4.939115042466397e-06, + "loss": 0.435, + "step": 1801 + }, + { + "epoch": 0.4455984174085064, + "grad_norm": 0.822963252538923, + "learning_rate": 4.9390437265985415e-06, + "loss": 0.4258, + "step": 1802 + }, + { + "epoch": 0.44584569732937684, + "grad_norm": 0.7878763520585504, + "learning_rate": 4.93897236950369e-06, + "loss": 0.4608, + "step": 1803 + }, + { + "epoch": 0.44609297725024727, + "grad_norm": 0.8349799067133575, + "learning_rate": 4.938900971183053e-06, + "loss": 0.4327, + "step": 1804 + }, + { + "epoch": 0.4463402571711177, + "grad_norm": 0.8816160813241095, + "learning_rate": 4.9388295316378325e-06, + "loss": 0.4639, + "step": 1805 + }, + { + "epoch": 0.4465875370919881, + "grad_norm": 0.84235020294276, + "learning_rate": 4.938758050869238e-06, + "loss": 0.4499, + "step": 1806 + }, + { + "epoch": 0.44683481701285854, + "grad_norm": 0.8242327113382405, + "learning_rate": 4.938686528878477e-06, + "loss": 0.4735, + "step": 1807 + }, + { + "epoch": 0.44708209693372897, + "grad_norm": 0.7886103886587141, + "learning_rate": 4.93861496566676e-06, + "loss": 0.4615, + "step": 1808 + }, + { + "epoch": 0.4473293768545994, + "grad_norm": 0.8398273563191035, + "learning_rate": 4.938543361235295e-06, + "loss": 0.4269, + "step": 1809 + }, + { + "epoch": 0.4475766567754698, + "grad_norm": 0.8573294083629199, + "learning_rate": 4.938471715585293e-06, + "loss": 0.4333, + "step": 1810 + }, + { + "epoch": 0.44782393669634024, + "grad_norm": 0.8849773282454866, + "learning_rate": 4.938400028717966e-06, + "loss": 0.4464, + "step": 1811 + }, + { + "epoch": 0.44807121661721067, + "grad_norm": 0.8569807993576385, + "learning_rate": 4.938328300634524e-06, + "loss": 0.4555, + "step": 1812 + }, + { + "epoch": 0.4483184965380811, + "grad_norm": 0.8038399479751446, + "learning_rate": 4.93825653133618e-06, + "loss": 0.4432, + "step": 1813 + }, + { + "epoch": 0.4485657764589515, + "grad_norm": 0.8316422195623451, + "learning_rate": 4.938184720824148e-06, + "loss": 0.4357, + "step": 1814 + }, + { + "epoch": 0.44881305637982194, + "grad_norm": 0.7881623847336401, + "learning_rate": 4.938112869099641e-06, + "loss": 0.4457, + "step": 1815 + }, + { + "epoch": 0.44906033630069236, + "grad_norm": 0.8095248000011614, + "learning_rate": 4.9380409761638725e-06, + "loss": 0.4675, + "step": 1816 + }, + { + "epoch": 0.4493076162215628, + "grad_norm": 0.8142992574456057, + "learning_rate": 4.937969042018059e-06, + "loss": 0.4382, + "step": 1817 + }, + { + "epoch": 0.4495548961424332, + "grad_norm": 0.8037598382565191, + "learning_rate": 4.937897066663417e-06, + "loss": 0.4547, + "step": 1818 + }, + { + "epoch": 0.44980217606330364, + "grad_norm": 0.8029225397282946, + "learning_rate": 4.937825050101162e-06, + "loss": 0.4532, + "step": 1819 + }, + { + "epoch": 0.45004945598417406, + "grad_norm": 0.8525725054243789, + "learning_rate": 4.937752992332512e-06, + "loss": 0.4452, + "step": 1820 + }, + { + "epoch": 0.4502967359050445, + "grad_norm": 0.7978513030283886, + "learning_rate": 4.937680893358683e-06, + "loss": 0.4496, + "step": 1821 + }, + { + "epoch": 0.4505440158259149, + "grad_norm": 0.8684712638463695, + "learning_rate": 4.9376087531808964e-06, + "loss": 0.4336, + "step": 1822 + }, + { + "epoch": 0.45079129574678534, + "grad_norm": 0.8163578329839308, + "learning_rate": 4.93753657180037e-06, + "loss": 0.4639, + "step": 1823 + }, + { + "epoch": 0.45103857566765576, + "grad_norm": 0.8225937594696441, + "learning_rate": 4.937464349218325e-06, + "loss": 0.4566, + "step": 1824 + }, + { + "epoch": 0.4512858555885262, + "grad_norm": 0.8008255690442442, + "learning_rate": 4.93739208543598e-06, + "loss": 0.463, + "step": 1825 + }, + { + "epoch": 0.4515331355093966, + "grad_norm": 0.8360458473751116, + "learning_rate": 4.937319780454559e-06, + "loss": 0.4221, + "step": 1826 + }, + { + "epoch": 0.45178041543026703, + "grad_norm": 0.7805314420818854, + "learning_rate": 4.937247434275283e-06, + "loss": 0.4615, + "step": 1827 + }, + { + "epoch": 0.4520276953511375, + "grad_norm": 0.8314479860480369, + "learning_rate": 4.937175046899375e-06, + "loss": 0.4869, + "step": 1828 + }, + { + "epoch": 0.45227497527200794, + "grad_norm": 0.7954839890830541, + "learning_rate": 4.937102618328058e-06, + "loss": 0.4717, + "step": 1829 + }, + { + "epoch": 0.45252225519287836, + "grad_norm": 0.7896791151974906, + "learning_rate": 4.937030148562558e-06, + "loss": 0.4561, + "step": 1830 + }, + { + "epoch": 0.4527695351137488, + "grad_norm": 0.8315873837686758, + "learning_rate": 4.936957637604097e-06, + "loss": 0.4973, + "step": 1831 + }, + { + "epoch": 0.4530168150346192, + "grad_norm": 0.8302395670775476, + "learning_rate": 4.936885085453904e-06, + "loss": 0.4115, + "step": 1832 + }, + { + "epoch": 0.45326409495548964, + "grad_norm": 0.7907392913162058, + "learning_rate": 4.936812492113203e-06, + "loss": 0.4398, + "step": 1833 + }, + { + "epoch": 0.45351137487636006, + "grad_norm": 0.7930420093582806, + "learning_rate": 4.936739857583222e-06, + "loss": 0.4589, + "step": 1834 + }, + { + "epoch": 0.4537586547972305, + "grad_norm": 0.8307780202557283, + "learning_rate": 4.936667181865188e-06, + "loss": 0.4349, + "step": 1835 + }, + { + "epoch": 0.4540059347181009, + "grad_norm": 0.8218166907095654, + "learning_rate": 4.93659446496033e-06, + "loss": 0.4617, + "step": 1836 + }, + { + "epoch": 0.45425321463897134, + "grad_norm": 0.7959798297948486, + "learning_rate": 4.936521706869876e-06, + "loss": 0.4607, + "step": 1837 + }, + { + "epoch": 0.45450049455984176, + "grad_norm": 0.8398423129692738, + "learning_rate": 4.93644890759506e-06, + "loss": 0.4252, + "step": 1838 + }, + { + "epoch": 0.4547477744807122, + "grad_norm": 0.8146261084755446, + "learning_rate": 4.936376067137106e-06, + "loss": 0.4191, + "step": 1839 + }, + { + "epoch": 0.4549950544015826, + "grad_norm": 0.8096282195772391, + "learning_rate": 4.936303185497251e-06, + "loss": 0.4556, + "step": 1840 + }, + { + "epoch": 0.45524233432245303, + "grad_norm": 0.8226335223421032, + "learning_rate": 4.9362302626767236e-06, + "loss": 0.4349, + "step": 1841 + }, + { + "epoch": 0.45548961424332346, + "grad_norm": 0.8387498079572907, + "learning_rate": 4.936157298676757e-06, + "loss": 0.4613, + "step": 1842 + }, + { + "epoch": 0.4557368941641939, + "grad_norm": 0.81593835337991, + "learning_rate": 4.936084293498585e-06, + "loss": 0.4779, + "step": 1843 + }, + { + "epoch": 0.4559841740850643, + "grad_norm": 0.804173667458378, + "learning_rate": 4.936011247143442e-06, + "loss": 0.4465, + "step": 1844 + }, + { + "epoch": 0.45623145400593473, + "grad_norm": 0.8180659963817345, + "learning_rate": 4.935938159612562e-06, + "loss": 0.4247, + "step": 1845 + }, + { + "epoch": 0.45647873392680516, + "grad_norm": 0.8245812500907519, + "learning_rate": 4.93586503090718e-06, + "loss": 0.4857, + "step": 1846 + }, + { + "epoch": 0.4567260138476756, + "grad_norm": 0.8247624372284065, + "learning_rate": 4.9357918610285326e-06, + "loss": 0.44, + "step": 1847 + }, + { + "epoch": 0.456973293768546, + "grad_norm": 0.8117934491093357, + "learning_rate": 4.935718649977857e-06, + "loss": 0.443, + "step": 1848 + }, + { + "epoch": 0.45722057368941643, + "grad_norm": 0.831498957769525, + "learning_rate": 4.93564539775639e-06, + "loss": 0.4285, + "step": 1849 + }, + { + "epoch": 0.45746785361028686, + "grad_norm": 0.8178845965621625, + "learning_rate": 4.9355721043653705e-06, + "loss": 0.4579, + "step": 1850 + }, + { + "epoch": 0.4577151335311573, + "grad_norm": 0.8435823859766596, + "learning_rate": 4.935498769806037e-06, + "loss": 0.4849, + "step": 1851 + }, + { + "epoch": 0.4579624134520277, + "grad_norm": 0.8198546493674795, + "learning_rate": 4.9354253940796285e-06, + "loss": 0.4764, + "step": 1852 + }, + { + "epoch": 0.45820969337289813, + "grad_norm": 0.8612598869000662, + "learning_rate": 4.9353519771873865e-06, + "loss": 0.4655, + "step": 1853 + }, + { + "epoch": 0.45845697329376855, + "grad_norm": 0.8331800596681603, + "learning_rate": 4.935278519130551e-06, + "loss": 0.4427, + "step": 1854 + }, + { + "epoch": 0.458704253214639, + "grad_norm": 0.859575706580175, + "learning_rate": 4.935205019910363e-06, + "loss": 0.4308, + "step": 1855 + }, + { + "epoch": 0.4589515331355094, + "grad_norm": 0.8304086390526508, + "learning_rate": 4.9351314795280665e-06, + "loss": 0.4386, + "step": 1856 + }, + { + "epoch": 0.45919881305637983, + "grad_norm": 0.8526206499245497, + "learning_rate": 4.935057897984904e-06, + "loss": 0.4171, + "step": 1857 + }, + { + "epoch": 0.45944609297725025, + "grad_norm": 0.8122129580874308, + "learning_rate": 4.934984275282119e-06, + "loss": 0.4592, + "step": 1858 + }, + { + "epoch": 0.4596933728981207, + "grad_norm": 0.8184420237101967, + "learning_rate": 4.9349106114209555e-06, + "loss": 0.476, + "step": 1859 + }, + { + "epoch": 0.4599406528189911, + "grad_norm": 0.8276819930577508, + "learning_rate": 4.934836906402659e-06, + "loss": 0.4389, + "step": 1860 + }, + { + "epoch": 0.4601879327398615, + "grad_norm": 0.787148630250681, + "learning_rate": 4.934763160228476e-06, + "loss": 0.4346, + "step": 1861 + }, + { + "epoch": 0.46043521266073195, + "grad_norm": 0.8128804151684144, + "learning_rate": 4.934689372899653e-06, + "loss": 0.4728, + "step": 1862 + }, + { + "epoch": 0.4606824925816024, + "grad_norm": 0.8461977492066648, + "learning_rate": 4.934615544417436e-06, + "loss": 0.4389, + "step": 1863 + }, + { + "epoch": 0.4609297725024728, + "grad_norm": 0.843449430804296, + "learning_rate": 4.934541674783074e-06, + "loss": 0.4457, + "step": 1864 + }, + { + "epoch": 0.4611770524233432, + "grad_norm": 0.8086082758083908, + "learning_rate": 4.934467763997814e-06, + "loss": 0.4343, + "step": 1865 + }, + { + "epoch": 0.46142433234421365, + "grad_norm": 0.8364626491108798, + "learning_rate": 4.934393812062907e-06, + "loss": 0.4242, + "step": 1866 + }, + { + "epoch": 0.4616716122650841, + "grad_norm": 0.8250397394229696, + "learning_rate": 4.934319818979604e-06, + "loss": 0.4468, + "step": 1867 + }, + { + "epoch": 0.4619188921859545, + "grad_norm": 0.8365693927663361, + "learning_rate": 4.9342457847491525e-06, + "loss": 0.4374, + "step": 1868 + }, + { + "epoch": 0.4621661721068249, + "grad_norm": 0.8496261702902699, + "learning_rate": 4.934171709372806e-06, + "loss": 0.4261, + "step": 1869 + }, + { + "epoch": 0.46241345202769535, + "grad_norm": 0.8007783961735965, + "learning_rate": 4.934097592851817e-06, + "loss": 0.4749, + "step": 1870 + }, + { + "epoch": 0.4626607319485658, + "grad_norm": 0.9167830740082399, + "learning_rate": 4.9340234351874375e-06, + "loss": 0.4388, + "step": 1871 + }, + { + "epoch": 0.4629080118694362, + "grad_norm": 0.8811259241380944, + "learning_rate": 4.93394923638092e-06, + "loss": 0.4256, + "step": 1872 + }, + { + "epoch": 0.4631552917903066, + "grad_norm": 0.813618018230188, + "learning_rate": 4.933874996433521e-06, + "loss": 0.4567, + "step": 1873 + }, + { + "epoch": 0.46340257171117705, + "grad_norm": 0.8132288219977344, + "learning_rate": 4.933800715346493e-06, + "loss": 0.4449, + "step": 1874 + }, + { + "epoch": 0.46364985163204747, + "grad_norm": 0.833392110392238, + "learning_rate": 4.933726393121092e-06, + "loss": 0.4675, + "step": 1875 + }, + { + "epoch": 0.4638971315529179, + "grad_norm": 0.8353507684223446, + "learning_rate": 4.933652029758577e-06, + "loss": 0.4734, + "step": 1876 + }, + { + "epoch": 0.4641444114737883, + "grad_norm": 0.8628430560657625, + "learning_rate": 4.933577625260201e-06, + "loss": 0.4304, + "step": 1877 + }, + { + "epoch": 0.46439169139465875, + "grad_norm": 0.8689457252491675, + "learning_rate": 4.933503179627224e-06, + "loss": 0.449, + "step": 1878 + }, + { + "epoch": 0.46463897131552917, + "grad_norm": 0.8110050031566923, + "learning_rate": 4.933428692860904e-06, + "loss": 0.441, + "step": 1879 + }, + { + "epoch": 0.4648862512363996, + "grad_norm": 0.8292633865002436, + "learning_rate": 4.933354164962499e-06, + "loss": 0.429, + "step": 1880 + }, + { + "epoch": 0.46513353115727, + "grad_norm": 0.8725341632231393, + "learning_rate": 4.9332795959332715e-06, + "loss": 0.4289, + "step": 1881 + }, + { + "epoch": 0.46538081107814044, + "grad_norm": 0.8500399734842764, + "learning_rate": 4.933204985774479e-06, + "loss": 0.4512, + "step": 1882 + }, + { + "epoch": 0.46562809099901087, + "grad_norm": 0.8080510822724074, + "learning_rate": 4.933130334487384e-06, + "loss": 0.4194, + "step": 1883 + }, + { + "epoch": 0.4658753709198813, + "grad_norm": 0.8348679766093604, + "learning_rate": 4.933055642073247e-06, + "loss": 0.4223, + "step": 1884 + }, + { + "epoch": 0.4661226508407517, + "grad_norm": 0.811940856384884, + "learning_rate": 4.932980908533332e-06, + "loss": 0.4511, + "step": 1885 + }, + { + "epoch": 0.46636993076162214, + "grad_norm": 0.8623912087016934, + "learning_rate": 4.9329061338689024e-06, + "loss": 0.4067, + "step": 1886 + }, + { + "epoch": 0.46661721068249257, + "grad_norm": 0.8466626050654013, + "learning_rate": 4.932831318081222e-06, + "loss": 0.4448, + "step": 1887 + }, + { + "epoch": 0.466864490603363, + "grad_norm": 0.8167851555645209, + "learning_rate": 4.932756461171554e-06, + "loss": 0.4504, + "step": 1888 + }, + { + "epoch": 0.4671117705242334, + "grad_norm": 0.8104154270120225, + "learning_rate": 4.932681563141164e-06, + "loss": 0.4395, + "step": 1889 + }, + { + "epoch": 0.46735905044510384, + "grad_norm": 0.8655461155693418, + "learning_rate": 4.932606623991319e-06, + "loss": 0.4591, + "step": 1890 + }, + { + "epoch": 0.46760633036597427, + "grad_norm": 0.8176989636313634, + "learning_rate": 4.932531643723285e-06, + "loss": 0.4427, + "step": 1891 + }, + { + "epoch": 0.4678536102868447, + "grad_norm": 0.7838206162822197, + "learning_rate": 4.9324566223383306e-06, + "loss": 0.4498, + "step": 1892 + }, + { + "epoch": 0.4681008902077151, + "grad_norm": 0.8035646415445724, + "learning_rate": 4.9323815598377225e-06, + "loss": 0.4471, + "step": 1893 + }, + { + "epoch": 0.46834817012858554, + "grad_norm": 0.8037368620913304, + "learning_rate": 4.93230645622273e-06, + "loss": 0.4641, + "step": 1894 + }, + { + "epoch": 0.46859545004945596, + "grad_norm": 0.8025390965922428, + "learning_rate": 4.932231311494622e-06, + "loss": 0.4373, + "step": 1895 + }, + { + "epoch": 0.4688427299703264, + "grad_norm": 0.7945979697539853, + "learning_rate": 4.932156125654669e-06, + "loss": 0.4559, + "step": 1896 + }, + { + "epoch": 0.4690900098911968, + "grad_norm": 0.8769209411502517, + "learning_rate": 4.9320808987041424e-06, + "loss": 0.4586, + "step": 1897 + }, + { + "epoch": 0.46933728981206724, + "grad_norm": 0.8476950420274959, + "learning_rate": 4.932005630644314e-06, + "loss": 0.4135, + "step": 1898 + }, + { + "epoch": 0.46958456973293766, + "grad_norm": 0.8244206095276344, + "learning_rate": 4.931930321476455e-06, + "loss": 0.4446, + "step": 1899 + }, + { + "epoch": 0.4698318496538081, + "grad_norm": 0.7977893759785066, + "learning_rate": 4.931854971201838e-06, + "loss": 0.4703, + "step": 1900 + }, + { + "epoch": 0.4700791295746785, + "grad_norm": 0.8611413119435934, + "learning_rate": 4.9317795798217385e-06, + "loss": 0.4591, + "step": 1901 + }, + { + "epoch": 0.47032640949554894, + "grad_norm": 0.8374357250520812, + "learning_rate": 4.931704147337428e-06, + "loss": 0.4472, + "step": 1902 + }, + { + "epoch": 0.47057368941641936, + "grad_norm": 0.8410624597911414, + "learning_rate": 4.931628673750185e-06, + "loss": 0.439, + "step": 1903 + }, + { + "epoch": 0.4708209693372898, + "grad_norm": 0.8562823214973215, + "learning_rate": 4.931553159061283e-06, + "loss": 0.4445, + "step": 1904 + }, + { + "epoch": 0.4710682492581602, + "grad_norm": 0.8224377914499575, + "learning_rate": 4.931477603271999e-06, + "loss": 0.4306, + "step": 1905 + }, + { + "epoch": 0.4713155291790307, + "grad_norm": 0.8697597121950194, + "learning_rate": 4.93140200638361e-06, + "loss": 0.4548, + "step": 1906 + }, + { + "epoch": 0.4715628090999011, + "grad_norm": 0.824255221165472, + "learning_rate": 4.931326368397394e-06, + "loss": 0.4951, + "step": 1907 + }, + { + "epoch": 0.47181008902077154, + "grad_norm": 0.885166598612944, + "learning_rate": 4.9312506893146286e-06, + "loss": 0.4285, + "step": 1908 + }, + { + "epoch": 0.47205736894164196, + "grad_norm": 0.8171167921231146, + "learning_rate": 4.931174969136594e-06, + "loss": 0.446, + "step": 1909 + }, + { + "epoch": 0.4723046488625124, + "grad_norm": 0.820165159360279, + "learning_rate": 4.93109920786457e-06, + "loss": 0.4823, + "step": 1910 + }, + { + "epoch": 0.4725519287833828, + "grad_norm": 0.8153543462857648, + "learning_rate": 4.9310234054998375e-06, + "loss": 0.4478, + "step": 1911 + }, + { + "epoch": 0.47279920870425324, + "grad_norm": 0.8367918771121765, + "learning_rate": 4.930947562043677e-06, + "loss": 0.4695, + "step": 1912 + }, + { + "epoch": 0.47304648862512366, + "grad_norm": 0.8471649045198084, + "learning_rate": 4.930871677497371e-06, + "loss": 0.47, + "step": 1913 + }, + { + "epoch": 0.4732937685459941, + "grad_norm": 0.8076081839098591, + "learning_rate": 4.9307957518622006e-06, + "loss": 0.4391, + "step": 1914 + }, + { + "epoch": 0.4735410484668645, + "grad_norm": 0.8264582394780937, + "learning_rate": 4.9307197851394514e-06, + "loss": 0.4291, + "step": 1915 + }, + { + "epoch": 0.47378832838773494, + "grad_norm": 0.8178298174303794, + "learning_rate": 4.930643777330407e-06, + "loss": 0.4387, + "step": 1916 + }, + { + "epoch": 0.47403560830860536, + "grad_norm": 0.7791823942240508, + "learning_rate": 4.930567728436352e-06, + "loss": 0.4531, + "step": 1917 + }, + { + "epoch": 0.4742828882294758, + "grad_norm": 0.8280393676731757, + "learning_rate": 4.930491638458571e-06, + "loss": 0.4419, + "step": 1918 + }, + { + "epoch": 0.4745301681503462, + "grad_norm": 0.8241832719290327, + "learning_rate": 4.930415507398351e-06, + "loss": 0.4381, + "step": 1919 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 0.8217261660458774, + "learning_rate": 4.930339335256978e-06, + "loss": 0.4505, + "step": 1920 + }, + { + "epoch": 0.47502472799208706, + "grad_norm": 0.8131794869769368, + "learning_rate": 4.93026312203574e-06, + "loss": 0.4311, + "step": 1921 + }, + { + "epoch": 0.4752720079129575, + "grad_norm": 0.8411335977862362, + "learning_rate": 4.930186867735926e-06, + "loss": 0.4783, + "step": 1922 + }, + { + "epoch": 0.4755192878338279, + "grad_norm": 0.8241214015182279, + "learning_rate": 4.930110572358824e-06, + "loss": 0.4497, + "step": 1923 + }, + { + "epoch": 0.47576656775469833, + "grad_norm": 0.839706324810582, + "learning_rate": 4.930034235905724e-06, + "loss": 0.4802, + "step": 1924 + }, + { + "epoch": 0.47601384767556876, + "grad_norm": 0.8397548471567146, + "learning_rate": 4.929957858377915e-06, + "loss": 0.4202, + "step": 1925 + }, + { + "epoch": 0.4762611275964392, + "grad_norm": 0.8660864925533305, + "learning_rate": 4.929881439776691e-06, + "loss": 0.4399, + "step": 1926 + }, + { + "epoch": 0.4765084075173096, + "grad_norm": 0.8347114497369829, + "learning_rate": 4.929804980103341e-06, + "loss": 0.4631, + "step": 1927 + }, + { + "epoch": 0.47675568743818003, + "grad_norm": 0.8646650091416341, + "learning_rate": 4.929728479359158e-06, + "loss": 0.4393, + "step": 1928 + }, + { + "epoch": 0.47700296735905046, + "grad_norm": 0.8252581921541579, + "learning_rate": 4.929651937545436e-06, + "loss": 0.4506, + "step": 1929 + }, + { + "epoch": 0.4772502472799209, + "grad_norm": 0.8094312396500337, + "learning_rate": 4.929575354663467e-06, + "loss": 0.4942, + "step": 1930 + }, + { + "epoch": 0.4774975272007913, + "grad_norm": 0.8772495543820467, + "learning_rate": 4.929498730714548e-06, + "loss": 0.445, + "step": 1931 + }, + { + "epoch": 0.47774480712166173, + "grad_norm": 0.7923372192092943, + "learning_rate": 4.929422065699972e-06, + "loss": 0.4862, + "step": 1932 + }, + { + "epoch": 0.47799208704253215, + "grad_norm": 0.8712668957725559, + "learning_rate": 4.929345359621036e-06, + "loss": 0.4351, + "step": 1933 + }, + { + "epoch": 0.4782393669634026, + "grad_norm": 0.8493635338265155, + "learning_rate": 4.929268612479036e-06, + "loss": 0.4572, + "step": 1934 + }, + { + "epoch": 0.478486646884273, + "grad_norm": 0.8666548569127566, + "learning_rate": 4.929191824275269e-06, + "loss": 0.4427, + "step": 1935 + }, + { + "epoch": 0.47873392680514343, + "grad_norm": 0.9074290287081372, + "learning_rate": 4.929114995011034e-06, + "loss": 0.4321, + "step": 1936 + }, + { + "epoch": 0.47898120672601385, + "grad_norm": 0.8352717250490244, + "learning_rate": 4.929038124687629e-06, + "loss": 0.4348, + "step": 1937 + }, + { + "epoch": 0.4792284866468843, + "grad_norm": 0.8112065646293543, + "learning_rate": 4.9289612133063536e-06, + "loss": 0.4658, + "step": 1938 + }, + { + "epoch": 0.4794757665677547, + "grad_norm": 0.8327013692076111, + "learning_rate": 4.928884260868507e-06, + "loss": 0.4465, + "step": 1939 + }, + { + "epoch": 0.4797230464886251, + "grad_norm": 0.8322228019074666, + "learning_rate": 4.928807267375391e-06, + "loss": 0.439, + "step": 1940 + }, + { + "epoch": 0.47997032640949555, + "grad_norm": 0.8225399645495624, + "learning_rate": 4.928730232828306e-06, + "loss": 0.4131, + "step": 1941 + }, + { + "epoch": 0.480217606330366, + "grad_norm": 0.8287089897738253, + "learning_rate": 4.928653157228555e-06, + "loss": 0.4513, + "step": 1942 + }, + { + "epoch": 0.4804648862512364, + "grad_norm": 0.8387018490840795, + "learning_rate": 4.928576040577441e-06, + "loss": 0.468, + "step": 1943 + }, + { + "epoch": 0.4807121661721068, + "grad_norm": 0.7983871186620123, + "learning_rate": 4.928498882876266e-06, + "loss": 0.4606, + "step": 1944 + }, + { + "epoch": 0.48095944609297725, + "grad_norm": 0.8726955385115273, + "learning_rate": 4.928421684126335e-06, + "loss": 0.4226, + "step": 1945 + }, + { + "epoch": 0.4812067260138477, + "grad_norm": 0.8343516175985839, + "learning_rate": 4.928344444328954e-06, + "loss": 0.4601, + "step": 1946 + }, + { + "epoch": 0.4814540059347181, + "grad_norm": 0.8262000466383999, + "learning_rate": 4.928267163485427e-06, + "loss": 0.4742, + "step": 1947 + }, + { + "epoch": 0.4817012858555885, + "grad_norm": 0.8684322211013671, + "learning_rate": 4.928189841597061e-06, + "loss": 0.4459, + "step": 1948 + }, + { + "epoch": 0.48194856577645895, + "grad_norm": 0.8391599834278922, + "learning_rate": 4.928112478665163e-06, + "loss": 0.4526, + "step": 1949 + }, + { + "epoch": 0.4821958456973294, + "grad_norm": 0.847620940773091, + "learning_rate": 4.92803507469104e-06, + "loss": 0.4429, + "step": 1950 + }, + { + "epoch": 0.4824431256181998, + "grad_norm": 0.8734227149804495, + "learning_rate": 4.927957629676001e-06, + "loss": 0.4414, + "step": 1951 + }, + { + "epoch": 0.4826904055390702, + "grad_norm": 0.875124744595111, + "learning_rate": 4.927880143621355e-06, + "loss": 0.4464, + "step": 1952 + }, + { + "epoch": 0.48293768545994065, + "grad_norm": 0.872328400694578, + "learning_rate": 4.927802616528412e-06, + "loss": 0.4484, + "step": 1953 + }, + { + "epoch": 0.48318496538081107, + "grad_norm": 0.8407440697376363, + "learning_rate": 4.927725048398482e-06, + "loss": 0.4523, + "step": 1954 + }, + { + "epoch": 0.4834322453016815, + "grad_norm": 0.7834132267725564, + "learning_rate": 4.927647439232876e-06, + "loss": 0.4393, + "step": 1955 + }, + { + "epoch": 0.4836795252225519, + "grad_norm": 0.8234773490129442, + "learning_rate": 4.927569789032907e-06, + "loss": 0.4595, + "step": 1956 + }, + { + "epoch": 0.48392680514342234, + "grad_norm": 0.8497947043946262, + "learning_rate": 4.927492097799885e-06, + "loss": 0.4303, + "step": 1957 + }, + { + "epoch": 0.48417408506429277, + "grad_norm": 0.8806520611452342, + "learning_rate": 4.927414365535126e-06, + "loss": 0.4606, + "step": 1958 + }, + { + "epoch": 0.4844213649851632, + "grad_norm": 0.8113702305265199, + "learning_rate": 4.9273365922399416e-06, + "loss": 0.445, + "step": 1959 + }, + { + "epoch": 0.4846686449060336, + "grad_norm": 0.8248851985890626, + "learning_rate": 4.927258777915648e-06, + "loss": 0.4465, + "step": 1960 + }, + { + "epoch": 0.48491592482690404, + "grad_norm": 0.8299620493793199, + "learning_rate": 4.92718092256356e-06, + "loss": 0.4204, + "step": 1961 + }, + { + "epoch": 0.48516320474777447, + "grad_norm": 0.8052928255301574, + "learning_rate": 4.927103026184993e-06, + "loss": 0.478, + "step": 1962 + }, + { + "epoch": 0.4854104846686449, + "grad_norm": 0.8740281640364033, + "learning_rate": 4.927025088781265e-06, + "loss": 0.426, + "step": 1963 + }, + { + "epoch": 0.4856577645895153, + "grad_norm": 0.8183836323362705, + "learning_rate": 4.926947110353692e-06, + "loss": 0.4432, + "step": 1964 + }, + { + "epoch": 0.48590504451038574, + "grad_norm": 0.8215966525772359, + "learning_rate": 4.926869090903593e-06, + "loss": 0.4306, + "step": 1965 + }, + { + "epoch": 0.48615232443125617, + "grad_norm": 0.8491022861836903, + "learning_rate": 4.9267910304322865e-06, + "loss": 0.464, + "step": 1966 + }, + { + "epoch": 0.4863996043521266, + "grad_norm": 0.8538882505150086, + "learning_rate": 4.926712928941092e-06, + "loss": 0.436, + "step": 1967 + }, + { + "epoch": 0.486646884272997, + "grad_norm": 0.777009034398573, + "learning_rate": 4.926634786431329e-06, + "loss": 0.4501, + "step": 1968 + }, + { + "epoch": 0.48689416419386744, + "grad_norm": 0.8005037204433741, + "learning_rate": 4.926556602904319e-06, + "loss": 0.4343, + "step": 1969 + }, + { + "epoch": 0.48714144411473786, + "grad_norm": 0.8289955238627059, + "learning_rate": 4.9264783783613835e-06, + "loss": 0.4614, + "step": 1970 + }, + { + "epoch": 0.4873887240356083, + "grad_norm": 0.8455414270465915, + "learning_rate": 4.926400112803844e-06, + "loss": 0.4669, + "step": 1971 + }, + { + "epoch": 0.4876360039564787, + "grad_norm": 0.7797141965228852, + "learning_rate": 4.926321806233024e-06, + "loss": 0.49, + "step": 1972 + }, + { + "epoch": 0.48788328387734914, + "grad_norm": 0.8493989470575196, + "learning_rate": 4.926243458650248e-06, + "loss": 0.4349, + "step": 1973 + }, + { + "epoch": 0.48813056379821956, + "grad_norm": 0.8512014778641054, + "learning_rate": 4.926165070056839e-06, + "loss": 0.4108, + "step": 1974 + }, + { + "epoch": 0.48837784371909, + "grad_norm": 0.7979992970173828, + "learning_rate": 4.926086640454123e-06, + "loss": 0.4706, + "step": 1975 + }, + { + "epoch": 0.4886251236399604, + "grad_norm": 0.8239102783032143, + "learning_rate": 4.926008169843424e-06, + "loss": 0.4787, + "step": 1976 + }, + { + "epoch": 0.48887240356083084, + "grad_norm": 0.8016225529555583, + "learning_rate": 4.92592965822607e-06, + "loss": 0.4547, + "step": 1977 + }, + { + "epoch": 0.48911968348170126, + "grad_norm": 0.8531911433461715, + "learning_rate": 4.925851105603388e-06, + "loss": 0.4488, + "step": 1978 + }, + { + "epoch": 0.4893669634025717, + "grad_norm": 0.8194447366104644, + "learning_rate": 4.925772511976705e-06, + "loss": 0.4431, + "step": 1979 + }, + { + "epoch": 0.4896142433234421, + "grad_norm": 0.8296301370115579, + "learning_rate": 4.925693877347349e-06, + "loss": 0.4461, + "step": 1980 + }, + { + "epoch": 0.48986152324431254, + "grad_norm": 0.8462798504011503, + "learning_rate": 4.925615201716651e-06, + "loss": 0.443, + "step": 1981 + }, + { + "epoch": 0.49010880316518296, + "grad_norm": 0.8426288056577137, + "learning_rate": 4.92553648508594e-06, + "loss": 0.425, + "step": 1982 + }, + { + "epoch": 0.4903560830860534, + "grad_norm": 0.829429706126443, + "learning_rate": 4.925457727456546e-06, + "loss": 0.4508, + "step": 1983 + }, + { + "epoch": 0.49060336300692386, + "grad_norm": 0.8059370491729095, + "learning_rate": 4.9253789288298e-06, + "loss": 0.4502, + "step": 1984 + }, + { + "epoch": 0.4908506429277943, + "grad_norm": 0.7795504551173943, + "learning_rate": 4.925300089207035e-06, + "loss": 0.4108, + "step": 1985 + }, + { + "epoch": 0.4910979228486647, + "grad_norm": 0.8145867834841408, + "learning_rate": 4.925221208589584e-06, + "loss": 0.4514, + "step": 1986 + }, + { + "epoch": 0.49134520276953514, + "grad_norm": 0.8452473568571481, + "learning_rate": 4.925142286978778e-06, + "loss": 0.4305, + "step": 1987 + }, + { + "epoch": 0.49159248269040556, + "grad_norm": 0.8444624739728522, + "learning_rate": 4.925063324375953e-06, + "loss": 0.4673, + "step": 1988 + }, + { + "epoch": 0.491839762611276, + "grad_norm": 0.8256672771190708, + "learning_rate": 4.9249843207824434e-06, + "loss": 0.4642, + "step": 1989 + }, + { + "epoch": 0.4920870425321464, + "grad_norm": 0.8404370092223706, + "learning_rate": 4.924905276199584e-06, + "loss": 0.4288, + "step": 1990 + }, + { + "epoch": 0.49233432245301684, + "grad_norm": 0.7864654632402006, + "learning_rate": 4.924826190628711e-06, + "loss": 0.4692, + "step": 1991 + }, + { + "epoch": 0.49258160237388726, + "grad_norm": 0.8434933818377831, + "learning_rate": 4.924747064071163e-06, + "loss": 0.4158, + "step": 1992 + }, + { + "epoch": 0.4928288822947577, + "grad_norm": 0.8244874183035269, + "learning_rate": 4.924667896528274e-06, + "loss": 0.4198, + "step": 1993 + }, + { + "epoch": 0.4930761622156281, + "grad_norm": 0.8270973921922327, + "learning_rate": 4.924588688001385e-06, + "loss": 0.4734, + "step": 1994 + }, + { + "epoch": 0.49332344213649854, + "grad_norm": 0.8058932904957486, + "learning_rate": 4.924509438491834e-06, + "loss": 0.4381, + "step": 1995 + }, + { + "epoch": 0.49357072205736896, + "grad_norm": 0.7956261606694742, + "learning_rate": 4.924430148000959e-06, + "loss": 0.429, + "step": 1996 + }, + { + "epoch": 0.4938180019782394, + "grad_norm": 0.7939524290994, + "learning_rate": 4.924350816530104e-06, + "loss": 0.4465, + "step": 1997 + }, + { + "epoch": 0.4940652818991098, + "grad_norm": 0.7916027057232425, + "learning_rate": 4.924271444080606e-06, + "loss": 0.4444, + "step": 1998 + }, + { + "epoch": 0.49431256181998023, + "grad_norm": 0.8173525084366966, + "learning_rate": 4.924192030653808e-06, + "loss": 0.4663, + "step": 1999 + }, + { + "epoch": 0.49455984174085066, + "grad_norm": 0.852494236856881, + "learning_rate": 4.924112576251054e-06, + "loss": 0.454, + "step": 2000 + }, + { + "epoch": 0.4948071216617211, + "grad_norm": 0.7988637477633536, + "learning_rate": 4.924033080873684e-06, + "loss": 0.4392, + "step": 2001 + }, + { + "epoch": 0.4950544015825915, + "grad_norm": 0.8122968158357641, + "learning_rate": 4.923953544523044e-06, + "loss": 0.4637, + "step": 2002 + }, + { + "epoch": 0.49530168150346193, + "grad_norm": 0.81761782432353, + "learning_rate": 4.923873967200479e-06, + "loss": 0.4378, + "step": 2003 + }, + { + "epoch": 0.49554896142433236, + "grad_norm": 0.8633300475989552, + "learning_rate": 4.923794348907331e-06, + "loss": 0.4656, + "step": 2004 + }, + { + "epoch": 0.4957962413452028, + "grad_norm": 0.7944747654872235, + "learning_rate": 4.923714689644948e-06, + "loss": 0.4484, + "step": 2005 + }, + { + "epoch": 0.4960435212660732, + "grad_norm": 0.863856436444404, + "learning_rate": 4.923634989414676e-06, + "loss": 0.4371, + "step": 2006 + }, + { + "epoch": 0.49629080118694363, + "grad_norm": 0.8659860035555645, + "learning_rate": 4.923555248217864e-06, + "loss": 0.4283, + "step": 2007 + }, + { + "epoch": 0.49653808110781406, + "grad_norm": 0.7954268742364704, + "learning_rate": 4.923475466055856e-06, + "loss": 0.4547, + "step": 2008 + }, + { + "epoch": 0.4967853610286845, + "grad_norm": 0.8138754857165406, + "learning_rate": 4.9233956429300034e-06, + "loss": 0.4108, + "step": 2009 + }, + { + "epoch": 0.4970326409495549, + "grad_norm": 0.8581978670153796, + "learning_rate": 4.9233157788416545e-06, + "loss": 0.432, + "step": 2010 + }, + { + "epoch": 0.49727992087042533, + "grad_norm": 0.8241805365555525, + "learning_rate": 4.9232358737921585e-06, + "loss": 0.4382, + "step": 2011 + }, + { + "epoch": 0.49752720079129575, + "grad_norm": 0.8065492877310552, + "learning_rate": 4.923155927782868e-06, + "loss": 0.4384, + "step": 2012 + }, + { + "epoch": 0.4977744807121662, + "grad_norm": 0.8521852417108808, + "learning_rate": 4.923075940815133e-06, + "loss": 0.4417, + "step": 2013 + }, + { + "epoch": 0.4980217606330366, + "grad_norm": 0.8721828748607178, + "learning_rate": 4.922995912890306e-06, + "loss": 0.41, + "step": 2014 + }, + { + "epoch": 0.498269040553907, + "grad_norm": 0.8475215975436352, + "learning_rate": 4.922915844009739e-06, + "loss": 0.4415, + "step": 2015 + }, + { + "epoch": 0.49851632047477745, + "grad_norm": 0.7821130765064389, + "learning_rate": 4.922835734174786e-06, + "loss": 0.4429, + "step": 2016 + }, + { + "epoch": 0.4987636003956479, + "grad_norm": 0.8511117917014502, + "learning_rate": 4.922755583386801e-06, + "loss": 0.4497, + "step": 2017 + }, + { + "epoch": 0.4990108803165183, + "grad_norm": 0.831275796708298, + "learning_rate": 4.92267539164714e-06, + "loss": 0.4578, + "step": 2018 + }, + { + "epoch": 0.4992581602373887, + "grad_norm": 0.8509135804106028, + "learning_rate": 4.922595158957155e-06, + "loss": 0.4569, + "step": 2019 + }, + { + "epoch": 0.49950544015825915, + "grad_norm": 0.8582138029948618, + "learning_rate": 4.922514885318206e-06, + "loss": 0.4386, + "step": 2020 + }, + { + "epoch": 0.4997527200791296, + "grad_norm": 0.8182293486915994, + "learning_rate": 4.922434570731648e-06, + "loss": 0.4499, + "step": 2021 + }, + { + "epoch": 0.5, + "grad_norm": 0.8777366413472809, + "learning_rate": 4.922354215198838e-06, + "loss": 0.4371, + "step": 2022 + }, + { + "epoch": 0.5002472799208705, + "grad_norm": 0.811791173923638, + "learning_rate": 4.922273818721136e-06, + "loss": 0.4672, + "step": 2023 + }, + { + "epoch": 0.5004945598417408, + "grad_norm": 0.8274747192480223, + "learning_rate": 4.922193381299899e-06, + "loss": 0.427, + "step": 2024 + }, + { + "epoch": 0.5007418397626113, + "grad_norm": 0.8065694090822414, + "learning_rate": 4.922112902936489e-06, + "loss": 0.4736, + "step": 2025 + }, + { + "epoch": 0.5009891196834817, + "grad_norm": 0.8128372001404772, + "learning_rate": 4.922032383632263e-06, + "loss": 0.44, + "step": 2026 + }, + { + "epoch": 0.5012363996043522, + "grad_norm": 0.8287731557078928, + "learning_rate": 4.9219518233885856e-06, + "loss": 0.4414, + "step": 2027 + }, + { + "epoch": 0.5014836795252225, + "grad_norm": 0.8063495368442325, + "learning_rate": 4.921871222206817e-06, + "loss": 0.4736, + "step": 2028 + }, + { + "epoch": 0.501730959446093, + "grad_norm": 0.8380024116633067, + "learning_rate": 4.921790580088318e-06, + "loss": 0.4712, + "step": 2029 + }, + { + "epoch": 0.5019782393669634, + "grad_norm": 0.8284317158685391, + "learning_rate": 4.921709897034454e-06, + "loss": 0.4497, + "step": 2030 + }, + { + "epoch": 0.5022255192878339, + "grad_norm": 0.8187102724304874, + "learning_rate": 4.921629173046588e-06, + "loss": 0.4551, + "step": 2031 + }, + { + "epoch": 0.5024727992087042, + "grad_norm": 0.8512236754910734, + "learning_rate": 4.921548408126085e-06, + "loss": 0.4412, + "step": 2032 + }, + { + "epoch": 0.5027200791295747, + "grad_norm": 0.7788881223000658, + "learning_rate": 4.921467602274308e-06, + "loss": 0.4104, + "step": 2033 + }, + { + "epoch": 0.5029673590504451, + "grad_norm": 0.8141480380678998, + "learning_rate": 4.921386755492625e-06, + "loss": 0.4474, + "step": 2034 + }, + { + "epoch": 0.5032146389713156, + "grad_norm": 0.8407140685541208, + "learning_rate": 4.921305867782402e-06, + "loss": 0.4082, + "step": 2035 + }, + { + "epoch": 0.503461918892186, + "grad_norm": 0.8720791912431307, + "learning_rate": 4.9212249391450065e-06, + "loss": 0.4013, + "step": 2036 + }, + { + "epoch": 0.5037091988130564, + "grad_norm": 0.8183434796093931, + "learning_rate": 4.9211439695818065e-06, + "loss": 0.4316, + "step": 2037 + }, + { + "epoch": 0.5039564787339268, + "grad_norm": 0.8449401193598226, + "learning_rate": 4.921062959094169e-06, + "loss": 0.426, + "step": 2038 + }, + { + "epoch": 0.5042037586547973, + "grad_norm": 0.8509257751850934, + "learning_rate": 4.9209819076834655e-06, + "loss": 0.4525, + "step": 2039 + }, + { + "epoch": 0.5044510385756676, + "grad_norm": 0.8519430084094141, + "learning_rate": 4.920900815351065e-06, + "loss": 0.4146, + "step": 2040 + }, + { + "epoch": 0.5046983184965381, + "grad_norm": 0.8456813326281569, + "learning_rate": 4.920819682098338e-06, + "loss": 0.4324, + "step": 2041 + }, + { + "epoch": 0.5049455984174085, + "grad_norm": 0.8287711902840623, + "learning_rate": 4.920738507926657e-06, + "loss": 0.4613, + "step": 2042 + }, + { + "epoch": 0.505192878338279, + "grad_norm": 0.8257370297562611, + "learning_rate": 4.920657292837392e-06, + "loss": 0.4234, + "step": 2043 + }, + { + "epoch": 0.5054401582591493, + "grad_norm": 0.8961393796184989, + "learning_rate": 4.9205760368319175e-06, + "loss": 0.4434, + "step": 2044 + }, + { + "epoch": 0.5056874381800198, + "grad_norm": 0.8204890246004772, + "learning_rate": 4.920494739911607e-06, + "loss": 0.4794, + "step": 2045 + }, + { + "epoch": 0.5059347181008902, + "grad_norm": 0.8376177288707035, + "learning_rate": 4.9204134020778335e-06, + "loss": 0.4282, + "step": 2046 + }, + { + "epoch": 0.5061819980217607, + "grad_norm": 0.834472891929355, + "learning_rate": 4.920332023331973e-06, + "loss": 0.4327, + "step": 2047 + }, + { + "epoch": 0.506429277942631, + "grad_norm": 0.8406099377992237, + "learning_rate": 4.9202506036754e-06, + "loss": 0.4441, + "step": 2048 + }, + { + "epoch": 0.5066765578635015, + "grad_norm": 0.8617534109839463, + "learning_rate": 4.920169143109491e-06, + "loss": 0.4324, + "step": 2049 + }, + { + "epoch": 0.5069238377843719, + "grad_norm": 0.8676859852723221, + "learning_rate": 4.920087641635624e-06, + "loss": 0.4398, + "step": 2050 + }, + { + "epoch": 0.5071711177052424, + "grad_norm": 0.8385705486777898, + "learning_rate": 4.920006099255176e-06, + "loss": 0.45, + "step": 2051 + }, + { + "epoch": 0.5074183976261127, + "grad_norm": 0.8654084128937142, + "learning_rate": 4.919924515969524e-06, + "loss": 0.4203, + "step": 2052 + }, + { + "epoch": 0.5076656775469832, + "grad_norm": 0.8643853671809885, + "learning_rate": 4.919842891780049e-06, + "loss": 0.4562, + "step": 2053 + }, + { + "epoch": 0.5079129574678536, + "grad_norm": 0.8071820873638121, + "learning_rate": 4.919761226688129e-06, + "loss": 0.4174, + "step": 2054 + }, + { + "epoch": 0.5081602373887241, + "grad_norm": 0.8008245242716596, + "learning_rate": 4.9196795206951455e-06, + "loss": 0.4124, + "step": 2055 + }, + { + "epoch": 0.5084075173095944, + "grad_norm": 0.8074972712657118, + "learning_rate": 4.919597773802479e-06, + "loss": 0.448, + "step": 2056 + }, + { + "epoch": 0.5086547972304649, + "grad_norm": 0.8302305293316452, + "learning_rate": 4.919515986011512e-06, + "loss": 0.4729, + "step": 2057 + }, + { + "epoch": 0.5089020771513353, + "grad_norm": 0.8081365268996791, + "learning_rate": 4.919434157323627e-06, + "loss": 0.4572, + "step": 2058 + }, + { + "epoch": 0.5091493570722058, + "grad_norm": 0.8033712684412141, + "learning_rate": 4.919352287740205e-06, + "loss": 0.4382, + "step": 2059 + }, + { + "epoch": 0.5093966369930761, + "grad_norm": 0.7587735127229113, + "learning_rate": 4.919270377262633e-06, + "loss": 0.473, + "step": 2060 + }, + { + "epoch": 0.5096439169139466, + "grad_norm": 0.8619234301971191, + "learning_rate": 4.9191884258922926e-06, + "loss": 0.443, + "step": 2061 + }, + { + "epoch": 0.509891196834817, + "grad_norm": 0.8365792373646538, + "learning_rate": 4.919106433630572e-06, + "loss": 0.4133, + "step": 2062 + }, + { + "epoch": 0.5101384767556875, + "grad_norm": 0.7856790139850826, + "learning_rate": 4.919024400478854e-06, + "loss": 0.4528, + "step": 2063 + }, + { + "epoch": 0.5103857566765578, + "grad_norm": 0.7858186305207652, + "learning_rate": 4.918942326438527e-06, + "loss": 0.4014, + "step": 2064 + }, + { + "epoch": 0.5106330365974283, + "grad_norm": 0.8088878916274455, + "learning_rate": 4.918860211510979e-06, + "loss": 0.4551, + "step": 2065 + }, + { + "epoch": 0.5108803165182987, + "grad_norm": 0.7872550776179847, + "learning_rate": 4.918778055697596e-06, + "loss": 0.4684, + "step": 2066 + }, + { + "epoch": 0.5111275964391692, + "grad_norm": 0.817077611575408, + "learning_rate": 4.918695858999767e-06, + "loss": 0.414, + "step": 2067 + }, + { + "epoch": 0.5113748763600395, + "grad_norm": 0.8378442452576724, + "learning_rate": 4.918613621418883e-06, + "loss": 0.4576, + "step": 2068 + }, + { + "epoch": 0.51162215628091, + "grad_norm": 0.8744527942820566, + "learning_rate": 4.918531342956333e-06, + "loss": 0.4543, + "step": 2069 + }, + { + "epoch": 0.5118694362017804, + "grad_norm": 0.8732192451807397, + "learning_rate": 4.9184490236135075e-06, + "loss": 0.472, + "step": 2070 + }, + { + "epoch": 0.5121167161226509, + "grad_norm": 0.8461949177925873, + "learning_rate": 4.9183666633917986e-06, + "loss": 0.4555, + "step": 2071 + }, + { + "epoch": 0.5123639960435212, + "grad_norm": 0.8418875439184351, + "learning_rate": 4.918284262292597e-06, + "loss": 0.4366, + "step": 2072 + }, + { + "epoch": 0.5126112759643917, + "grad_norm": 0.82720798639419, + "learning_rate": 4.9182018203172986e-06, + "loss": 0.4407, + "step": 2073 + }, + { + "epoch": 0.5128585558852621, + "grad_norm": 0.8025497377538527, + "learning_rate": 4.918119337467293e-06, + "loss": 0.4483, + "step": 2074 + }, + { + "epoch": 0.5131058358061326, + "grad_norm": 0.8042377781404518, + "learning_rate": 4.918036813743978e-06, + "loss": 0.4375, + "step": 2075 + }, + { + "epoch": 0.5133531157270029, + "grad_norm": 0.8480814582795039, + "learning_rate": 4.9179542491487455e-06, + "loss": 0.4151, + "step": 2076 + }, + { + "epoch": 0.5136003956478734, + "grad_norm": 0.8057648300526595, + "learning_rate": 4.917871643682993e-06, + "loss": 0.433, + "step": 2077 + }, + { + "epoch": 0.5138476755687438, + "grad_norm": 0.8360351985687441, + "learning_rate": 4.917788997348116e-06, + "loss": 0.4263, + "step": 2078 + }, + { + "epoch": 0.5140949554896143, + "grad_norm": 0.7756523740903111, + "learning_rate": 4.9177063101455115e-06, + "loss": 0.462, + "step": 2079 + }, + { + "epoch": 0.5143422354104846, + "grad_norm": 0.8654229974106498, + "learning_rate": 4.917623582076577e-06, + "loss": 0.4391, + "step": 2080 + }, + { + "epoch": 0.5145895153313551, + "grad_norm": 0.8568313817015912, + "learning_rate": 4.917540813142712e-06, + "loss": 0.4283, + "step": 2081 + }, + { + "epoch": 0.5148367952522255, + "grad_norm": 0.8649603479345634, + "learning_rate": 4.917458003345314e-06, + "loss": 0.4261, + "step": 2082 + }, + { + "epoch": 0.515084075173096, + "grad_norm": 0.8187465423672693, + "learning_rate": 4.9173751526857835e-06, + "loss": 0.4441, + "step": 2083 + }, + { + "epoch": 0.5153313550939663, + "grad_norm": 0.8573292004045983, + "learning_rate": 4.9172922611655205e-06, + "loss": 0.4337, + "step": 2084 + }, + { + "epoch": 0.5155786350148368, + "grad_norm": 0.8663104974580745, + "learning_rate": 4.917209328785927e-06, + "loss": 0.448, + "step": 2085 + }, + { + "epoch": 0.5158259149357072, + "grad_norm": 0.8473083515505613, + "learning_rate": 4.917126355548404e-06, + "loss": 0.4226, + "step": 2086 + }, + { + "epoch": 0.5160731948565777, + "grad_norm": 0.8896880901970612, + "learning_rate": 4.9170433414543545e-06, + "loss": 0.4093, + "step": 2087 + }, + { + "epoch": 0.516320474777448, + "grad_norm": 0.8806978447051235, + "learning_rate": 4.916960286505181e-06, + "loss": 0.4322, + "step": 2088 + }, + { + "epoch": 0.5165677546983185, + "grad_norm": 0.8451289370207562, + "learning_rate": 4.9168771907022885e-06, + "loss": 0.4176, + "step": 2089 + }, + { + "epoch": 0.5168150346191889, + "grad_norm": 0.7887918587386917, + "learning_rate": 4.91679405404708e-06, + "loss": 0.4333, + "step": 2090 + }, + { + "epoch": 0.5170623145400594, + "grad_norm": 0.8093101065598383, + "learning_rate": 4.916710876540962e-06, + "loss": 0.4482, + "step": 2091 + }, + { + "epoch": 0.5173095944609297, + "grad_norm": 0.8145596529223303, + "learning_rate": 4.916627658185339e-06, + "loss": 0.4291, + "step": 2092 + }, + { + "epoch": 0.5175568743818002, + "grad_norm": 0.8464311393382885, + "learning_rate": 4.9165443989816195e-06, + "loss": 0.4458, + "step": 2093 + }, + { + "epoch": 0.5178041543026706, + "grad_norm": 0.8442575884890843, + "learning_rate": 4.91646109893121e-06, + "loss": 0.4598, + "step": 2094 + }, + { + "epoch": 0.518051434223541, + "grad_norm": 0.8413652586604167, + "learning_rate": 4.916377758035519e-06, + "loss": 0.4618, + "step": 2095 + }, + { + "epoch": 0.5182987141444114, + "grad_norm": 0.8364097355333588, + "learning_rate": 4.916294376295954e-06, + "loss": 0.4465, + "step": 2096 + }, + { + "epoch": 0.5185459940652819, + "grad_norm": 0.8400100432544643, + "learning_rate": 4.916210953713926e-06, + "loss": 0.4313, + "step": 2097 + }, + { + "epoch": 0.5187932739861523, + "grad_norm": 0.8128995366146822, + "learning_rate": 4.916127490290843e-06, + "loss": 0.459, + "step": 2098 + }, + { + "epoch": 0.5190405539070228, + "grad_norm": 0.8073646482891128, + "learning_rate": 4.916043986028117e-06, + "loss": 0.4503, + "step": 2099 + }, + { + "epoch": 0.5192878338278932, + "grad_norm": 0.8355382070231939, + "learning_rate": 4.91596044092716e-06, + "loss": 0.4238, + "step": 2100 + }, + { + "epoch": 0.5195351137487636, + "grad_norm": 0.8290409040475074, + "learning_rate": 4.915876854989384e-06, + "loss": 0.4331, + "step": 2101 + }, + { + "epoch": 0.5197823936696341, + "grad_norm": 0.8473558657695541, + "learning_rate": 4.915793228216201e-06, + "loss": 0.4177, + "step": 2102 + }, + { + "epoch": 0.5200296735905044, + "grad_norm": 0.8168178012275521, + "learning_rate": 4.915709560609025e-06, + "loss": 0.4207, + "step": 2103 + }, + { + "epoch": 0.5202769535113749, + "grad_norm": 0.8349750801854026, + "learning_rate": 4.91562585216927e-06, + "loss": 0.4387, + "step": 2104 + }, + { + "epoch": 0.5205242334322453, + "grad_norm": 0.8332513834402295, + "learning_rate": 4.9155421028983515e-06, + "loss": 0.4475, + "step": 2105 + }, + { + "epoch": 0.5207715133531158, + "grad_norm": 0.823978542351043, + "learning_rate": 4.915458312797684e-06, + "loss": 0.4987, + "step": 2106 + }, + { + "epoch": 0.5210187932739861, + "grad_norm": 0.8171613069077716, + "learning_rate": 4.915374481868685e-06, + "loss": 0.4533, + "step": 2107 + }, + { + "epoch": 0.5212660731948566, + "grad_norm": 0.7912376433723926, + "learning_rate": 4.915290610112772e-06, + "loss": 0.4411, + "step": 2108 + }, + { + "epoch": 0.521513353115727, + "grad_norm": 0.8223882924185978, + "learning_rate": 4.915206697531361e-06, + "loss": 0.4469, + "step": 2109 + }, + { + "epoch": 0.5217606330365975, + "grad_norm": 0.8122382638898773, + "learning_rate": 4.91512274412587e-06, + "loss": 0.4555, + "step": 2110 + }, + { + "epoch": 0.5220079129574678, + "grad_norm": 0.8145767118335325, + "learning_rate": 4.9150387498977205e-06, + "loss": 0.4355, + "step": 2111 + }, + { + "epoch": 0.5222551928783383, + "grad_norm": 0.8041076958743947, + "learning_rate": 4.91495471484833e-06, + "loss": 0.436, + "step": 2112 + }, + { + "epoch": 0.5225024727992087, + "grad_norm": 0.8253205028728051, + "learning_rate": 4.91487063897912e-06, + "loss": 0.4334, + "step": 2113 + }, + { + "epoch": 0.5227497527200792, + "grad_norm": 0.8192406762636769, + "learning_rate": 4.9147865222915114e-06, + "loss": 0.4581, + "step": 2114 + }, + { + "epoch": 0.5229970326409495, + "grad_norm": 0.8354577329747063, + "learning_rate": 4.914702364786926e-06, + "loss": 0.4213, + "step": 2115 + }, + { + "epoch": 0.52324431256182, + "grad_norm": 0.8290837226364314, + "learning_rate": 4.914618166466787e-06, + "loss": 0.4234, + "step": 2116 + }, + { + "epoch": 0.5234915924826904, + "grad_norm": 0.827501095420979, + "learning_rate": 4.914533927332516e-06, + "loss": 0.3891, + "step": 2117 + }, + { + "epoch": 0.5237388724035609, + "grad_norm": 0.8342828261706431, + "learning_rate": 4.91444964738554e-06, + "loss": 0.4336, + "step": 2118 + }, + { + "epoch": 0.5239861523244312, + "grad_norm": 0.7885249793199698, + "learning_rate": 4.914365326627279e-06, + "loss": 0.4374, + "step": 2119 + }, + { + "epoch": 0.5242334322453017, + "grad_norm": 0.8226055988032711, + "learning_rate": 4.914280965059162e-06, + "loss": 0.4655, + "step": 2120 + }, + { + "epoch": 0.5244807121661721, + "grad_norm": 0.8010022488750518, + "learning_rate": 4.914196562682613e-06, + "loss": 0.4765, + "step": 2121 + }, + { + "epoch": 0.5247279920870426, + "grad_norm": 0.8721911721071652, + "learning_rate": 4.91411211949906e-06, + "loss": 0.4082, + "step": 2122 + }, + { + "epoch": 0.5249752720079129, + "grad_norm": 0.8054096169771939, + "learning_rate": 4.914027635509929e-06, + "loss": 0.4274, + "step": 2123 + }, + { + "epoch": 0.5252225519287834, + "grad_norm": 0.8140381801310196, + "learning_rate": 4.913943110716649e-06, + "loss": 0.4405, + "step": 2124 + }, + { + "epoch": 0.5254698318496538, + "grad_norm": 0.8264133414135034, + "learning_rate": 4.913858545120648e-06, + "loss": 0.4604, + "step": 2125 + }, + { + "epoch": 0.5257171117705243, + "grad_norm": 0.8296155953337557, + "learning_rate": 4.913773938723356e-06, + "loss": 0.431, + "step": 2126 + }, + { + "epoch": 0.5259643916913946, + "grad_norm": 0.8390771475583482, + "learning_rate": 4.913689291526203e-06, + "loss": 0.4416, + "step": 2127 + }, + { + "epoch": 0.5262116716122651, + "grad_norm": 0.7978475489974522, + "learning_rate": 4.91360460353062e-06, + "loss": 0.4449, + "step": 2128 + }, + { + "epoch": 0.5264589515331355, + "grad_norm": 0.829407786061133, + "learning_rate": 4.913519874738038e-06, + "loss": 0.4076, + "step": 2129 + }, + { + "epoch": 0.526706231454006, + "grad_norm": 0.8555564981475666, + "learning_rate": 4.913435105149889e-06, + "loss": 0.4267, + "step": 2130 + }, + { + "epoch": 0.5269535113748763, + "grad_norm": 0.7951753815561883, + "learning_rate": 4.913350294767606e-06, + "loss": 0.4283, + "step": 2131 + }, + { + "epoch": 0.5272007912957468, + "grad_norm": 0.7800972057420057, + "learning_rate": 4.913265443592623e-06, + "loss": 0.4148, + "step": 2132 + }, + { + "epoch": 0.5274480712166172, + "grad_norm": 0.8085138173904091, + "learning_rate": 4.913180551626375e-06, + "loss": 0.4119, + "step": 2133 + }, + { + "epoch": 0.5276953511374877, + "grad_norm": 0.8267372632890972, + "learning_rate": 4.913095618870295e-06, + "loss": 0.4228, + "step": 2134 + }, + { + "epoch": 0.527942631058358, + "grad_norm": 0.8510446154835541, + "learning_rate": 4.913010645325819e-06, + "loss": 0.4278, + "step": 2135 + }, + { + "epoch": 0.5281899109792285, + "grad_norm": 0.7552565986314601, + "learning_rate": 4.912925630994384e-06, + "loss": 0.4367, + "step": 2136 + }, + { + "epoch": 0.5284371909000989, + "grad_norm": 0.8004850586370468, + "learning_rate": 4.912840575877427e-06, + "loss": 0.4401, + "step": 2137 + }, + { + "epoch": 0.5286844708209694, + "grad_norm": 0.8428019963490445, + "learning_rate": 4.912755479976386e-06, + "loss": 0.4457, + "step": 2138 + }, + { + "epoch": 0.5289317507418397, + "grad_norm": 0.7976182228965645, + "learning_rate": 4.912670343292698e-06, + "loss": 0.453, + "step": 2139 + }, + { + "epoch": 0.5291790306627102, + "grad_norm": 0.8429254450002306, + "learning_rate": 4.912585165827803e-06, + "loss": 0.456, + "step": 2140 + }, + { + "epoch": 0.5294263105835806, + "grad_norm": 0.7952504632214655, + "learning_rate": 4.9124999475831406e-06, + "loss": 0.455, + "step": 2141 + }, + { + "epoch": 0.5296735905044511, + "grad_norm": 0.8351255206913245, + "learning_rate": 4.912414688560152e-06, + "loss": 0.45, + "step": 2142 + }, + { + "epoch": 0.5299208704253214, + "grad_norm": 0.8073379583341719, + "learning_rate": 4.912329388760277e-06, + "loss": 0.4188, + "step": 2143 + }, + { + "epoch": 0.5301681503461919, + "grad_norm": 0.8233470088787288, + "learning_rate": 4.912244048184958e-06, + "loss": 0.4293, + "step": 2144 + }, + { + "epoch": 0.5304154302670623, + "grad_norm": 0.8453326881951997, + "learning_rate": 4.912158666835638e-06, + "loss": 0.417, + "step": 2145 + }, + { + "epoch": 0.5306627101879328, + "grad_norm": 0.8241068934917218, + "learning_rate": 4.912073244713759e-06, + "loss": 0.4182, + "step": 2146 + }, + { + "epoch": 0.5309099901088031, + "grad_norm": 0.8519056160776881, + "learning_rate": 4.911987781820766e-06, + "loss": 0.4313, + "step": 2147 + }, + { + "epoch": 0.5311572700296736, + "grad_norm": 0.853659838700209, + "learning_rate": 4.911902278158104e-06, + "loss": 0.4107, + "step": 2148 + }, + { + "epoch": 0.531404549950544, + "grad_norm": 0.801594697582263, + "learning_rate": 4.911816733727216e-06, + "loss": 0.4403, + "step": 2149 + }, + { + "epoch": 0.5316518298714145, + "grad_norm": 0.8333840414724853, + "learning_rate": 4.9117311485295504e-06, + "loss": 0.4948, + "step": 2150 + }, + { + "epoch": 0.5318991097922848, + "grad_norm": 0.8255020361468853, + "learning_rate": 4.911645522566553e-06, + "loss": 0.4524, + "step": 2151 + }, + { + "epoch": 0.5321463897131553, + "grad_norm": 0.8213445420795159, + "learning_rate": 4.91155985583967e-06, + "loss": 0.4122, + "step": 2152 + }, + { + "epoch": 0.5323936696340257, + "grad_norm": 0.8492299494064399, + "learning_rate": 4.911474148350351e-06, + "loss": 0.4305, + "step": 2153 + }, + { + "epoch": 0.5326409495548962, + "grad_norm": 0.7898914841030381, + "learning_rate": 4.9113884001000434e-06, + "loss": 0.4021, + "step": 2154 + }, + { + "epoch": 0.5328882294757665, + "grad_norm": 0.854393304558985, + "learning_rate": 4.911302611090198e-06, + "loss": 0.4462, + "step": 2155 + }, + { + "epoch": 0.533135509396637, + "grad_norm": 0.8418573865966376, + "learning_rate": 4.911216781322264e-06, + "loss": 0.4522, + "step": 2156 + }, + { + "epoch": 0.5333827893175074, + "grad_norm": 0.8668734938595644, + "learning_rate": 4.911130910797693e-06, + "loss": 0.422, + "step": 2157 + }, + { + "epoch": 0.5336300692383779, + "grad_norm": 0.8161443060828762, + "learning_rate": 4.911044999517936e-06, + "loss": 0.4137, + "step": 2158 + }, + { + "epoch": 0.5338773491592482, + "grad_norm": 0.812891841442536, + "learning_rate": 4.910959047484443e-06, + "loss": 0.482, + "step": 2159 + }, + { + "epoch": 0.5341246290801187, + "grad_norm": 0.8152358678019482, + "learning_rate": 4.910873054698671e-06, + "loss": 0.4771, + "step": 2160 + }, + { + "epoch": 0.5343719090009891, + "grad_norm": 0.8338420312345085, + "learning_rate": 4.91078702116207e-06, + "loss": 0.4435, + "step": 2161 + }, + { + "epoch": 0.5346191889218596, + "grad_norm": 0.8266232547366531, + "learning_rate": 4.910700946876096e-06, + "loss": 0.4302, + "step": 2162 + }, + { + "epoch": 0.5348664688427299, + "grad_norm": 0.8521482137328993, + "learning_rate": 4.910614831842203e-06, + "loss": 0.4587, + "step": 2163 + }, + { + "epoch": 0.5351137487636004, + "grad_norm": 0.7961612413527702, + "learning_rate": 4.910528676061848e-06, + "loss": 0.4333, + "step": 2164 + }, + { + "epoch": 0.5353610286844708, + "grad_norm": 0.7767270958799087, + "learning_rate": 4.910442479536486e-06, + "loss": 0.4744, + "step": 2165 + }, + { + "epoch": 0.5356083086053413, + "grad_norm": 0.85859905757044, + "learning_rate": 4.910356242267573e-06, + "loss": 0.4421, + "step": 2166 + }, + { + "epoch": 0.5358555885262116, + "grad_norm": 0.8270715526912454, + "learning_rate": 4.91026996425657e-06, + "loss": 0.4013, + "step": 2167 + }, + { + "epoch": 0.5361028684470821, + "grad_norm": 0.8315477443532387, + "learning_rate": 4.910183645504932e-06, + "loss": 0.4191, + "step": 2168 + }, + { + "epoch": 0.5363501483679525, + "grad_norm": 0.8379944677971474, + "learning_rate": 4.91009728601412e-06, + "loss": 0.4465, + "step": 2169 + }, + { + "epoch": 0.536597428288823, + "grad_norm": 0.8729045253998553, + "learning_rate": 4.910010885785593e-06, + "loss": 0.3901, + "step": 2170 + }, + { + "epoch": 0.5368447082096933, + "grad_norm": 0.8667862657203399, + "learning_rate": 4.909924444820812e-06, + "loss": 0.4309, + "step": 2171 + }, + { + "epoch": 0.5370919881305638, + "grad_norm": 0.8580076116689658, + "learning_rate": 4.909837963121236e-06, + "loss": 0.459, + "step": 2172 + }, + { + "epoch": 0.5373392680514342, + "grad_norm": 0.8721818555952473, + "learning_rate": 4.90975144068833e-06, + "loss": 0.4297, + "step": 2173 + }, + { + "epoch": 0.5375865479723047, + "grad_norm": 0.8281536856693168, + "learning_rate": 4.9096648775235555e-06, + "loss": 0.4258, + "step": 2174 + }, + { + "epoch": 0.537833827893175, + "grad_norm": 0.8542025748193519, + "learning_rate": 4.909578273628374e-06, + "loss": 0.4518, + "step": 2175 + }, + { + "epoch": 0.5380811078140455, + "grad_norm": 0.8190991431494749, + "learning_rate": 4.909491629004251e-06, + "loss": 0.4541, + "step": 2176 + }, + { + "epoch": 0.5383283877349159, + "grad_norm": 0.8253186688235863, + "learning_rate": 4.909404943652649e-06, + "loss": 0.4359, + "step": 2177 + }, + { + "epoch": 0.5385756676557863, + "grad_norm": 0.7976389178773332, + "learning_rate": 4.909318217575036e-06, + "loss": 0.4315, + "step": 2178 + }, + { + "epoch": 0.5388229475766568, + "grad_norm": 0.8230735329961125, + "learning_rate": 4.909231450772877e-06, + "loss": 0.4417, + "step": 2179 + }, + { + "epoch": 0.5390702274975272, + "grad_norm": 0.8584664253796905, + "learning_rate": 4.909144643247637e-06, + "loss": 0.4229, + "step": 2180 + }, + { + "epoch": 0.5393175074183977, + "grad_norm": 0.8369916036027719, + "learning_rate": 4.909057795000786e-06, + "loss": 0.4209, + "step": 2181 + }, + { + "epoch": 0.539564787339268, + "grad_norm": 0.8213458816819943, + "learning_rate": 4.90897090603379e-06, + "loss": 0.4552, + "step": 2182 + }, + { + "epoch": 0.5398120672601385, + "grad_norm": 0.7906532652889825, + "learning_rate": 4.908883976348118e-06, + "loss": 0.4441, + "step": 2183 + }, + { + "epoch": 0.5400593471810089, + "grad_norm": 0.807640026645016, + "learning_rate": 4.908797005945239e-06, + "loss": 0.4002, + "step": 2184 + }, + { + "epoch": 0.5403066271018794, + "grad_norm": 0.8040101227089689, + "learning_rate": 4.908709994826625e-06, + "loss": 0.4424, + "step": 2185 + }, + { + "epoch": 0.5405539070227497, + "grad_norm": 0.7864043573853555, + "learning_rate": 4.9086229429937445e-06, + "loss": 0.438, + "step": 2186 + }, + { + "epoch": 0.5408011869436202, + "grad_norm": 0.8279325371607597, + "learning_rate": 4.908535850448071e-06, + "loss": 0.4508, + "step": 2187 + }, + { + "epoch": 0.5410484668644906, + "grad_norm": 0.8115008524086239, + "learning_rate": 4.908448717191074e-06, + "loss": 0.4323, + "step": 2188 + }, + { + "epoch": 0.5412957467853611, + "grad_norm": 0.9052085553109384, + "learning_rate": 4.9083615432242285e-06, + "loss": 0.4108, + "step": 2189 + }, + { + "epoch": 0.5415430267062314, + "grad_norm": 0.8303476204316234, + "learning_rate": 4.908274328549006e-06, + "loss": 0.4361, + "step": 2190 + }, + { + "epoch": 0.5417903066271019, + "grad_norm": 0.7947377340525147, + "learning_rate": 4.908187073166883e-06, + "loss": 0.4392, + "step": 2191 + }, + { + "epoch": 0.5420375865479723, + "grad_norm": 0.7966996712846387, + "learning_rate": 4.908099777079334e-06, + "loss": 0.4436, + "step": 2192 + }, + { + "epoch": 0.5422848664688428, + "grad_norm": 0.7775724874301819, + "learning_rate": 4.908012440287833e-06, + "loss": 0.446, + "step": 2193 + }, + { + "epoch": 0.5425321463897131, + "grad_norm": 0.8067580307740871, + "learning_rate": 4.907925062793858e-06, + "loss": 0.4667, + "step": 2194 + }, + { + "epoch": 0.5427794263105836, + "grad_norm": 0.8398863268604441, + "learning_rate": 4.907837644598884e-06, + "loss": 0.4295, + "step": 2195 + }, + { + "epoch": 0.543026706231454, + "grad_norm": 0.8404337735535851, + "learning_rate": 4.90775018570439e-06, + "loss": 0.4308, + "step": 2196 + }, + { + "epoch": 0.5432739861523245, + "grad_norm": 0.8528144316022201, + "learning_rate": 4.907662686111854e-06, + "loss": 0.4287, + "step": 2197 + }, + { + "epoch": 0.5435212660731948, + "grad_norm": 0.7777032861160702, + "learning_rate": 4.907575145822755e-06, + "loss": 0.4426, + "step": 2198 + }, + { + "epoch": 0.5437685459940653, + "grad_norm": 0.8135680163087625, + "learning_rate": 4.907487564838573e-06, + "loss": 0.448, + "step": 2199 + }, + { + "epoch": 0.5440158259149357, + "grad_norm": 0.800459169825187, + "learning_rate": 4.907399943160787e-06, + "loss": 0.4259, + "step": 2200 + }, + { + "epoch": 0.5442631058358062, + "grad_norm": 0.8160972567128584, + "learning_rate": 4.9073122807908815e-06, + "loss": 0.4278, + "step": 2201 + }, + { + "epoch": 0.5445103857566765, + "grad_norm": 0.8310785106901327, + "learning_rate": 4.907224577730334e-06, + "loss": 0.4305, + "step": 2202 + }, + { + "epoch": 0.544757665677547, + "grad_norm": 0.828287917795897, + "learning_rate": 4.907136833980629e-06, + "loss": 0.4288, + "step": 2203 + }, + { + "epoch": 0.5450049455984174, + "grad_norm": 0.8524021235419231, + "learning_rate": 4.907049049543249e-06, + "loss": 0.4648, + "step": 2204 + }, + { + "epoch": 0.5452522255192879, + "grad_norm": 0.8054678015571812, + "learning_rate": 4.906961224419679e-06, + "loss": 0.3964, + "step": 2205 + }, + { + "epoch": 0.5454995054401582, + "grad_norm": 0.870197504575575, + "learning_rate": 4.9068733586114025e-06, + "loss": 0.4401, + "step": 2206 + }, + { + "epoch": 0.5457467853610287, + "grad_norm": 0.7999008358756727, + "learning_rate": 4.9067854521199055e-06, + "loss": 0.4216, + "step": 2207 + }, + { + "epoch": 0.5459940652818991, + "grad_norm": 0.7757865860444793, + "learning_rate": 4.906697504946672e-06, + "loss": 0.4631, + "step": 2208 + }, + { + "epoch": 0.5462413452027696, + "grad_norm": 0.8179813608668347, + "learning_rate": 4.906609517093192e-06, + "loss": 0.4291, + "step": 2209 + }, + { + "epoch": 0.5464886251236399, + "grad_norm": 0.8254017964289843, + "learning_rate": 4.906521488560949e-06, + "loss": 0.4165, + "step": 2210 + }, + { + "epoch": 0.5467359050445104, + "grad_norm": 0.8365497028422612, + "learning_rate": 4.906433419351433e-06, + "loss": 0.414, + "step": 2211 + }, + { + "epoch": 0.5469831849653808, + "grad_norm": 0.8249052368071237, + "learning_rate": 4.906345309466131e-06, + "loss": 0.4337, + "step": 2212 + }, + { + "epoch": 0.5472304648862513, + "grad_norm": 0.793602182328121, + "learning_rate": 4.906257158906536e-06, + "loss": 0.4529, + "step": 2213 + }, + { + "epoch": 0.5474777448071216, + "grad_norm": 0.7932116491251698, + "learning_rate": 4.9061689676741335e-06, + "loss": 0.4417, + "step": 2214 + }, + { + "epoch": 0.5477250247279921, + "grad_norm": 0.8341760080668459, + "learning_rate": 4.906080735770417e-06, + "loss": 0.4301, + "step": 2215 + }, + { + "epoch": 0.5479723046488625, + "grad_norm": 0.8190689937567308, + "learning_rate": 4.905992463196877e-06, + "loss": 0.4548, + "step": 2216 + }, + { + "epoch": 0.548219584569733, + "grad_norm": 0.8905987162330603, + "learning_rate": 4.9059041499550055e-06, + "loss": 0.4423, + "step": 2217 + }, + { + "epoch": 0.5484668644906033, + "grad_norm": 0.8042704183629057, + "learning_rate": 4.905815796046296e-06, + "loss": 0.4159, + "step": 2218 + }, + { + "epoch": 0.5487141444114738, + "grad_norm": 0.7900037057341976, + "learning_rate": 4.905727401472241e-06, + "loss": 0.4726, + "step": 2219 + }, + { + "epoch": 0.5489614243323442, + "grad_norm": 0.8418688230703245, + "learning_rate": 4.905638966234335e-06, + "loss": 0.4332, + "step": 2220 + }, + { + "epoch": 0.5492087042532147, + "grad_norm": 0.8111111477828057, + "learning_rate": 4.905550490334072e-06, + "loss": 0.4698, + "step": 2221 + }, + { + "epoch": 0.549455984174085, + "grad_norm": 0.7790535493069579, + "learning_rate": 4.90546197377295e-06, + "loss": 0.4388, + "step": 2222 + }, + { + "epoch": 0.5497032640949555, + "grad_norm": 0.8159173255576104, + "learning_rate": 4.905373416552463e-06, + "loss": 0.4341, + "step": 2223 + }, + { + "epoch": 0.5499505440158259, + "grad_norm": 0.8068905887185784, + "learning_rate": 4.905284818674107e-06, + "loss": 0.4507, + "step": 2224 + }, + { + "epoch": 0.5501978239366964, + "grad_norm": 0.7975516282689951, + "learning_rate": 4.905196180139382e-06, + "loss": 0.4275, + "step": 2225 + }, + { + "epoch": 0.5504451038575667, + "grad_norm": 0.8363033551597234, + "learning_rate": 4.905107500949785e-06, + "loss": 0.4033, + "step": 2226 + }, + { + "epoch": 0.5506923837784372, + "grad_norm": 0.8446055816925565, + "learning_rate": 4.905018781106815e-06, + "loss": 0.4158, + "step": 2227 + }, + { + "epoch": 0.5509396636993076, + "grad_norm": 0.8059400557615801, + "learning_rate": 4.904930020611972e-06, + "loss": 0.4892, + "step": 2228 + }, + { + "epoch": 0.5511869436201781, + "grad_norm": 0.8230403015167589, + "learning_rate": 4.904841219466756e-06, + "loss": 0.4299, + "step": 2229 + }, + { + "epoch": 0.5514342235410484, + "grad_norm": 0.8136682923193483, + "learning_rate": 4.904752377672668e-06, + "loss": 0.4462, + "step": 2230 + }, + { + "epoch": 0.5516815034619189, + "grad_norm": 0.8024451412346982, + "learning_rate": 4.90466349523121e-06, + "loss": 0.4318, + "step": 2231 + }, + { + "epoch": 0.5519287833827893, + "grad_norm": 0.8662454751543324, + "learning_rate": 4.904574572143883e-06, + "loss": 0.4286, + "step": 2232 + }, + { + "epoch": 0.5521760633036598, + "grad_norm": 0.7864315924989522, + "learning_rate": 4.904485608412193e-06, + "loss": 0.4594, + "step": 2233 + }, + { + "epoch": 0.5524233432245301, + "grad_norm": 0.7968996130135781, + "learning_rate": 4.90439660403764e-06, + "loss": 0.4429, + "step": 2234 + }, + { + "epoch": 0.5526706231454006, + "grad_norm": 0.8056598325190589, + "learning_rate": 4.904307559021731e-06, + "loss": 0.4406, + "step": 2235 + }, + { + "epoch": 0.552917903066271, + "grad_norm": 0.7968152385905476, + "learning_rate": 4.9042184733659716e-06, + "loss": 0.4491, + "step": 2236 + }, + { + "epoch": 0.5531651829871415, + "grad_norm": 0.8240548698911649, + "learning_rate": 4.904129347071866e-06, + "loss": 0.4467, + "step": 2237 + }, + { + "epoch": 0.5534124629080118, + "grad_norm": 0.8459774707886466, + "learning_rate": 4.904040180140921e-06, + "loss": 0.4174, + "step": 2238 + }, + { + "epoch": 0.5536597428288823, + "grad_norm": 0.7962096726579939, + "learning_rate": 4.903950972574644e-06, + "loss": 0.4378, + "step": 2239 + }, + { + "epoch": 0.5539070227497527, + "grad_norm": 0.7956323439925366, + "learning_rate": 4.903861724374542e-06, + "loss": 0.4728, + "step": 2240 + }, + { + "epoch": 0.5541543026706232, + "grad_norm": 0.874177788619472, + "learning_rate": 4.903772435542126e-06, + "loss": 0.4302, + "step": 2241 + }, + { + "epoch": 0.5544015825914935, + "grad_norm": 0.8428091902882662, + "learning_rate": 4.9036831060789025e-06, + "loss": 0.4645, + "step": 2242 + }, + { + "epoch": 0.554648862512364, + "grad_norm": 0.8492857888020867, + "learning_rate": 4.903593735986383e-06, + "loss": 0.4369, + "step": 2243 + }, + { + "epoch": 0.5548961424332344, + "grad_norm": 0.8303933191114768, + "learning_rate": 4.903504325266077e-06, + "loss": 0.4068, + "step": 2244 + }, + { + "epoch": 0.5551434223541049, + "grad_norm": 0.827774144006841, + "learning_rate": 4.903414873919497e-06, + "loss": 0.4302, + "step": 2245 + }, + { + "epoch": 0.5553907022749752, + "grad_norm": 0.8032810656584819, + "learning_rate": 4.903325381948154e-06, + "loss": 0.4324, + "step": 2246 + }, + { + "epoch": 0.5556379821958457, + "grad_norm": 0.7855184005897174, + "learning_rate": 4.903235849353562e-06, + "loss": 0.4413, + "step": 2247 + }, + { + "epoch": 0.5558852621167161, + "grad_norm": 0.8569371908994023, + "learning_rate": 4.903146276137233e-06, + "loss": 0.4295, + "step": 2248 + }, + { + "epoch": 0.5561325420375866, + "grad_norm": 0.8228709188562383, + "learning_rate": 4.903056662300682e-06, + "loss": 0.4093, + "step": 2249 + }, + { + "epoch": 0.5563798219584569, + "grad_norm": 0.7978752521898264, + "learning_rate": 4.9029670078454225e-06, + "loss": 0.4483, + "step": 2250 + }, + { + "epoch": 0.5566271018793274, + "grad_norm": 0.8386855590201181, + "learning_rate": 4.902877312772973e-06, + "loss": 0.4367, + "step": 2251 + }, + { + "epoch": 0.5568743818001978, + "grad_norm": 0.8097659858522324, + "learning_rate": 4.902787577084844e-06, + "loss": 0.4458, + "step": 2252 + }, + { + "epoch": 0.5571216617210683, + "grad_norm": 0.7981580393798596, + "learning_rate": 4.902697800782558e-06, + "loss": 0.4582, + "step": 2253 + }, + { + "epoch": 0.5573689416419386, + "grad_norm": 0.8063797456708763, + "learning_rate": 4.9026079838676295e-06, + "loss": 0.4467, + "step": 2254 + }, + { + "epoch": 0.5576162215628091, + "grad_norm": 0.8263331039140084, + "learning_rate": 4.902518126341577e-06, + "loss": 0.4623, + "step": 2255 + }, + { + "epoch": 0.5578635014836796, + "grad_norm": 0.7836176844916037, + "learning_rate": 4.90242822820592e-06, + "loss": 0.4364, + "step": 2256 + }, + { + "epoch": 0.55811078140455, + "grad_norm": 0.8055018134426273, + "learning_rate": 4.9023382894621775e-06, + "loss": 0.4334, + "step": 2257 + }, + { + "epoch": 0.5583580613254204, + "grad_norm": 0.8561517123386011, + "learning_rate": 4.90224831011187e-06, + "loss": 0.4232, + "step": 2258 + }, + { + "epoch": 0.5586053412462908, + "grad_norm": 0.857649910835037, + "learning_rate": 4.902158290156518e-06, + "loss": 0.4254, + "step": 2259 + }, + { + "epoch": 0.5588526211671613, + "grad_norm": 0.847810648250041, + "learning_rate": 4.902068229597644e-06, + "loss": 0.4346, + "step": 2260 + }, + { + "epoch": 0.5590999010880316, + "grad_norm": 0.8041283790378243, + "learning_rate": 4.901978128436769e-06, + "loss": 0.4451, + "step": 2261 + }, + { + "epoch": 0.5593471810089021, + "grad_norm": 0.8463729671090765, + "learning_rate": 4.901887986675418e-06, + "loss": 0.4629, + "step": 2262 + }, + { + "epoch": 0.5595944609297725, + "grad_norm": 0.8255668378936192, + "learning_rate": 4.901797804315112e-06, + "loss": 0.4526, + "step": 2263 + }, + { + "epoch": 0.559841740850643, + "grad_norm": 0.8397646315208941, + "learning_rate": 4.901707581357377e-06, + "loss": 0.4475, + "step": 2264 + }, + { + "epoch": 0.5600890207715133, + "grad_norm": 0.8508504303490839, + "learning_rate": 4.901617317803738e-06, + "loss": 0.4254, + "step": 2265 + }, + { + "epoch": 0.5603363006923838, + "grad_norm": 0.7740850838820026, + "learning_rate": 4.9015270136557204e-06, + "loss": 0.4467, + "step": 2266 + }, + { + "epoch": 0.5605835806132542, + "grad_norm": 0.8227529902841786, + "learning_rate": 4.9014366689148504e-06, + "loss": 0.4195, + "step": 2267 + }, + { + "epoch": 0.5608308605341247, + "grad_norm": 0.777112849292443, + "learning_rate": 4.9013462835826564e-06, + "loss": 0.436, + "step": 2268 + }, + { + "epoch": 0.561078140454995, + "grad_norm": 0.802035885633998, + "learning_rate": 4.901255857660664e-06, + "loss": 0.4443, + "step": 2269 + }, + { + "epoch": 0.5613254203758655, + "grad_norm": 0.7937144984071973, + "learning_rate": 4.9011653911504035e-06, + "loss": 0.4623, + "step": 2270 + }, + { + "epoch": 0.5615727002967359, + "grad_norm": 0.8332439563357789, + "learning_rate": 4.901074884053403e-06, + "loss": 0.4373, + "step": 2271 + }, + { + "epoch": 0.5618199802176064, + "grad_norm": 0.826393197627037, + "learning_rate": 4.900984336371192e-06, + "loss": 0.4567, + "step": 2272 + }, + { + "epoch": 0.5620672601384767, + "grad_norm": 0.8725228788732688, + "learning_rate": 4.900893748105303e-06, + "loss": 0.4052, + "step": 2273 + }, + { + "epoch": 0.5623145400593472, + "grad_norm": 0.8350692058768774, + "learning_rate": 4.900803119257265e-06, + "loss": 0.4229, + "step": 2274 + }, + { + "epoch": 0.5625618199802176, + "grad_norm": 0.7833697423172672, + "learning_rate": 4.900712449828611e-06, + "loss": 0.4712, + "step": 2275 + }, + { + "epoch": 0.5628090999010881, + "grad_norm": 0.8084407515407549, + "learning_rate": 4.9006217398208735e-06, + "loss": 0.4767, + "step": 2276 + }, + { + "epoch": 0.5630563798219584, + "grad_norm": 0.8163996352400188, + "learning_rate": 4.900530989235586e-06, + "loss": 0.4593, + "step": 2277 + }, + { + "epoch": 0.5633036597428289, + "grad_norm": 0.8359981175249533, + "learning_rate": 4.9004401980742814e-06, + "loss": 0.4633, + "step": 2278 + }, + { + "epoch": 0.5635509396636993, + "grad_norm": 0.8165262581141532, + "learning_rate": 4.900349366338495e-06, + "loss": 0.4334, + "step": 2279 + }, + { + "epoch": 0.5637982195845698, + "grad_norm": 0.8640529713261024, + "learning_rate": 4.900258494029763e-06, + "loss": 0.4489, + "step": 2280 + }, + { + "epoch": 0.5640454995054401, + "grad_norm": 0.8400644684388208, + "learning_rate": 4.90016758114962e-06, + "loss": 0.4168, + "step": 2281 + }, + { + "epoch": 0.5642927794263106, + "grad_norm": 0.8594985484661135, + "learning_rate": 4.9000766276996025e-06, + "loss": 0.443, + "step": 2282 + }, + { + "epoch": 0.564540059347181, + "grad_norm": 0.7941467731546799, + "learning_rate": 4.8999856336812495e-06, + "loss": 0.4597, + "step": 2283 + }, + { + "epoch": 0.5647873392680515, + "grad_norm": 0.8334239105274157, + "learning_rate": 4.899894599096098e-06, + "loss": 0.4527, + "step": 2284 + }, + { + "epoch": 0.5650346191889218, + "grad_norm": 0.8680664900329874, + "learning_rate": 4.899803523945688e-06, + "loss": 0.4288, + "step": 2285 + }, + { + "epoch": 0.5652818991097923, + "grad_norm": 0.827260333304532, + "learning_rate": 4.899712408231556e-06, + "loss": 0.4289, + "step": 2286 + }, + { + "epoch": 0.5655291790306627, + "grad_norm": 0.8183857986951192, + "learning_rate": 4.899621251955245e-06, + "loss": 0.4149, + "step": 2287 + }, + { + "epoch": 0.5657764589515332, + "grad_norm": 0.8184921245944378, + "learning_rate": 4.899530055118295e-06, + "loss": 0.4581, + "step": 2288 + }, + { + "epoch": 0.5660237388724035, + "grad_norm": 0.846124403399315, + "learning_rate": 4.899438817722248e-06, + "loss": 0.4104, + "step": 2289 + }, + { + "epoch": 0.566271018793274, + "grad_norm": 0.8560378103575694, + "learning_rate": 4.899347539768644e-06, + "loss": 0.4358, + "step": 2290 + }, + { + "epoch": 0.5665182987141444, + "grad_norm": 0.8072714154604986, + "learning_rate": 4.899256221259028e-06, + "loss": 0.4087, + "step": 2291 + }, + { + "epoch": 0.5667655786350149, + "grad_norm": 0.7711814123606188, + "learning_rate": 4.899164862194943e-06, + "loss": 0.4623, + "step": 2292 + }, + { + "epoch": 0.5670128585558852, + "grad_norm": 0.7892717211222391, + "learning_rate": 4.899073462577933e-06, + "loss": 0.4566, + "step": 2293 + }, + { + "epoch": 0.5672601384767557, + "grad_norm": 0.8024640398701117, + "learning_rate": 4.898982022409543e-06, + "loss": 0.4082, + "step": 2294 + }, + { + "epoch": 0.5675074183976261, + "grad_norm": 0.8267656493786144, + "learning_rate": 4.898890541691319e-06, + "loss": 0.4373, + "step": 2295 + }, + { + "epoch": 0.5677546983184966, + "grad_norm": 0.804452429351204, + "learning_rate": 4.898799020424806e-06, + "loss": 0.4303, + "step": 2296 + }, + { + "epoch": 0.5680019782393669, + "grad_norm": 0.8341753209395127, + "learning_rate": 4.8987074586115535e-06, + "loss": 0.4246, + "step": 2297 + }, + { + "epoch": 0.5682492581602374, + "grad_norm": 0.8332330741798474, + "learning_rate": 4.898615856253107e-06, + "loss": 0.4177, + "step": 2298 + }, + { + "epoch": 0.5684965380811078, + "grad_norm": 0.7813581360985322, + "learning_rate": 4.898524213351015e-06, + "loss": 0.4176, + "step": 2299 + }, + { + "epoch": 0.5687438180019783, + "grad_norm": 0.822524089394135, + "learning_rate": 4.898432529906827e-06, + "loss": 0.465, + "step": 2300 + }, + { + "epoch": 0.5689910979228486, + "grad_norm": 0.9011958348262217, + "learning_rate": 4.8983408059220935e-06, + "loss": 0.4204, + "step": 2301 + }, + { + "epoch": 0.5692383778437191, + "grad_norm": 0.8306695477074517, + "learning_rate": 4.898249041398363e-06, + "loss": 0.4838, + "step": 2302 + }, + { + "epoch": 0.5694856577645895, + "grad_norm": 0.8522958342818597, + "learning_rate": 4.898157236337189e-06, + "loss": 0.4461, + "step": 2303 + }, + { + "epoch": 0.56973293768546, + "grad_norm": 0.8436724215518849, + "learning_rate": 4.898065390740121e-06, + "loss": 0.4555, + "step": 2304 + }, + { + "epoch": 0.5699802176063303, + "grad_norm": 0.7847752288452432, + "learning_rate": 4.8979735046087126e-06, + "loss": 0.4805, + "step": 2305 + }, + { + "epoch": 0.5702274975272008, + "grad_norm": 0.8250677875140624, + "learning_rate": 4.897881577944517e-06, + "loss": 0.4447, + "step": 2306 + }, + { + "epoch": 0.5704747774480712, + "grad_norm": 0.8081462932632263, + "learning_rate": 4.897789610749088e-06, + "loss": 0.4424, + "step": 2307 + }, + { + "epoch": 0.5707220573689417, + "grad_norm": 0.8776056253920698, + "learning_rate": 4.89769760302398e-06, + "loss": 0.4045, + "step": 2308 + }, + { + "epoch": 0.570969337289812, + "grad_norm": 0.8319100971376697, + "learning_rate": 4.897605554770747e-06, + "loss": 0.4583, + "step": 2309 + }, + { + "epoch": 0.5712166172106825, + "grad_norm": 0.8083704554589096, + "learning_rate": 4.897513465990947e-06, + "loss": 0.4305, + "step": 2310 + }, + { + "epoch": 0.5714638971315529, + "grad_norm": 0.8477139545852911, + "learning_rate": 4.897421336686136e-06, + "loss": 0.4139, + "step": 2311 + }, + { + "epoch": 0.5717111770524234, + "grad_norm": 0.7994312192421875, + "learning_rate": 4.8973291668578705e-06, + "loss": 0.4224, + "step": 2312 + }, + { + "epoch": 0.5719584569732937, + "grad_norm": 0.834166390095418, + "learning_rate": 4.897236956507708e-06, + "loss": 0.441, + "step": 2313 + }, + { + "epoch": 0.5722057368941642, + "grad_norm": 0.791463316947777, + "learning_rate": 4.897144705637209e-06, + "loss": 0.4207, + "step": 2314 + }, + { + "epoch": 0.5724530168150346, + "grad_norm": 0.8337894517778417, + "learning_rate": 4.897052414247931e-06, + "loss": 0.432, + "step": 2315 + }, + { + "epoch": 0.5727002967359051, + "grad_norm": 0.8463594355939713, + "learning_rate": 4.8969600823414344e-06, + "loss": 0.4503, + "step": 2316 + }, + { + "epoch": 0.5729475766567754, + "grad_norm": 0.8018986013541408, + "learning_rate": 4.896867709919281e-06, + "loss": 0.4249, + "step": 2317 + }, + { + "epoch": 0.5731948565776459, + "grad_norm": 0.7905890182712009, + "learning_rate": 4.896775296983031e-06, + "loss": 0.4289, + "step": 2318 + }, + { + "epoch": 0.5734421364985163, + "grad_norm": 0.8617968858880003, + "learning_rate": 4.896682843534247e-06, + "loss": 0.4388, + "step": 2319 + }, + { + "epoch": 0.5736894164193868, + "grad_norm": 0.7938863568430938, + "learning_rate": 4.896590349574492e-06, + "loss": 0.4278, + "step": 2320 + }, + { + "epoch": 0.5739366963402571, + "grad_norm": 0.8029197121608271, + "learning_rate": 4.8964978151053275e-06, + "loss": 0.4108, + "step": 2321 + }, + { + "epoch": 0.5741839762611276, + "grad_norm": 0.8299560729574813, + "learning_rate": 4.89640524012832e-06, + "loss": 0.4709, + "step": 2322 + }, + { + "epoch": 0.574431256181998, + "grad_norm": 0.796430638715101, + "learning_rate": 4.8963126246450335e-06, + "loss": 0.4285, + "step": 2323 + }, + { + "epoch": 0.5746785361028685, + "grad_norm": 0.8555593774864729, + "learning_rate": 4.8962199686570335e-06, + "loss": 0.4079, + "step": 2324 + }, + { + "epoch": 0.5749258160237388, + "grad_norm": 0.803728746348809, + "learning_rate": 4.896127272165886e-06, + "loss": 0.4427, + "step": 2325 + }, + { + "epoch": 0.5751730959446093, + "grad_norm": 0.8023320061091146, + "learning_rate": 4.896034535173158e-06, + "loss": 0.426, + "step": 2326 + }, + { + "epoch": 0.5754203758654797, + "grad_norm": 0.7665915360838667, + "learning_rate": 4.895941757680415e-06, + "loss": 0.4767, + "step": 2327 + }, + { + "epoch": 0.5756676557863502, + "grad_norm": 0.8435315407885349, + "learning_rate": 4.8958489396892286e-06, + "loss": 0.4291, + "step": 2328 + }, + { + "epoch": 0.5759149357072205, + "grad_norm": 0.804335725986931, + "learning_rate": 4.895756081201166e-06, + "loss": 0.4368, + "step": 2329 + }, + { + "epoch": 0.576162215628091, + "grad_norm": 0.8100890310943418, + "learning_rate": 4.895663182217797e-06, + "loss": 0.4465, + "step": 2330 + }, + { + "epoch": 0.5764094955489614, + "grad_norm": 0.8408293669759955, + "learning_rate": 4.895570242740692e-06, + "loss": 0.4538, + "step": 2331 + }, + { + "epoch": 0.5766567754698319, + "grad_norm": 0.8239576081794222, + "learning_rate": 4.895477262771422e-06, + "loss": 0.4191, + "step": 2332 + }, + { + "epoch": 0.5769040553907022, + "grad_norm": 0.8557709570378603, + "learning_rate": 4.895384242311557e-06, + "loss": 0.4175, + "step": 2333 + }, + { + "epoch": 0.5771513353115727, + "grad_norm": 0.8297806420249763, + "learning_rate": 4.895291181362673e-06, + "loss": 0.4594, + "step": 2334 + }, + { + "epoch": 0.5773986152324432, + "grad_norm": 0.8394220393681534, + "learning_rate": 4.895198079926339e-06, + "loss": 0.433, + "step": 2335 + }, + { + "epoch": 0.5776458951533135, + "grad_norm": 0.8445647987604912, + "learning_rate": 4.895104938004131e-06, + "loss": 0.4482, + "step": 2336 + }, + { + "epoch": 0.577893175074184, + "grad_norm": 0.764743578386273, + "learning_rate": 4.895011755597622e-06, + "loss": 0.4393, + "step": 2337 + }, + { + "epoch": 0.5781404549950544, + "grad_norm": 0.7726672754820177, + "learning_rate": 4.894918532708388e-06, + "loss": 0.4574, + "step": 2338 + }, + { + "epoch": 0.5783877349159249, + "grad_norm": 0.8421556128246851, + "learning_rate": 4.894825269338005e-06, + "loss": 0.4334, + "step": 2339 + }, + { + "epoch": 0.5786350148367952, + "grad_norm": 0.8046307352593715, + "learning_rate": 4.894731965488049e-06, + "loss": 0.46, + "step": 2340 + }, + { + "epoch": 0.5788822947576657, + "grad_norm": 0.8135881552965002, + "learning_rate": 4.894638621160097e-06, + "loss": 0.4574, + "step": 2341 + }, + { + "epoch": 0.5791295746785361, + "grad_norm": 0.8889421866638874, + "learning_rate": 4.894545236355728e-06, + "loss": 0.4498, + "step": 2342 + }, + { + "epoch": 0.5793768545994066, + "grad_norm": 0.801204639738931, + "learning_rate": 4.894451811076518e-06, + "loss": 0.4134, + "step": 2343 + }, + { + "epoch": 0.579624134520277, + "grad_norm": 0.7606419385924601, + "learning_rate": 4.894358345324047e-06, + "loss": 0.4414, + "step": 2344 + }, + { + "epoch": 0.5798714144411474, + "grad_norm": 0.7991068318546964, + "learning_rate": 4.894264839099897e-06, + "loss": 0.4416, + "step": 2345 + }, + { + "epoch": 0.5801186943620178, + "grad_norm": 0.7771589957974486, + "learning_rate": 4.894171292405646e-06, + "loss": 0.4342, + "step": 2346 + }, + { + "epoch": 0.5803659742828883, + "grad_norm": 0.7968997920942941, + "learning_rate": 4.894077705242877e-06, + "loss": 0.4354, + "step": 2347 + }, + { + "epoch": 0.5806132542037586, + "grad_norm": 0.8548778972922664, + "learning_rate": 4.8939840776131695e-06, + "loss": 0.4334, + "step": 2348 + }, + { + "epoch": 0.5808605341246291, + "grad_norm": 0.7816682107099382, + "learning_rate": 4.893890409518108e-06, + "loss": 0.4431, + "step": 2349 + }, + { + "epoch": 0.5811078140454995, + "grad_norm": 0.799351712968776, + "learning_rate": 4.893796700959277e-06, + "loss": 0.4222, + "step": 2350 + }, + { + "epoch": 0.58135509396637, + "grad_norm": 0.8482320335522718, + "learning_rate": 4.893702951938257e-06, + "loss": 0.4419, + "step": 2351 + }, + { + "epoch": 0.5816023738872403, + "grad_norm": 0.8284555077453484, + "learning_rate": 4.8936091624566355e-06, + "loss": 0.4476, + "step": 2352 + }, + { + "epoch": 0.5818496538081108, + "grad_norm": 0.8275396827906788, + "learning_rate": 4.893515332515996e-06, + "loss": 0.4343, + "step": 2353 + }, + { + "epoch": 0.5820969337289812, + "grad_norm": 0.7981461681609753, + "learning_rate": 4.893421462117926e-06, + "loss": 0.4716, + "step": 2354 + }, + { + "epoch": 0.5823442136498517, + "grad_norm": 0.8157820886340437, + "learning_rate": 4.893327551264011e-06, + "loss": 0.4425, + "step": 2355 + }, + { + "epoch": 0.582591493570722, + "grad_norm": 0.8458746953030688, + "learning_rate": 4.893233599955839e-06, + "loss": 0.4378, + "step": 2356 + }, + { + "epoch": 0.5828387734915925, + "grad_norm": 0.7723732034558244, + "learning_rate": 4.8931396081949975e-06, + "loss": 0.4288, + "step": 2357 + }, + { + "epoch": 0.5830860534124629, + "grad_norm": 0.8096170301395595, + "learning_rate": 4.893045575983076e-06, + "loss": 0.4779, + "step": 2358 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.9007402286717402, + "learning_rate": 4.892951503321664e-06, + "loss": 0.437, + "step": 2359 + }, + { + "epoch": 0.5835806132542037, + "grad_norm": 0.8482481245802722, + "learning_rate": 4.89285739021235e-06, + "loss": 0.4514, + "step": 2360 + }, + { + "epoch": 0.5838278931750742, + "grad_norm": 0.8062299158701528, + "learning_rate": 4.8927632366567275e-06, + "loss": 0.4478, + "step": 2361 + }, + { + "epoch": 0.5840751730959446, + "grad_norm": 0.8283318915862347, + "learning_rate": 4.892669042656385e-06, + "loss": 0.4219, + "step": 2362 + }, + { + "epoch": 0.5843224530168151, + "grad_norm": 0.8426695019394065, + "learning_rate": 4.892574808212917e-06, + "loss": 0.4448, + "step": 2363 + }, + { + "epoch": 0.5845697329376854, + "grad_norm": 0.8086019012718995, + "learning_rate": 4.892480533327915e-06, + "loss": 0.4214, + "step": 2364 + }, + { + "epoch": 0.5848170128585559, + "grad_norm": 0.7901079758203985, + "learning_rate": 4.892386218002973e-06, + "loss": 0.4535, + "step": 2365 + }, + { + "epoch": 0.5850642927794263, + "grad_norm": 0.8167317156898983, + "learning_rate": 4.892291862239684e-06, + "loss": 0.4087, + "step": 2366 + }, + { + "epoch": 0.5853115727002968, + "grad_norm": 0.8526609048941977, + "learning_rate": 4.892197466039646e-06, + "loss": 0.4308, + "step": 2367 + }, + { + "epoch": 0.5855588526211671, + "grad_norm": 0.8253932828103235, + "learning_rate": 4.8921030294044515e-06, + "loss": 0.403, + "step": 2368 + }, + { + "epoch": 0.5858061325420376, + "grad_norm": 0.8005452158190326, + "learning_rate": 4.892008552335697e-06, + "loss": 0.4069, + "step": 2369 + }, + { + "epoch": 0.586053412462908, + "grad_norm": 0.7694913501113785, + "learning_rate": 4.891914034834982e-06, + "loss": 0.4201, + "step": 2370 + }, + { + "epoch": 0.5863006923837785, + "grad_norm": 0.8099442709125199, + "learning_rate": 4.891819476903902e-06, + "loss": 0.4575, + "step": 2371 + }, + { + "epoch": 0.5865479723046488, + "grad_norm": 0.8306582160233137, + "learning_rate": 4.891724878544054e-06, + "loss": 0.4259, + "step": 2372 + }, + { + "epoch": 0.5867952522255193, + "grad_norm": 0.7982546667998933, + "learning_rate": 4.891630239757041e-06, + "loss": 0.4317, + "step": 2373 + }, + { + "epoch": 0.5870425321463897, + "grad_norm": 0.8258893090255622, + "learning_rate": 4.891535560544459e-06, + "loss": 0.4472, + "step": 2374 + }, + { + "epoch": 0.5872898120672602, + "grad_norm": 0.7941013746518771, + "learning_rate": 4.89144084090791e-06, + "loss": 0.422, + "step": 2375 + }, + { + "epoch": 0.5875370919881305, + "grad_norm": 0.8618283605659312, + "learning_rate": 4.891346080848995e-06, + "loss": 0.4099, + "step": 2376 + }, + { + "epoch": 0.587784371909001, + "grad_norm": 0.8359174158930102, + "learning_rate": 4.891251280369316e-06, + "loss": 0.4186, + "step": 2377 + }, + { + "epoch": 0.5880316518298714, + "grad_norm": 0.8476514457524892, + "learning_rate": 4.891156439470473e-06, + "loss": 0.4319, + "step": 2378 + }, + { + "epoch": 0.5882789317507419, + "grad_norm": 0.8448744222305574, + "learning_rate": 4.891061558154073e-06, + "loss": 0.4568, + "step": 2379 + }, + { + "epoch": 0.5885262116716122, + "grad_norm": 0.843451311072349, + "learning_rate": 4.890966636421717e-06, + "loss": 0.4319, + "step": 2380 + }, + { + "epoch": 0.5887734915924827, + "grad_norm": 0.8339217168520412, + "learning_rate": 4.890871674275011e-06, + "loss": 0.4351, + "step": 2381 + }, + { + "epoch": 0.5890207715133531, + "grad_norm": 0.833107783986654, + "learning_rate": 4.890776671715558e-06, + "loss": 0.4035, + "step": 2382 + }, + { + "epoch": 0.5892680514342236, + "grad_norm": 0.8137168641583187, + "learning_rate": 4.890681628744966e-06, + "loss": 0.4259, + "step": 2383 + }, + { + "epoch": 0.5895153313550939, + "grad_norm": 0.805892766745161, + "learning_rate": 4.890586545364841e-06, + "loss": 0.4333, + "step": 2384 + }, + { + "epoch": 0.5897626112759644, + "grad_norm": 0.8375188065735549, + "learning_rate": 4.890491421576788e-06, + "loss": 0.4335, + "step": 2385 + }, + { + "epoch": 0.5900098911968348, + "grad_norm": 0.8230336151396773, + "learning_rate": 4.8903962573824185e-06, + "loss": 0.4406, + "step": 2386 + }, + { + "epoch": 0.5902571711177053, + "grad_norm": 0.8155143896059736, + "learning_rate": 4.890301052783339e-06, + "loss": 0.4545, + "step": 2387 + }, + { + "epoch": 0.5905044510385756, + "grad_norm": 0.7717233653578202, + "learning_rate": 4.890205807781159e-06, + "loss": 0.4469, + "step": 2388 + }, + { + "epoch": 0.5907517309594461, + "grad_norm": 0.8009676088638478, + "learning_rate": 4.8901105223774885e-06, + "loss": 0.4335, + "step": 2389 + }, + { + "epoch": 0.5909990108803165, + "grad_norm": 0.8379245612231933, + "learning_rate": 4.890015196573938e-06, + "loss": 0.4205, + "step": 2390 + }, + { + "epoch": 0.591246290801187, + "grad_norm": 0.8086859042796692, + "learning_rate": 4.889919830372118e-06, + "loss": 0.4363, + "step": 2391 + }, + { + "epoch": 0.5914935707220573, + "grad_norm": 0.8130939334847568, + "learning_rate": 4.889824423773642e-06, + "loss": 0.4109, + "step": 2392 + }, + { + "epoch": 0.5917408506429278, + "grad_norm": 0.8476467595351704, + "learning_rate": 4.8897289767801225e-06, + "loss": 0.4031, + "step": 2393 + }, + { + "epoch": 0.5919881305637982, + "grad_norm": 0.8461748435957103, + "learning_rate": 4.889633489393173e-06, + "loss": 0.4091, + "step": 2394 + }, + { + "epoch": 0.5922354104846687, + "grad_norm": 0.8376353137944713, + "learning_rate": 4.889537961614405e-06, + "loss": 0.4146, + "step": 2395 + }, + { + "epoch": 0.592482690405539, + "grad_norm": 0.8239443055595109, + "learning_rate": 4.889442393445435e-06, + "loss": 0.4447, + "step": 2396 + }, + { + "epoch": 0.5927299703264095, + "grad_norm": 0.849161150005281, + "learning_rate": 4.88934678488788e-06, + "loss": 0.4547, + "step": 2397 + }, + { + "epoch": 0.5929772502472799, + "grad_norm": 0.790329735581731, + "learning_rate": 4.889251135943353e-06, + "loss": 0.4401, + "step": 2398 + }, + { + "epoch": 0.5932245301681504, + "grad_norm": 0.8193562709687785, + "learning_rate": 4.889155446613473e-06, + "loss": 0.4093, + "step": 2399 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 0.8170696936647929, + "learning_rate": 4.889059716899857e-06, + "loss": 0.4233, + "step": 2400 + }, + { + "epoch": 0.5937190900098912, + "grad_norm": 0.8168596273728271, + "learning_rate": 4.888963946804122e-06, + "loss": 0.3982, + "step": 2401 + }, + { + "epoch": 0.5939663699307616, + "grad_norm": 0.8114324383935596, + "learning_rate": 4.888868136327888e-06, + "loss": 0.4336, + "step": 2402 + }, + { + "epoch": 0.594213649851632, + "grad_norm": 0.814161224922398, + "learning_rate": 4.888772285472773e-06, + "loss": 0.4445, + "step": 2403 + }, + { + "epoch": 0.5944609297725024, + "grad_norm": 0.8203037927462447, + "learning_rate": 4.888676394240399e-06, + "loss": 0.4143, + "step": 2404 + }, + { + "epoch": 0.5947082096933729, + "grad_norm": 0.7833477571195605, + "learning_rate": 4.888580462632386e-06, + "loss": 0.4369, + "step": 2405 + }, + { + "epoch": 0.5949554896142433, + "grad_norm": 0.8367330917074817, + "learning_rate": 4.888484490650355e-06, + "loss": 0.423, + "step": 2406 + }, + { + "epoch": 0.5952027695351138, + "grad_norm": 0.7996837665018429, + "learning_rate": 4.888388478295929e-06, + "loss": 0.4215, + "step": 2407 + }, + { + "epoch": 0.5954500494559841, + "grad_norm": 0.8099673447111903, + "learning_rate": 4.888292425570731e-06, + "loss": 0.4565, + "step": 2408 + }, + { + "epoch": 0.5956973293768546, + "grad_norm": 0.8315706071687368, + "learning_rate": 4.888196332476385e-06, + "loss": 0.4251, + "step": 2409 + }, + { + "epoch": 0.595944609297725, + "grad_norm": 0.8641851324877825, + "learning_rate": 4.8881001990145125e-06, + "loss": 0.431, + "step": 2410 + }, + { + "epoch": 0.5961918892185954, + "grad_norm": 0.8151888626416508, + "learning_rate": 4.888004025186742e-06, + "loss": 0.4106, + "step": 2411 + }, + { + "epoch": 0.5964391691394659, + "grad_norm": 0.8143057678231387, + "learning_rate": 4.887907810994697e-06, + "loss": 0.434, + "step": 2412 + }, + { + "epoch": 0.5966864490603363, + "grad_norm": 0.8169851007290792, + "learning_rate": 4.887811556440004e-06, + "loss": 0.4379, + "step": 2413 + }, + { + "epoch": 0.5969337289812068, + "grad_norm": 0.8156004947234917, + "learning_rate": 4.887715261524291e-06, + "loss": 0.4556, + "step": 2414 + }, + { + "epoch": 0.5971810089020771, + "grad_norm": 0.8226055472128943, + "learning_rate": 4.887618926249185e-06, + "loss": 0.4323, + "step": 2415 + }, + { + "epoch": 0.5974282888229476, + "grad_norm": 0.8130042612264833, + "learning_rate": 4.887522550616314e-06, + "loss": 0.4472, + "step": 2416 + }, + { + "epoch": 0.597675568743818, + "grad_norm": 0.7757441744985806, + "learning_rate": 4.887426134627308e-06, + "loss": 0.4422, + "step": 2417 + }, + { + "epoch": 0.5979228486646885, + "grad_norm": 0.8293693320678572, + "learning_rate": 4.887329678283795e-06, + "loss": 0.4411, + "step": 2418 + }, + { + "epoch": 0.5981701285855588, + "grad_norm": 0.7909863669793495, + "learning_rate": 4.887233181587407e-06, + "loss": 0.4363, + "step": 2419 + }, + { + "epoch": 0.5984174085064293, + "grad_norm": 0.8034735828656278, + "learning_rate": 4.887136644539775e-06, + "loss": 0.4383, + "step": 2420 + }, + { + "epoch": 0.5986646884272997, + "grad_norm": 0.8340913456941164, + "learning_rate": 4.887040067142529e-06, + "loss": 0.4121, + "step": 2421 + }, + { + "epoch": 0.5989119683481702, + "grad_norm": 0.7963542690809786, + "learning_rate": 4.886943449397304e-06, + "loss": 0.4293, + "step": 2422 + }, + { + "epoch": 0.5991592482690405, + "grad_norm": 0.8308598116178003, + "learning_rate": 4.886846791305732e-06, + "loss": 0.4524, + "step": 2423 + }, + { + "epoch": 0.599406528189911, + "grad_norm": 0.8014680489986464, + "learning_rate": 4.886750092869446e-06, + "loss": 0.4267, + "step": 2424 + }, + { + "epoch": 0.5996538081107814, + "grad_norm": 0.7930535832840587, + "learning_rate": 4.88665335409008e-06, + "loss": 0.4369, + "step": 2425 + }, + { + "epoch": 0.5999010880316519, + "grad_norm": 0.8039444096609584, + "learning_rate": 4.886556574969273e-06, + "loss": 0.4326, + "step": 2426 + }, + { + "epoch": 0.6001483679525222, + "grad_norm": 0.8152125898259113, + "learning_rate": 4.886459755508657e-06, + "loss": 0.395, + "step": 2427 + }, + { + "epoch": 0.6003956478733927, + "grad_norm": 0.7958138668006676, + "learning_rate": 4.88636289570987e-06, + "loss": 0.4426, + "step": 2428 + }, + { + "epoch": 0.6006429277942631, + "grad_norm": 0.8406325006773193, + "learning_rate": 4.886265995574548e-06, + "loss": 0.4242, + "step": 2429 + }, + { + "epoch": 0.6008902077151336, + "grad_norm": 0.7785418865541055, + "learning_rate": 4.886169055104331e-06, + "loss": 0.4458, + "step": 2430 + }, + { + "epoch": 0.6011374876360039, + "grad_norm": 0.819082154131098, + "learning_rate": 4.886072074300855e-06, + "loss": 0.4547, + "step": 2431 + }, + { + "epoch": 0.6013847675568744, + "grad_norm": 0.8524435533885798, + "learning_rate": 4.885975053165762e-06, + "loss": 0.4217, + "step": 2432 + }, + { + "epoch": 0.6016320474777448, + "grad_norm": 0.7912817382969713, + "learning_rate": 4.88587799170069e-06, + "loss": 0.4079, + "step": 2433 + }, + { + "epoch": 0.6018793273986153, + "grad_norm": 0.7786233064046465, + "learning_rate": 4.88578088990728e-06, + "loss": 0.4147, + "step": 2434 + }, + { + "epoch": 0.6021266073194856, + "grad_norm": 0.7967905199446587, + "learning_rate": 4.885683747787174e-06, + "loss": 0.4346, + "step": 2435 + }, + { + "epoch": 0.6023738872403561, + "grad_norm": 0.8044827879750851, + "learning_rate": 4.885586565342014e-06, + "loss": 0.4604, + "step": 2436 + }, + { + "epoch": 0.6026211671612265, + "grad_norm": 0.8160589521379803, + "learning_rate": 4.885489342573441e-06, + "loss": 0.4083, + "step": 2437 + }, + { + "epoch": 0.602868447082097, + "grad_norm": 0.8213002853671916, + "learning_rate": 4.885392079483101e-06, + "loss": 0.4524, + "step": 2438 + }, + { + "epoch": 0.6031157270029673, + "grad_norm": 0.8522852301398287, + "learning_rate": 4.885294776072636e-06, + "loss": 0.4345, + "step": 2439 + }, + { + "epoch": 0.6033630069238378, + "grad_norm": 0.859578041404694, + "learning_rate": 4.88519743234369e-06, + "loss": 0.437, + "step": 2440 + }, + { + "epoch": 0.6036102868447082, + "grad_norm": 0.9126217387901533, + "learning_rate": 4.885100048297911e-06, + "loss": 0.4386, + "step": 2441 + }, + { + "epoch": 0.6038575667655787, + "grad_norm": 0.8240466430659698, + "learning_rate": 4.8850026239369435e-06, + "loss": 0.4156, + "step": 2442 + }, + { + "epoch": 0.604104846686449, + "grad_norm": 0.8498653504755754, + "learning_rate": 4.884905159262435e-06, + "loss": 0.4311, + "step": 2443 + }, + { + "epoch": 0.6043521266073195, + "grad_norm": 0.8118568331305821, + "learning_rate": 4.884807654276031e-06, + "loss": 0.4453, + "step": 2444 + }, + { + "epoch": 0.6045994065281899, + "grad_norm": 0.8184288967709394, + "learning_rate": 4.884710108979383e-06, + "loss": 0.4434, + "step": 2445 + }, + { + "epoch": 0.6048466864490604, + "grad_norm": 0.8282720476073526, + "learning_rate": 4.884612523374137e-06, + "loss": 0.466, + "step": 2446 + }, + { + "epoch": 0.6050939663699307, + "grad_norm": 0.8750263587818898, + "learning_rate": 4.8845148974619435e-06, + "loss": 0.4173, + "step": 2447 + }, + { + "epoch": 0.6053412462908012, + "grad_norm": 0.8459312708646205, + "learning_rate": 4.884417231244452e-06, + "loss": 0.4133, + "step": 2448 + }, + { + "epoch": 0.6055885262116716, + "grad_norm": 0.8712529102031519, + "learning_rate": 4.8843195247233145e-06, + "loss": 0.43, + "step": 2449 + }, + { + "epoch": 0.6058358061325421, + "grad_norm": 0.8252571385372829, + "learning_rate": 4.884221777900182e-06, + "loss": 0.4319, + "step": 2450 + }, + { + "epoch": 0.6060830860534124, + "grad_norm": 0.7656825993441855, + "learning_rate": 4.884123990776706e-06, + "loss": 0.4252, + "step": 2451 + }, + { + "epoch": 0.6063303659742829, + "grad_norm": 0.8633963817614291, + "learning_rate": 4.88402616335454e-06, + "loss": 0.4558, + "step": 2452 + }, + { + "epoch": 0.6065776458951533, + "grad_norm": 0.8335583680412979, + "learning_rate": 4.883928295635338e-06, + "loss": 0.4372, + "step": 2453 + }, + { + "epoch": 0.6068249258160238, + "grad_norm": 0.8826702767603091, + "learning_rate": 4.883830387620754e-06, + "loss": 0.426, + "step": 2454 + }, + { + "epoch": 0.6070722057368941, + "grad_norm": 0.8559725178211727, + "learning_rate": 4.8837324393124425e-06, + "loss": 0.4267, + "step": 2455 + }, + { + "epoch": 0.6073194856577646, + "grad_norm": 0.7989825274835176, + "learning_rate": 4.8836344507120595e-06, + "loss": 0.4341, + "step": 2456 + }, + { + "epoch": 0.607566765578635, + "grad_norm": 0.8038816796422794, + "learning_rate": 4.883536421821261e-06, + "loss": 0.4224, + "step": 2457 + }, + { + "epoch": 0.6078140454995055, + "grad_norm": 0.8681157087578908, + "learning_rate": 4.883438352641704e-06, + "loss": 0.4334, + "step": 2458 + }, + { + "epoch": 0.6080613254203758, + "grad_norm": 0.799734995402737, + "learning_rate": 4.883340243175047e-06, + "loss": 0.4226, + "step": 2459 + }, + { + "epoch": 0.6083086053412463, + "grad_norm": 0.8547585200505449, + "learning_rate": 4.883242093422947e-06, + "loss": 0.461, + "step": 2460 + }, + { + "epoch": 0.6085558852621167, + "grad_norm": 0.8419176540033854, + "learning_rate": 4.883143903387063e-06, + "loss": 0.4581, + "step": 2461 + }, + { + "epoch": 0.6088031651829872, + "grad_norm": 0.8190904513189141, + "learning_rate": 4.8830456730690565e-06, + "loss": 0.4149, + "step": 2462 + }, + { + "epoch": 0.6090504451038575, + "grad_norm": 0.8075726467540836, + "learning_rate": 4.882947402470586e-06, + "loss": 0.4328, + "step": 2463 + }, + { + "epoch": 0.609297725024728, + "grad_norm": 0.8180651312891082, + "learning_rate": 4.882849091593314e-06, + "loss": 0.4176, + "step": 2464 + }, + { + "epoch": 0.6095450049455984, + "grad_norm": 0.8270321732282798, + "learning_rate": 4.882750740438902e-06, + "loss": 0.4385, + "step": 2465 + }, + { + "epoch": 0.6097922848664689, + "grad_norm": 0.813487741003457, + "learning_rate": 4.8826523490090104e-06, + "loss": 0.4152, + "step": 2466 + }, + { + "epoch": 0.6100395647873392, + "grad_norm": 0.8395319099633974, + "learning_rate": 4.882553917305305e-06, + "loss": 0.3927, + "step": 2467 + }, + { + "epoch": 0.6102868447082097, + "grad_norm": 0.8595231311190774, + "learning_rate": 4.882455445329448e-06, + "loss": 0.4379, + "step": 2468 + }, + { + "epoch": 0.6105341246290801, + "grad_norm": 0.8683672743262101, + "learning_rate": 4.8823569330831045e-06, + "loss": 0.4235, + "step": 2469 + }, + { + "epoch": 0.6107814045499506, + "grad_norm": 0.8659235169816588, + "learning_rate": 4.882258380567939e-06, + "loss": 0.3991, + "step": 2470 + }, + { + "epoch": 0.6110286844708209, + "grad_norm": 0.8208473928771434, + "learning_rate": 4.882159787785618e-06, + "loss": 0.4522, + "step": 2471 + }, + { + "epoch": 0.6112759643916914, + "grad_norm": 0.8733025812767434, + "learning_rate": 4.882061154737809e-06, + "loss": 0.4098, + "step": 2472 + }, + { + "epoch": 0.6115232443125618, + "grad_norm": 0.8931585574147225, + "learning_rate": 4.881962481426176e-06, + "loss": 0.4389, + "step": 2473 + }, + { + "epoch": 0.6117705242334323, + "grad_norm": 0.834574300801114, + "learning_rate": 4.88186376785239e-06, + "loss": 0.4195, + "step": 2474 + }, + { + "epoch": 0.6120178041543026, + "grad_norm": 0.8370196298648357, + "learning_rate": 4.881765014018118e-06, + "loss": 0.4347, + "step": 2475 + }, + { + "epoch": 0.6122650840751731, + "grad_norm": 0.8634205008935408, + "learning_rate": 4.88166621992503e-06, + "loss": 0.4259, + "step": 2476 + }, + { + "epoch": 0.6125123639960435, + "grad_norm": 0.8360755499657381, + "learning_rate": 4.881567385574795e-06, + "loss": 0.4374, + "step": 2477 + }, + { + "epoch": 0.612759643916914, + "grad_norm": 0.8426066875433872, + "learning_rate": 4.8814685109690846e-06, + "loss": 0.4193, + "step": 2478 + }, + { + "epoch": 0.6130069238377843, + "grad_norm": 0.87903730703909, + "learning_rate": 4.8813695961095694e-06, + "loss": 0.4389, + "step": 2479 + }, + { + "epoch": 0.6132542037586548, + "grad_norm": 0.8943708464365521, + "learning_rate": 4.881270640997921e-06, + "loss": 0.4317, + "step": 2480 + }, + { + "epoch": 0.6135014836795252, + "grad_norm": 0.8261054283912324, + "learning_rate": 4.881171645635814e-06, + "loss": 0.4407, + "step": 2481 + }, + { + "epoch": 0.6137487636003957, + "grad_norm": 0.8064405085915288, + "learning_rate": 4.88107261002492e-06, + "loss": 0.4425, + "step": 2482 + }, + { + "epoch": 0.613996043521266, + "grad_norm": 0.784802011195429, + "learning_rate": 4.880973534166912e-06, + "loss": 0.4488, + "step": 2483 + }, + { + "epoch": 0.6142433234421365, + "grad_norm": 0.8176221625872651, + "learning_rate": 4.880874418063467e-06, + "loss": 0.4129, + "step": 2484 + }, + { + "epoch": 0.6144906033630069, + "grad_norm": 0.8343618719243155, + "learning_rate": 4.880775261716259e-06, + "loss": 0.3911, + "step": 2485 + }, + { + "epoch": 0.6147378832838774, + "grad_norm": 0.8644375812951496, + "learning_rate": 4.880676065126965e-06, + "loss": 0.4122, + "step": 2486 + }, + { + "epoch": 0.6149851632047477, + "grad_norm": 0.7800771831644979, + "learning_rate": 4.88057682829726e-06, + "loss": 0.4497, + "step": 2487 + }, + { + "epoch": 0.6152324431256182, + "grad_norm": 0.850924422669018, + "learning_rate": 4.880477551228823e-06, + "loss": 0.4184, + "step": 2488 + }, + { + "epoch": 0.6154797230464887, + "grad_norm": 0.8561928941850189, + "learning_rate": 4.880378233923332e-06, + "loss": 0.4311, + "step": 2489 + }, + { + "epoch": 0.615727002967359, + "grad_norm": 0.8350502975323144, + "learning_rate": 4.880278876382465e-06, + "loss": 0.4259, + "step": 2490 + }, + { + "epoch": 0.6159742828882295, + "grad_norm": 0.7847878516103233, + "learning_rate": 4.8801794786079e-06, + "loss": 0.4432, + "step": 2491 + }, + { + "epoch": 0.6162215628090999, + "grad_norm": 0.7967723156172767, + "learning_rate": 4.880080040601322e-06, + "loss": 0.4076, + "step": 2492 + }, + { + "epoch": 0.6164688427299704, + "grad_norm": 0.7983503377091417, + "learning_rate": 4.879980562364406e-06, + "loss": 0.4051, + "step": 2493 + }, + { + "epoch": 0.6167161226508407, + "grad_norm": 0.8022288887944488, + "learning_rate": 4.879881043898838e-06, + "loss": 0.4443, + "step": 2494 + }, + { + "epoch": 0.6169634025717112, + "grad_norm": 0.8080184803379807, + "learning_rate": 4.8797814852062965e-06, + "loss": 0.4544, + "step": 2495 + }, + { + "epoch": 0.6172106824925816, + "grad_norm": 0.8432338832511567, + "learning_rate": 4.879681886288467e-06, + "loss": 0.4269, + "step": 2496 + }, + { + "epoch": 0.6174579624134521, + "grad_norm": 0.8274845517353767, + "learning_rate": 4.8795822471470326e-06, + "loss": 0.4548, + "step": 2497 + }, + { + "epoch": 0.6177052423343224, + "grad_norm": 0.8422453088573836, + "learning_rate": 4.879482567783675e-06, + "loss": 0.4357, + "step": 2498 + }, + { + "epoch": 0.6179525222551929, + "grad_norm": 0.8313181938884414, + "learning_rate": 4.8793828482000834e-06, + "loss": 0.4106, + "step": 2499 + }, + { + "epoch": 0.6181998021760633, + "grad_norm": 0.7956229961490899, + "learning_rate": 4.87928308839794e-06, + "loss": 0.44, + "step": 2500 + }, + { + "epoch": 0.6184470820969338, + "grad_norm": 0.8021572150909114, + "learning_rate": 4.879183288378932e-06, + "loss": 0.4505, + "step": 2501 + }, + { + "epoch": 0.6186943620178041, + "grad_norm": 0.870409141916946, + "learning_rate": 4.879083448144747e-06, + "loss": 0.4065, + "step": 2502 + }, + { + "epoch": 0.6189416419386746, + "grad_norm": 0.8154769479555604, + "learning_rate": 4.878983567697071e-06, + "loss": 0.4026, + "step": 2503 + }, + { + "epoch": 0.619188921859545, + "grad_norm": 0.7614555739597262, + "learning_rate": 4.8788836470375935e-06, + "loss": 0.433, + "step": 2504 + }, + { + "epoch": 0.6194362017804155, + "grad_norm": 0.8216234222194428, + "learning_rate": 4.878783686168004e-06, + "loss": 0.4329, + "step": 2505 + }, + { + "epoch": 0.6196834817012858, + "grad_norm": 0.847598350182186, + "learning_rate": 4.878683685089991e-06, + "loss": 0.4035, + "step": 2506 + }, + { + "epoch": 0.6199307616221563, + "grad_norm": 0.8281932559908807, + "learning_rate": 4.878583643805244e-06, + "loss": 0.4334, + "step": 2507 + }, + { + "epoch": 0.6201780415430267, + "grad_norm": 0.8013841803051386, + "learning_rate": 4.878483562315456e-06, + "loss": 0.4092, + "step": 2508 + }, + { + "epoch": 0.6204253214638972, + "grad_norm": 0.7827821396675764, + "learning_rate": 4.878383440622318e-06, + "loss": 0.4216, + "step": 2509 + }, + { + "epoch": 0.6206726013847675, + "grad_norm": 0.811958550694751, + "learning_rate": 4.878283278727522e-06, + "loss": 0.4272, + "step": 2510 + }, + { + "epoch": 0.620919881305638, + "grad_norm": 0.8295542833041976, + "learning_rate": 4.878183076632761e-06, + "loss": 0.4248, + "step": 2511 + }, + { + "epoch": 0.6211671612265084, + "grad_norm": 0.7848392977501718, + "learning_rate": 4.878082834339729e-06, + "loss": 0.4474, + "step": 2512 + }, + { + "epoch": 0.6214144411473789, + "grad_norm": 0.7896984047945792, + "learning_rate": 4.87798255185012e-06, + "loss": 0.4375, + "step": 2513 + }, + { + "epoch": 0.6216617210682492, + "grad_norm": 0.786506360369637, + "learning_rate": 4.87788222916563e-06, + "loss": 0.4779, + "step": 2514 + }, + { + "epoch": 0.6219090009891197, + "grad_norm": 0.7869355956930445, + "learning_rate": 4.877781866287953e-06, + "loss": 0.4449, + "step": 2515 + }, + { + "epoch": 0.6221562809099901, + "grad_norm": 0.7693278810302654, + "learning_rate": 4.877681463218787e-06, + "loss": 0.4643, + "step": 2516 + }, + { + "epoch": 0.6224035608308606, + "grad_norm": 0.8335221592746881, + "learning_rate": 4.877581019959829e-06, + "loss": 0.4018, + "step": 2517 + }, + { + "epoch": 0.6226508407517309, + "grad_norm": 0.8045163991097942, + "learning_rate": 4.877480536512777e-06, + "loss": 0.4178, + "step": 2518 + }, + { + "epoch": 0.6228981206726014, + "grad_norm": 0.7830727482060992, + "learning_rate": 4.877380012879328e-06, + "loss": 0.448, + "step": 2519 + }, + { + "epoch": 0.6231454005934718, + "grad_norm": 0.7963686363614251, + "learning_rate": 4.877279449061182e-06, + "loss": 0.4804, + "step": 2520 + }, + { + "epoch": 0.6233926805143423, + "grad_norm": 0.7780549531823838, + "learning_rate": 4.8771788450600384e-06, + "loss": 0.4472, + "step": 2521 + }, + { + "epoch": 0.6236399604352126, + "grad_norm": 0.7994742514234306, + "learning_rate": 4.877078200877599e-06, + "loss": 0.4585, + "step": 2522 + }, + { + "epoch": 0.6238872403560831, + "grad_norm": 0.7852106883065006, + "learning_rate": 4.876977516515564e-06, + "loss": 0.4613, + "step": 2523 + }, + { + "epoch": 0.6241345202769535, + "grad_norm": 0.7876985072103233, + "learning_rate": 4.876876791975635e-06, + "loss": 0.4222, + "step": 2524 + }, + { + "epoch": 0.624381800197824, + "grad_norm": 0.7671611530115617, + "learning_rate": 4.876776027259516e-06, + "loss": 0.451, + "step": 2525 + }, + { + "epoch": 0.6246290801186943, + "grad_norm": 0.7946021739256844, + "learning_rate": 4.876675222368907e-06, + "loss": 0.446, + "step": 2526 + }, + { + "epoch": 0.6248763600395648, + "grad_norm": 0.8459845780214671, + "learning_rate": 4.876574377305516e-06, + "loss": 0.4532, + "step": 2527 + }, + { + "epoch": 0.6251236399604352, + "grad_norm": 0.8008984159682268, + "learning_rate": 4.876473492071045e-06, + "loss": 0.4374, + "step": 2528 + }, + { + "epoch": 0.6253709198813057, + "grad_norm": 0.7682503991359931, + "learning_rate": 4.876372566667199e-06, + "loss": 0.4116, + "step": 2529 + }, + { + "epoch": 0.625618199802176, + "grad_norm": 0.8258691494063493, + "learning_rate": 4.876271601095686e-06, + "loss": 0.42, + "step": 2530 + }, + { + "epoch": 0.6258654797230465, + "grad_norm": 0.7927208100659665, + "learning_rate": 4.876170595358211e-06, + "loss": 0.434, + "step": 2531 + }, + { + "epoch": 0.6261127596439169, + "grad_norm": 0.8137626732834563, + "learning_rate": 4.8760695494564815e-06, + "loss": 0.4102, + "step": 2532 + }, + { + "epoch": 0.6263600395647874, + "grad_norm": 0.779955387753274, + "learning_rate": 4.875968463392206e-06, + "loss": 0.4557, + "step": 2533 + }, + { + "epoch": 0.6266073194856577, + "grad_norm": 0.8225478650613147, + "learning_rate": 4.875867337167093e-06, + "loss": 0.4651, + "step": 2534 + }, + { + "epoch": 0.6268545994065282, + "grad_norm": 0.7874547156708713, + "learning_rate": 4.875766170782852e-06, + "loss": 0.4218, + "step": 2535 + }, + { + "epoch": 0.6271018793273986, + "grad_norm": 0.8484394458254143, + "learning_rate": 4.875664964241191e-06, + "loss": 0.4554, + "step": 2536 + }, + { + "epoch": 0.6273491592482691, + "grad_norm": 0.8084521995284009, + "learning_rate": 4.875563717543824e-06, + "loss": 0.4319, + "step": 2537 + }, + { + "epoch": 0.6275964391691394, + "grad_norm": 0.7910596133893639, + "learning_rate": 4.87546243069246e-06, + "loss": 0.4598, + "step": 2538 + }, + { + "epoch": 0.6278437190900099, + "grad_norm": 0.796909735607753, + "learning_rate": 4.875361103688812e-06, + "loss": 0.4447, + "step": 2539 + }, + { + "epoch": 0.6280909990108803, + "grad_norm": 0.7894299591362924, + "learning_rate": 4.875259736534593e-06, + "loss": 0.4143, + "step": 2540 + }, + { + "epoch": 0.6283382789317508, + "grad_norm": 0.8215464045091878, + "learning_rate": 4.8751583292315156e-06, + "loss": 0.4337, + "step": 2541 + }, + { + "epoch": 0.6285855588526211, + "grad_norm": 0.8097485457219312, + "learning_rate": 4.875056881781294e-06, + "loss": 0.423, + "step": 2542 + }, + { + "epoch": 0.6288328387734916, + "grad_norm": 0.832716101729499, + "learning_rate": 4.874955394185643e-06, + "loss": 0.4168, + "step": 2543 + }, + { + "epoch": 0.629080118694362, + "grad_norm": 0.778723112231423, + "learning_rate": 4.874853866446279e-06, + "loss": 0.4576, + "step": 2544 + }, + { + "epoch": 0.6293273986152325, + "grad_norm": 0.8635182250262682, + "learning_rate": 4.874752298564916e-06, + "loss": 0.4457, + "step": 2545 + }, + { + "epoch": 0.6295746785361028, + "grad_norm": 0.7917765753480467, + "learning_rate": 4.874650690543273e-06, + "loss": 0.4059, + "step": 2546 + }, + { + "epoch": 0.6298219584569733, + "grad_norm": 0.8248753697058309, + "learning_rate": 4.874549042383066e-06, + "loss": 0.3954, + "step": 2547 + }, + { + "epoch": 0.6300692383778437, + "grad_norm": 0.8478879231826125, + "learning_rate": 4.8744473540860136e-06, + "loss": 0.4136, + "step": 2548 + }, + { + "epoch": 0.6303165182987142, + "grad_norm": 0.8212763456739502, + "learning_rate": 4.874345625653836e-06, + "loss": 0.4684, + "step": 2549 + }, + { + "epoch": 0.6305637982195845, + "grad_norm": 0.757161231064773, + "learning_rate": 4.874243857088251e-06, + "loss": 0.4436, + "step": 2550 + }, + { + "epoch": 0.630811078140455, + "grad_norm": 0.872176501099029, + "learning_rate": 4.874142048390978e-06, + "loss": 0.4187, + "step": 2551 + }, + { + "epoch": 0.6310583580613254, + "grad_norm": 0.8151427391928595, + "learning_rate": 4.87404019956374e-06, + "loss": 0.4146, + "step": 2552 + }, + { + "epoch": 0.6313056379821959, + "grad_norm": 0.8257722842621575, + "learning_rate": 4.873938310608258e-06, + "loss": 0.4088, + "step": 2553 + }, + { + "epoch": 0.6315529179030662, + "grad_norm": 0.824778895269139, + "learning_rate": 4.8738363815262535e-06, + "loss": 0.4103, + "step": 2554 + }, + { + "epoch": 0.6318001978239367, + "grad_norm": 0.8206243653518269, + "learning_rate": 4.8737344123194495e-06, + "loss": 0.443, + "step": 2555 + }, + { + "epoch": 0.6320474777448071, + "grad_norm": 0.833260027971966, + "learning_rate": 4.873632402989571e-06, + "loss": 0.4403, + "step": 2556 + }, + { + "epoch": 0.6322947576656776, + "grad_norm": 0.8392388817195789, + "learning_rate": 4.87353035353834e-06, + "loss": 0.4167, + "step": 2557 + }, + { + "epoch": 0.6325420375865479, + "grad_norm": 0.8045955688235943, + "learning_rate": 4.873428263967483e-06, + "loss": 0.4345, + "step": 2558 + }, + { + "epoch": 0.6327893175074184, + "grad_norm": 0.8073482847741306, + "learning_rate": 4.873326134278725e-06, + "loss": 0.4465, + "step": 2559 + }, + { + "epoch": 0.6330365974282888, + "grad_norm": 0.8417721947718284, + "learning_rate": 4.873223964473792e-06, + "loss": 0.437, + "step": 2560 + }, + { + "epoch": 0.6332838773491593, + "grad_norm": 0.835567113423516, + "learning_rate": 4.873121754554413e-06, + "loss": 0.4601, + "step": 2561 + }, + { + "epoch": 0.6335311572700296, + "grad_norm": 0.7979539900749805, + "learning_rate": 4.873019504522313e-06, + "loss": 0.4195, + "step": 2562 + }, + { + "epoch": 0.6337784371909001, + "grad_norm": 0.7912474454955465, + "learning_rate": 4.872917214379221e-06, + "loss": 0.4487, + "step": 2563 + }, + { + "epoch": 0.6340257171117705, + "grad_norm": 0.8307214918215886, + "learning_rate": 4.872814884126867e-06, + "loss": 0.4173, + "step": 2564 + }, + { + "epoch": 0.634272997032641, + "grad_norm": 0.8101781344145439, + "learning_rate": 4.87271251376698e-06, + "loss": 0.4501, + "step": 2565 + }, + { + "epoch": 0.6345202769535113, + "grad_norm": 0.8204980030353368, + "learning_rate": 4.872610103301289e-06, + "loss": 0.4757, + "step": 2566 + }, + { + "epoch": 0.6347675568743818, + "grad_norm": 0.8005074209067652, + "learning_rate": 4.872507652731529e-06, + "loss": 0.4078, + "step": 2567 + }, + { + "epoch": 0.6350148367952523, + "grad_norm": 0.7997515924249456, + "learning_rate": 4.872405162059428e-06, + "loss": 0.45, + "step": 2568 + }, + { + "epoch": 0.6352621167161226, + "grad_norm": 0.8137264810694476, + "learning_rate": 4.87230263128672e-06, + "loss": 0.4241, + "step": 2569 + }, + { + "epoch": 0.6355093966369931, + "grad_norm": 0.824910213201859, + "learning_rate": 4.872200060415136e-06, + "loss": 0.4292, + "step": 2570 + }, + { + "epoch": 0.6357566765578635, + "grad_norm": 0.8033275021348851, + "learning_rate": 4.872097449446413e-06, + "loss": 0.4323, + "step": 2571 + }, + { + "epoch": 0.636003956478734, + "grad_norm": 0.7746126846914729, + "learning_rate": 4.871994798382284e-06, + "loss": 0.4206, + "step": 2572 + }, + { + "epoch": 0.6362512363996043, + "grad_norm": 0.8549318207086319, + "learning_rate": 4.871892107224483e-06, + "loss": 0.4053, + "step": 2573 + }, + { + "epoch": 0.6364985163204748, + "grad_norm": 0.7938662480945529, + "learning_rate": 4.8717893759747475e-06, + "loss": 0.4344, + "step": 2574 + }, + { + "epoch": 0.6367457962413452, + "grad_norm": 0.7915283947882275, + "learning_rate": 4.8716866046348135e-06, + "loss": 0.4422, + "step": 2575 + }, + { + "epoch": 0.6369930761622157, + "grad_norm": 0.7688073874395939, + "learning_rate": 4.871583793206417e-06, + "loss": 0.4506, + "step": 2576 + }, + { + "epoch": 0.637240356083086, + "grad_norm": 0.7912905811315122, + "learning_rate": 4.871480941691297e-06, + "loss": 0.4166, + "step": 2577 + }, + { + "epoch": 0.6374876360039565, + "grad_norm": 0.8120497456748971, + "learning_rate": 4.871378050091191e-06, + "loss": 0.4181, + "step": 2578 + }, + { + "epoch": 0.6377349159248269, + "grad_norm": 0.8028994592647819, + "learning_rate": 4.871275118407839e-06, + "loss": 0.4311, + "step": 2579 + }, + { + "epoch": 0.6379821958456974, + "grad_norm": 0.7834446209323545, + "learning_rate": 4.871172146642981e-06, + "loss": 0.4279, + "step": 2580 + }, + { + "epoch": 0.6382294757665677, + "grad_norm": 0.8348542999908406, + "learning_rate": 4.871069134798357e-06, + "loss": 0.4329, + "step": 2581 + }, + { + "epoch": 0.6384767556874382, + "grad_norm": 0.8272271677793834, + "learning_rate": 4.8709660828757084e-06, + "loss": 0.4182, + "step": 2582 + }, + { + "epoch": 0.6387240356083086, + "grad_norm": 0.8689124353130055, + "learning_rate": 4.8708629908767765e-06, + "loss": 0.4006, + "step": 2583 + }, + { + "epoch": 0.6389713155291791, + "grad_norm": 0.7859560070726559, + "learning_rate": 4.870759858803306e-06, + "loss": 0.4099, + "step": 2584 + }, + { + "epoch": 0.6392185954500494, + "grad_norm": 0.8065301022490496, + "learning_rate": 4.870656686657037e-06, + "loss": 0.4429, + "step": 2585 + }, + { + "epoch": 0.6394658753709199, + "grad_norm": 0.8046008098865215, + "learning_rate": 4.870553474439715e-06, + "loss": 0.4355, + "step": 2586 + }, + { + "epoch": 0.6397131552917903, + "grad_norm": 0.8173022893337369, + "learning_rate": 4.870450222153086e-06, + "loss": 0.4541, + "step": 2587 + }, + { + "epoch": 0.6399604352126608, + "grad_norm": 0.8241623751017143, + "learning_rate": 4.870346929798893e-06, + "loss": 0.4322, + "step": 2588 + }, + { + "epoch": 0.6402077151335311, + "grad_norm": 0.7625457826822414, + "learning_rate": 4.870243597378882e-06, + "loss": 0.4425, + "step": 2589 + }, + { + "epoch": 0.6404549950544016, + "grad_norm": 0.8650909181528662, + "learning_rate": 4.870140224894801e-06, + "loss": 0.4208, + "step": 2590 + }, + { + "epoch": 0.640702274975272, + "grad_norm": 0.807605893764015, + "learning_rate": 4.870036812348397e-06, + "loss": 0.4301, + "step": 2591 + }, + { + "epoch": 0.6409495548961425, + "grad_norm": 0.8261404636614745, + "learning_rate": 4.8699333597414166e-06, + "loss": 0.4278, + "step": 2592 + }, + { + "epoch": 0.6411968348170128, + "grad_norm": 0.8284238869122891, + "learning_rate": 4.869829867075611e-06, + "loss": 0.3984, + "step": 2593 + }, + { + "epoch": 0.6414441147378833, + "grad_norm": 0.7975722051769433, + "learning_rate": 4.869726334352727e-06, + "loss": 0.4709, + "step": 2594 + }, + { + "epoch": 0.6416913946587537, + "grad_norm": 0.8507481794439908, + "learning_rate": 4.869622761574516e-06, + "loss": 0.4238, + "step": 2595 + }, + { + "epoch": 0.6419386745796242, + "grad_norm": 0.8131973370387947, + "learning_rate": 4.869519148742728e-06, + "loss": 0.4163, + "step": 2596 + }, + { + "epoch": 0.6421859545004945, + "grad_norm": 0.7971351475313032, + "learning_rate": 4.8694154958591145e-06, + "loss": 0.4265, + "step": 2597 + }, + { + "epoch": 0.642433234421365, + "grad_norm": 0.8116345108123881, + "learning_rate": 4.869311802925428e-06, + "loss": 0.4123, + "step": 2598 + }, + { + "epoch": 0.6426805143422354, + "grad_norm": 0.8095032265641844, + "learning_rate": 4.8692080699434205e-06, + "loss": 0.4268, + "step": 2599 + }, + { + "epoch": 0.6429277942631059, + "grad_norm": 0.8217190775852606, + "learning_rate": 4.869104296914847e-06, + "loss": 0.4485, + "step": 2600 + }, + { + "epoch": 0.6431750741839762, + "grad_norm": 0.8025278787335687, + "learning_rate": 4.869000483841459e-06, + "loss": 0.4606, + "step": 2601 + }, + { + "epoch": 0.6434223541048467, + "grad_norm": 0.8116294256653767, + "learning_rate": 4.868896630725014e-06, + "loss": 0.4358, + "step": 2602 + }, + { + "epoch": 0.6436696340257171, + "grad_norm": 0.7967259432878298, + "learning_rate": 4.868792737567266e-06, + "loss": 0.4422, + "step": 2603 + }, + { + "epoch": 0.6439169139465876, + "grad_norm": 0.8050752940748637, + "learning_rate": 4.86868880436997e-06, + "loss": 0.4417, + "step": 2604 + }, + { + "epoch": 0.6441641938674579, + "grad_norm": 0.7917551367908968, + "learning_rate": 4.868584831134885e-06, + "loss": 0.4185, + "step": 2605 + }, + { + "epoch": 0.6444114737883284, + "grad_norm": 0.8045620985739201, + "learning_rate": 4.868480817863766e-06, + "loss": 0.4031, + "step": 2606 + }, + { + "epoch": 0.6446587537091988, + "grad_norm": 0.7708976802059638, + "learning_rate": 4.868376764558374e-06, + "loss": 0.4827, + "step": 2607 + }, + { + "epoch": 0.6449060336300693, + "grad_norm": 0.8344243360154323, + "learning_rate": 4.868272671220465e-06, + "loss": 0.4184, + "step": 2608 + }, + { + "epoch": 0.6451533135509396, + "grad_norm": 0.7928287190522306, + "learning_rate": 4.868168537851801e-06, + "loss": 0.4405, + "step": 2609 + }, + { + "epoch": 0.6454005934718101, + "grad_norm": 0.8301762127855165, + "learning_rate": 4.868064364454141e-06, + "loss": 0.4264, + "step": 2610 + }, + { + "epoch": 0.6456478733926805, + "grad_norm": 0.8234622479401585, + "learning_rate": 4.867960151029245e-06, + "loss": 0.4101, + "step": 2611 + }, + { + "epoch": 0.645895153313551, + "grad_norm": 0.8042270905776066, + "learning_rate": 4.867855897578876e-06, + "loss": 0.4088, + "step": 2612 + }, + { + "epoch": 0.6461424332344213, + "grad_norm": 0.8118602734915629, + "learning_rate": 4.867751604104795e-06, + "loss": 0.4279, + "step": 2613 + }, + { + "epoch": 0.6463897131552918, + "grad_norm": 0.8157638003872915, + "learning_rate": 4.8676472706087655e-06, + "loss": 0.3937, + "step": 2614 + }, + { + "epoch": 0.6466369930761622, + "grad_norm": 0.7960341855478246, + "learning_rate": 4.867542897092551e-06, + "loss": 0.4683, + "step": 2615 + }, + { + "epoch": 0.6468842729970327, + "grad_norm": 0.8650800961392032, + "learning_rate": 4.867438483557916e-06, + "loss": 0.41, + "step": 2616 + }, + { + "epoch": 0.647131552917903, + "grad_norm": 0.7994797297627116, + "learning_rate": 4.867334030006624e-06, + "loss": 0.3952, + "step": 2617 + }, + { + "epoch": 0.6473788328387735, + "grad_norm": 0.8182129634502129, + "learning_rate": 4.867229536440442e-06, + "loss": 0.4538, + "step": 2618 + }, + { + "epoch": 0.6476261127596439, + "grad_norm": 0.8291856382801164, + "learning_rate": 4.867125002861136e-06, + "loss": 0.421, + "step": 2619 + }, + { + "epoch": 0.6478733926805144, + "grad_norm": 0.8143024931629848, + "learning_rate": 4.867020429270473e-06, + "loss": 0.4306, + "step": 2620 + }, + { + "epoch": 0.6481206726013847, + "grad_norm": 0.7831998758499393, + "learning_rate": 4.866915815670221e-06, + "loss": 0.4269, + "step": 2621 + }, + { + "epoch": 0.6483679525222552, + "grad_norm": 0.7830711376483871, + "learning_rate": 4.866811162062146e-06, + "loss": 0.4514, + "step": 2622 + }, + { + "epoch": 0.6486152324431256, + "grad_norm": 0.8088024343741351, + "learning_rate": 4.86670646844802e-06, + "loss": 0.4059, + "step": 2623 + }, + { + "epoch": 0.6488625123639961, + "grad_norm": 0.8646016295169404, + "learning_rate": 4.86660173482961e-06, + "loss": 0.4262, + "step": 2624 + }, + { + "epoch": 0.6491097922848664, + "grad_norm": 0.832596400950109, + "learning_rate": 4.866496961208689e-06, + "loss": 0.4289, + "step": 2625 + }, + { + "epoch": 0.6493570722057369, + "grad_norm": 0.8067755589491356, + "learning_rate": 4.866392147587026e-06, + "loss": 0.4026, + "step": 2626 + }, + { + "epoch": 0.6496043521266073, + "grad_norm": 0.7575527345016138, + "learning_rate": 4.8662872939663925e-06, + "loss": 0.4915, + "step": 2627 + }, + { + "epoch": 0.6498516320474778, + "grad_norm": 0.882603184161495, + "learning_rate": 4.866182400348562e-06, + "loss": 0.4028, + "step": 2628 + }, + { + "epoch": 0.6500989119683481, + "grad_norm": 0.820325274503156, + "learning_rate": 4.866077466735307e-06, + "loss": 0.4334, + "step": 2629 + }, + { + "epoch": 0.6503461918892186, + "grad_norm": 0.8311177188830288, + "learning_rate": 4.8659724931284014e-06, + "loss": 0.4505, + "step": 2630 + }, + { + "epoch": 0.650593471810089, + "grad_norm": 0.7836144268998563, + "learning_rate": 4.865867479529619e-06, + "loss": 0.4185, + "step": 2631 + }, + { + "epoch": 0.6508407517309595, + "grad_norm": 0.769144550632546, + "learning_rate": 4.865762425940735e-06, + "loss": 0.4447, + "step": 2632 + }, + { + "epoch": 0.6510880316518298, + "grad_norm": 0.7983531419796737, + "learning_rate": 4.865657332363526e-06, + "loss": 0.4451, + "step": 2633 + }, + { + "epoch": 0.6513353115727003, + "grad_norm": 0.8263434045298292, + "learning_rate": 4.865552198799767e-06, + "loss": 0.4073, + "step": 2634 + }, + { + "epoch": 0.6515825914935707, + "grad_norm": 0.8110069368531261, + "learning_rate": 4.865447025251237e-06, + "loss": 0.4525, + "step": 2635 + }, + { + "epoch": 0.6518298714144412, + "grad_norm": 0.8143820165970546, + "learning_rate": 4.86534181171971e-06, + "loss": 0.412, + "step": 2636 + }, + { + "epoch": 0.6520771513353115, + "grad_norm": 0.8320001375672842, + "learning_rate": 4.865236558206969e-06, + "loss": 0.4605, + "step": 2637 + }, + { + "epoch": 0.652324431256182, + "grad_norm": 0.8097042928776802, + "learning_rate": 4.865131264714791e-06, + "loss": 0.3915, + "step": 2638 + }, + { + "epoch": 0.6525717111770524, + "grad_norm": 0.7994068536215166, + "learning_rate": 4.865025931244955e-06, + "loss": 0.4416, + "step": 2639 + }, + { + "epoch": 0.6528189910979229, + "grad_norm": 0.833191717549721, + "learning_rate": 4.864920557799243e-06, + "loss": 0.3927, + "step": 2640 + }, + { + "epoch": 0.6530662710187932, + "grad_norm": 0.8287763555318937, + "learning_rate": 4.864815144379435e-06, + "loss": 0.4285, + "step": 2641 + }, + { + "epoch": 0.6533135509396637, + "grad_norm": 0.8376260262533985, + "learning_rate": 4.864709690987313e-06, + "loss": 0.4058, + "step": 2642 + }, + { + "epoch": 0.6535608308605341, + "grad_norm": 0.8012222473334901, + "learning_rate": 4.8646041976246595e-06, + "loss": 0.4342, + "step": 2643 + }, + { + "epoch": 0.6538081107814046, + "grad_norm": 0.7915340668570944, + "learning_rate": 4.864498664293258e-06, + "loss": 0.4375, + "step": 2644 + }, + { + "epoch": 0.654055390702275, + "grad_norm": 0.7914492576635661, + "learning_rate": 4.864393090994892e-06, + "loss": 0.4365, + "step": 2645 + }, + { + "epoch": 0.6543026706231454, + "grad_norm": 0.7693345141545859, + "learning_rate": 4.864287477731346e-06, + "loss": 0.4217, + "step": 2646 + }, + { + "epoch": 0.6545499505440159, + "grad_norm": 0.7878375532840123, + "learning_rate": 4.8641818245044065e-06, + "loss": 0.4313, + "step": 2647 + }, + { + "epoch": 0.6547972304648862, + "grad_norm": 0.776760393600863, + "learning_rate": 4.8640761313158565e-06, + "loss": 0.4121, + "step": 2648 + }, + { + "epoch": 0.6550445103857567, + "grad_norm": 0.8010628469676324, + "learning_rate": 4.8639703981674854e-06, + "loss": 0.4049, + "step": 2649 + }, + { + "epoch": 0.6552917903066271, + "grad_norm": 0.8374209078452766, + "learning_rate": 4.863864625061079e-06, + "loss": 0.3779, + "step": 2650 + }, + { + "epoch": 0.6555390702274976, + "grad_norm": 0.8174278279572348, + "learning_rate": 4.8637588119984245e-06, + "loss": 0.42, + "step": 2651 + }, + { + "epoch": 0.655786350148368, + "grad_norm": 0.7978953213607124, + "learning_rate": 4.863652958981312e-06, + "loss": 0.4014, + "step": 2652 + }, + { + "epoch": 0.6560336300692384, + "grad_norm": 0.839369529972231, + "learning_rate": 4.863547066011529e-06, + "loss": 0.4557, + "step": 2653 + }, + { + "epoch": 0.6562809099901088, + "grad_norm": 0.8486133636067605, + "learning_rate": 4.863441133090867e-06, + "loss": 0.4262, + "step": 2654 + }, + { + "epoch": 0.6565281899109793, + "grad_norm": 0.8001265604884367, + "learning_rate": 4.863335160221116e-06, + "loss": 0.3965, + "step": 2655 + }, + { + "epoch": 0.6567754698318496, + "grad_norm": 0.7686223929805192, + "learning_rate": 4.863229147404067e-06, + "loss": 0.4275, + "step": 2656 + }, + { + "epoch": 0.6570227497527201, + "grad_norm": 0.7772075862282518, + "learning_rate": 4.863123094641513e-06, + "loss": 0.4033, + "step": 2657 + }, + { + "epoch": 0.6572700296735905, + "grad_norm": 0.7775090647355424, + "learning_rate": 4.8630170019352455e-06, + "loss": 0.4308, + "step": 2658 + }, + { + "epoch": 0.657517309594461, + "grad_norm": 0.7985056923918036, + "learning_rate": 4.862910869287058e-06, + "loss": 0.4574, + "step": 2659 + }, + { + "epoch": 0.6577645895153313, + "grad_norm": 0.8176547204705846, + "learning_rate": 4.862804696698743e-06, + "loss": 0.4177, + "step": 2660 + }, + { + "epoch": 0.6580118694362018, + "grad_norm": 0.7913133572047072, + "learning_rate": 4.8626984841720985e-06, + "loss": 0.4181, + "step": 2661 + }, + { + "epoch": 0.6582591493570722, + "grad_norm": 0.8017262292735567, + "learning_rate": 4.862592231708917e-06, + "loss": 0.4138, + "step": 2662 + }, + { + "epoch": 0.6585064292779427, + "grad_norm": 0.823785805842316, + "learning_rate": 4.862485939310996e-06, + "loss": 0.4178, + "step": 2663 + }, + { + "epoch": 0.658753709198813, + "grad_norm": 0.8011536361874997, + "learning_rate": 4.862379606980131e-06, + "loss": 0.42, + "step": 2664 + }, + { + "epoch": 0.6590009891196835, + "grad_norm": 0.8131762554088784, + "learning_rate": 4.86227323471812e-06, + "loss": 0.4171, + "step": 2665 + }, + { + "epoch": 0.6592482690405539, + "grad_norm": 0.7743146594982807, + "learning_rate": 4.86216682252676e-06, + "loss": 0.4358, + "step": 2666 + }, + { + "epoch": 0.6594955489614244, + "grad_norm": 0.7984111657102294, + "learning_rate": 4.862060370407852e-06, + "loss": 0.4085, + "step": 2667 + }, + { + "epoch": 0.6597428288822947, + "grad_norm": 0.8081662281215509, + "learning_rate": 4.861953878363193e-06, + "loss": 0.406, + "step": 2668 + }, + { + "epoch": 0.6599901088031652, + "grad_norm": 0.847834361380235, + "learning_rate": 4.8618473463945846e-06, + "loss": 0.4487, + "step": 2669 + }, + { + "epoch": 0.6602373887240356, + "grad_norm": 0.8062903347818331, + "learning_rate": 4.861740774503827e-06, + "loss": 0.4064, + "step": 2670 + }, + { + "epoch": 0.6604846686449061, + "grad_norm": 0.8131245152077919, + "learning_rate": 4.861634162692721e-06, + "loss": 0.4103, + "step": 2671 + }, + { + "epoch": 0.6607319485657764, + "grad_norm": 0.780375927043985, + "learning_rate": 4.86152751096307e-06, + "loss": 0.4342, + "step": 2672 + }, + { + "epoch": 0.6609792284866469, + "grad_norm": 0.8053037494452797, + "learning_rate": 4.861420819316674e-06, + "loss": 0.4044, + "step": 2673 + }, + { + "epoch": 0.6612265084075173, + "grad_norm": 0.7860517182947623, + "learning_rate": 4.861314087755339e-06, + "loss": 0.4325, + "step": 2674 + }, + { + "epoch": 0.6614737883283878, + "grad_norm": 0.8359112628884461, + "learning_rate": 4.8612073162808685e-06, + "loss": 0.4046, + "step": 2675 + }, + { + "epoch": 0.6617210682492581, + "grad_norm": 0.8207892956870797, + "learning_rate": 4.861100504895067e-06, + "loss": 0.4135, + "step": 2676 + }, + { + "epoch": 0.6619683481701286, + "grad_norm": 0.825628094557653, + "learning_rate": 4.86099365359974e-06, + "loss": 0.4151, + "step": 2677 + }, + { + "epoch": 0.662215628090999, + "grad_norm": 0.8562140165251803, + "learning_rate": 4.860886762396694e-06, + "loss": 0.4417, + "step": 2678 + }, + { + "epoch": 0.6624629080118695, + "grad_norm": 0.8366896031458023, + "learning_rate": 4.860779831287735e-06, + "loss": 0.3959, + "step": 2679 + }, + { + "epoch": 0.6627101879327398, + "grad_norm": 0.862785681802581, + "learning_rate": 4.86067286027467e-06, + "loss": 0.4187, + "step": 2680 + }, + { + "epoch": 0.6629574678536103, + "grad_norm": 0.7714723304346427, + "learning_rate": 4.860565849359309e-06, + "loss": 0.4396, + "step": 2681 + }, + { + "epoch": 0.6632047477744807, + "grad_norm": 0.8142081920122305, + "learning_rate": 4.860458798543459e-06, + "loss": 0.428, + "step": 2682 + }, + { + "epoch": 0.6634520276953512, + "grad_norm": 0.7798010043722234, + "learning_rate": 4.8603517078289305e-06, + "loss": 0.4388, + "step": 2683 + }, + { + "epoch": 0.6636993076162215, + "grad_norm": 0.7755345393059091, + "learning_rate": 4.860244577217533e-06, + "loss": 0.4107, + "step": 2684 + }, + { + "epoch": 0.663946587537092, + "grad_norm": 0.7674983481753626, + "learning_rate": 4.860137406711079e-06, + "loss": 0.4143, + "step": 2685 + }, + { + "epoch": 0.6641938674579624, + "grad_norm": 0.8273272184248035, + "learning_rate": 4.860030196311377e-06, + "loss": 0.414, + "step": 2686 + }, + { + "epoch": 0.6644411473788329, + "grad_norm": 0.817916038266538, + "learning_rate": 4.859922946020241e-06, + "loss": 0.4047, + "step": 2687 + }, + { + "epoch": 0.6646884272997032, + "grad_norm": 0.8207357627703306, + "learning_rate": 4.8598156558394835e-06, + "loss": 0.4294, + "step": 2688 + }, + { + "epoch": 0.6649357072205737, + "grad_norm": 0.8194821245448161, + "learning_rate": 4.859708325770919e-06, + "loss": 0.4013, + "step": 2689 + }, + { + "epoch": 0.6651829871414441, + "grad_norm": 0.7698555674508966, + "learning_rate": 4.859600955816361e-06, + "loss": 0.4349, + "step": 2690 + }, + { + "epoch": 0.6654302670623146, + "grad_norm": 0.776934367789398, + "learning_rate": 4.859493545977624e-06, + "loss": 0.4098, + "step": 2691 + }, + { + "epoch": 0.6656775469831849, + "grad_norm": 0.8173710162756613, + "learning_rate": 4.859386096256523e-06, + "loss": 0.3915, + "step": 2692 + }, + { + "epoch": 0.6659248269040554, + "grad_norm": 0.8276330948906291, + "learning_rate": 4.859278606654876e-06, + "loss": 0.4198, + "step": 2693 + }, + { + "epoch": 0.6661721068249258, + "grad_norm": 0.775277988894492, + "learning_rate": 4.859171077174498e-06, + "loss": 0.4151, + "step": 2694 + }, + { + "epoch": 0.6664193867457963, + "grad_norm": 0.8367562396642206, + "learning_rate": 4.8590635078172086e-06, + "loss": 0.4578, + "step": 2695 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.8159711563660788, + "learning_rate": 4.858955898584824e-06, + "loss": 0.4159, + "step": 2696 + }, + { + "epoch": 0.6669139465875371, + "grad_norm": 0.8408357991825046, + "learning_rate": 4.858848249479165e-06, + "loss": 0.4011, + "step": 2697 + }, + { + "epoch": 0.6671612265084075, + "grad_norm": 0.841572463582256, + "learning_rate": 4.858740560502049e-06, + "loss": 0.4123, + "step": 2698 + }, + { + "epoch": 0.667408506429278, + "grad_norm": 0.8179256974131708, + "learning_rate": 4.8586328316552974e-06, + "loss": 0.439, + "step": 2699 + }, + { + "epoch": 0.6676557863501483, + "grad_norm": 0.8055382933055434, + "learning_rate": 4.858525062940732e-06, + "loss": 0.4276, + "step": 2700 + }, + { + "epoch": 0.6679030662710188, + "grad_norm": 0.8132449133301869, + "learning_rate": 4.858417254360173e-06, + "loss": 0.4247, + "step": 2701 + }, + { + "epoch": 0.6681503461918892, + "grad_norm": 0.8017582573504225, + "learning_rate": 4.858309405915443e-06, + "loss": 0.4138, + "step": 2702 + }, + { + "epoch": 0.6683976261127597, + "grad_norm": 0.8102328296173832, + "learning_rate": 4.858201517608366e-06, + "loss": 0.4615, + "step": 2703 + }, + { + "epoch": 0.66864490603363, + "grad_norm": 0.8565741417342391, + "learning_rate": 4.858093589440765e-06, + "loss": 0.4398, + "step": 2704 + }, + { + "epoch": 0.6688921859545005, + "grad_norm": 0.8203380198987199, + "learning_rate": 4.8579856214144635e-06, + "loss": 0.4477, + "step": 2705 + }, + { + "epoch": 0.6691394658753709, + "grad_norm": 0.8154367679562924, + "learning_rate": 4.8578776135312876e-06, + "loss": 0.4182, + "step": 2706 + }, + { + "epoch": 0.6693867457962414, + "grad_norm": 0.796586375252954, + "learning_rate": 4.8577695657930625e-06, + "loss": 0.417, + "step": 2707 + }, + { + "epoch": 0.6696340257171117, + "grad_norm": 0.7857952936500736, + "learning_rate": 4.857661478201614e-06, + "loss": 0.425, + "step": 2708 + }, + { + "epoch": 0.6698813056379822, + "grad_norm": 0.7839475383973936, + "learning_rate": 4.85755335075877e-06, + "loss": 0.4323, + "step": 2709 + }, + { + "epoch": 0.6701285855588526, + "grad_norm": 0.8317686188823157, + "learning_rate": 4.857445183466357e-06, + "loss": 0.4038, + "step": 2710 + }, + { + "epoch": 0.670375865479723, + "grad_norm": 0.7735299641172824, + "learning_rate": 4.857336976326205e-06, + "loss": 0.4349, + "step": 2711 + }, + { + "epoch": 0.6706231454005934, + "grad_norm": 0.7993198548331951, + "learning_rate": 4.857228729340142e-06, + "loss": 0.4505, + "step": 2712 + }, + { + "epoch": 0.6708704253214639, + "grad_norm": 0.8005486544012455, + "learning_rate": 4.8571204425099976e-06, + "loss": 0.4316, + "step": 2713 + }, + { + "epoch": 0.6711177052423343, + "grad_norm": 0.8235107641996137, + "learning_rate": 4.857012115837602e-06, + "loss": 0.414, + "step": 2714 + }, + { + "epoch": 0.6713649851632048, + "grad_norm": 0.7544707697511732, + "learning_rate": 4.856903749324787e-06, + "loss": 0.4061, + "step": 2715 + }, + { + "epoch": 0.6716122650840751, + "grad_norm": 0.837063786540837, + "learning_rate": 4.856795342973385e-06, + "loss": 0.4236, + "step": 2716 + }, + { + "epoch": 0.6718595450049456, + "grad_norm": 0.7794131174029756, + "learning_rate": 4.856686896785226e-06, + "loss": 0.4053, + "step": 2717 + }, + { + "epoch": 0.672106824925816, + "grad_norm": 0.8167949776353572, + "learning_rate": 4.856578410762145e-06, + "loss": 0.3939, + "step": 2718 + }, + { + "epoch": 0.6723541048466865, + "grad_norm": 0.8159754290439838, + "learning_rate": 4.856469884905974e-06, + "loss": 0.4235, + "step": 2719 + }, + { + "epoch": 0.6726013847675568, + "grad_norm": 0.7811653270798669, + "learning_rate": 4.8563613192185495e-06, + "loss": 0.3975, + "step": 2720 + }, + { + "epoch": 0.6728486646884273, + "grad_norm": 0.7749258733290558, + "learning_rate": 4.856252713701706e-06, + "loss": 0.4407, + "step": 2721 + }, + { + "epoch": 0.6730959446092978, + "grad_norm": 0.8416011342065808, + "learning_rate": 4.856144068357279e-06, + "loss": 0.4084, + "step": 2722 + }, + { + "epoch": 0.6733432245301681, + "grad_norm": 0.7965495607636626, + "learning_rate": 4.8560353831871035e-06, + "loss": 0.4325, + "step": 2723 + }, + { + "epoch": 0.6735905044510386, + "grad_norm": 0.8046182312914216, + "learning_rate": 4.855926658193019e-06, + "loss": 0.4498, + "step": 2724 + }, + { + "epoch": 0.673837784371909, + "grad_norm": 0.7897180049006142, + "learning_rate": 4.855817893376862e-06, + "loss": 0.4226, + "step": 2725 + }, + { + "epoch": 0.6740850642927795, + "grad_norm": 0.7673796995879293, + "learning_rate": 4.85570908874047e-06, + "loss": 0.4749, + "step": 2726 + }, + { + "epoch": 0.6743323442136498, + "grad_norm": 0.8066313586664126, + "learning_rate": 4.855600244285684e-06, + "loss": 0.4353, + "step": 2727 + }, + { + "epoch": 0.6745796241345203, + "grad_norm": 0.8043440775563024, + "learning_rate": 4.855491360014343e-06, + "loss": 0.4377, + "step": 2728 + }, + { + "epoch": 0.6748269040553907, + "grad_norm": 0.8052759257323537, + "learning_rate": 4.855382435928287e-06, + "loss": 0.4242, + "step": 2729 + }, + { + "epoch": 0.6750741839762612, + "grad_norm": 0.816064941591361, + "learning_rate": 4.855273472029358e-06, + "loss": 0.4073, + "step": 2730 + }, + { + "epoch": 0.6753214638971315, + "grad_norm": 0.7606831743034189, + "learning_rate": 4.855164468319398e-06, + "loss": 0.4127, + "step": 2731 + }, + { + "epoch": 0.675568743818002, + "grad_norm": 0.8075604491375517, + "learning_rate": 4.855055424800249e-06, + "loss": 0.4494, + "step": 2732 + }, + { + "epoch": 0.6758160237388724, + "grad_norm": 0.7534279728737382, + "learning_rate": 4.854946341473753e-06, + "loss": 0.4519, + "step": 2733 + }, + { + "epoch": 0.6760633036597429, + "grad_norm": 0.8069270952474595, + "learning_rate": 4.8548372183417556e-06, + "loss": 0.4065, + "step": 2734 + }, + { + "epoch": 0.6763105835806132, + "grad_norm": 0.8506776561935846, + "learning_rate": 4.854728055406101e-06, + "loss": 0.4306, + "step": 2735 + }, + { + "epoch": 0.6765578635014837, + "grad_norm": 0.8035438540666233, + "learning_rate": 4.854618852668632e-06, + "loss": 0.4606, + "step": 2736 + }, + { + "epoch": 0.6768051434223541, + "grad_norm": 0.7934236069117082, + "learning_rate": 4.854509610131198e-06, + "loss": 0.4424, + "step": 2737 + }, + { + "epoch": 0.6770524233432246, + "grad_norm": 0.7654027889805268, + "learning_rate": 4.854400327795644e-06, + "loss": 0.4361, + "step": 2738 + }, + { + "epoch": 0.6772997032640949, + "grad_norm": 0.8258240340169412, + "learning_rate": 4.854291005663816e-06, + "loss": 0.4033, + "step": 2739 + }, + { + "epoch": 0.6775469831849654, + "grad_norm": 0.8126929867632539, + "learning_rate": 4.854181643737564e-06, + "loss": 0.432, + "step": 2740 + }, + { + "epoch": 0.6777942631058358, + "grad_norm": 0.8059082743497984, + "learning_rate": 4.854072242018734e-06, + "loss": 0.422, + "step": 2741 + }, + { + "epoch": 0.6780415430267063, + "grad_norm": 0.8181609925538584, + "learning_rate": 4.853962800509179e-06, + "loss": 0.4423, + "step": 2742 + }, + { + "epoch": 0.6782888229475766, + "grad_norm": 0.7901271764796842, + "learning_rate": 4.853853319210745e-06, + "loss": 0.4251, + "step": 2743 + }, + { + "epoch": 0.6785361028684471, + "grad_norm": 0.8030211456123973, + "learning_rate": 4.853743798125285e-06, + "loss": 0.4312, + "step": 2744 + }, + { + "epoch": 0.6787833827893175, + "grad_norm": 0.7965447474952801, + "learning_rate": 4.8536342372546494e-06, + "loss": 0.4439, + "step": 2745 + }, + { + "epoch": 0.679030662710188, + "grad_norm": 0.7792761987365541, + "learning_rate": 4.85352463660069e-06, + "loss": 0.4361, + "step": 2746 + }, + { + "epoch": 0.6792779426310583, + "grad_norm": 0.8066811438330524, + "learning_rate": 4.853414996165258e-06, + "loss": 0.4504, + "step": 2747 + }, + { + "epoch": 0.6795252225519288, + "grad_norm": 0.8388942890539145, + "learning_rate": 4.85330531595021e-06, + "loss": 0.4208, + "step": 2748 + }, + { + "epoch": 0.6797725024727992, + "grad_norm": 0.8154470508323961, + "learning_rate": 4.853195595957398e-06, + "loss": 0.4192, + "step": 2749 + }, + { + "epoch": 0.6800197823936697, + "grad_norm": 0.7756267102531477, + "learning_rate": 4.853085836188676e-06, + "loss": 0.3995, + "step": 2750 + }, + { + "epoch": 0.68026706231454, + "grad_norm": 0.8401001254872341, + "learning_rate": 4.852976036645899e-06, + "loss": 0.4341, + "step": 2751 + }, + { + "epoch": 0.6805143422354105, + "grad_norm": 0.8190568217830237, + "learning_rate": 4.852866197330925e-06, + "loss": 0.432, + "step": 2752 + }, + { + "epoch": 0.6807616221562809, + "grad_norm": 0.776414872949731, + "learning_rate": 4.852756318245609e-06, + "loss": 0.412, + "step": 2753 + }, + { + "epoch": 0.6810089020771514, + "grad_norm": 0.7750759553622607, + "learning_rate": 4.852646399391808e-06, + "loss": 0.4521, + "step": 2754 + }, + { + "epoch": 0.6812561819980217, + "grad_norm": 0.8000941607465487, + "learning_rate": 4.8525364407713825e-06, + "loss": 0.4095, + "step": 2755 + }, + { + "epoch": 0.6815034619188922, + "grad_norm": 0.8196128109500314, + "learning_rate": 4.852426442386188e-06, + "loss": 0.441, + "step": 2756 + }, + { + "epoch": 0.6817507418397626, + "grad_norm": 0.8147986557860766, + "learning_rate": 4.852316404238085e-06, + "loss": 0.4571, + "step": 2757 + }, + { + "epoch": 0.6819980217606331, + "grad_norm": 0.8017639356034257, + "learning_rate": 4.8522063263289336e-06, + "loss": 0.4231, + "step": 2758 + }, + { + "epoch": 0.6822453016815034, + "grad_norm": 0.8257170171004764, + "learning_rate": 4.8520962086605945e-06, + "loss": 0.442, + "step": 2759 + }, + { + "epoch": 0.6824925816023739, + "grad_norm": 0.8078217424320431, + "learning_rate": 4.8519860512349295e-06, + "loss": 0.4357, + "step": 2760 + }, + { + "epoch": 0.6827398615232443, + "grad_norm": 0.8127981123442076, + "learning_rate": 4.851875854053799e-06, + "loss": 0.4018, + "step": 2761 + }, + { + "epoch": 0.6829871414441148, + "grad_norm": 0.808121246312467, + "learning_rate": 4.8517656171190665e-06, + "loss": 0.4395, + "step": 2762 + }, + { + "epoch": 0.6832344213649851, + "grad_norm": 0.8616502396837019, + "learning_rate": 4.8516553404325965e-06, + "loss": 0.421, + "step": 2763 + }, + { + "epoch": 0.6834817012858556, + "grad_norm": 0.8211179328198835, + "learning_rate": 4.851545023996252e-06, + "loss": 0.4151, + "step": 2764 + }, + { + "epoch": 0.683728981206726, + "grad_norm": 0.7971065636543179, + "learning_rate": 4.851434667811896e-06, + "loss": 0.3961, + "step": 2765 + }, + { + "epoch": 0.6839762611275965, + "grad_norm": 0.7959678571700568, + "learning_rate": 4.851324271881397e-06, + "loss": 0.4542, + "step": 2766 + }, + { + "epoch": 0.6842235410484668, + "grad_norm": 0.8297470002867344, + "learning_rate": 4.8512138362066185e-06, + "loss": 0.3831, + "step": 2767 + }, + { + "epoch": 0.6844708209693373, + "grad_norm": 0.8012939597110195, + "learning_rate": 4.851103360789428e-06, + "loss": 0.4234, + "step": 2768 + }, + { + "epoch": 0.6847181008902077, + "grad_norm": 0.8087329469986386, + "learning_rate": 4.850992845631694e-06, + "loss": 0.4243, + "step": 2769 + }, + { + "epoch": 0.6849653808110782, + "grad_norm": 0.8236396320864248, + "learning_rate": 4.850882290735283e-06, + "loss": 0.4246, + "step": 2770 + }, + { + "epoch": 0.6852126607319485, + "grad_norm": 0.8167830829538719, + "learning_rate": 4.850771696102066e-06, + "loss": 0.4284, + "step": 2771 + }, + { + "epoch": 0.685459940652819, + "grad_norm": 0.7654752845074516, + "learning_rate": 4.850661061733909e-06, + "loss": 0.4589, + "step": 2772 + }, + { + "epoch": 0.6857072205736894, + "grad_norm": 0.8547620176700507, + "learning_rate": 4.850550387632683e-06, + "loss": 0.4125, + "step": 2773 + }, + { + "epoch": 0.6859545004945599, + "grad_norm": 0.7917980650045635, + "learning_rate": 4.85043967380026e-06, + "loss": 0.4348, + "step": 2774 + }, + { + "epoch": 0.6862017804154302, + "grad_norm": 0.8345478477879663, + "learning_rate": 4.850328920238512e-06, + "loss": 0.4178, + "step": 2775 + }, + { + "epoch": 0.6864490603363007, + "grad_norm": 0.8332777404049719, + "learning_rate": 4.8502181269493084e-06, + "loss": 0.4122, + "step": 2776 + }, + { + "epoch": 0.6866963402571711, + "grad_norm": 0.7759039805494313, + "learning_rate": 4.850107293934524e-06, + "loss": 0.4256, + "step": 2777 + }, + { + "epoch": 0.6869436201780416, + "grad_norm": 0.8252611335287267, + "learning_rate": 4.849996421196031e-06, + "loss": 0.416, + "step": 2778 + }, + { + "epoch": 0.6871909000989119, + "grad_norm": 0.788591873666151, + "learning_rate": 4.849885508735704e-06, + "loss": 0.4711, + "step": 2779 + }, + { + "epoch": 0.6874381800197824, + "grad_norm": 0.8189554759316249, + "learning_rate": 4.849774556555419e-06, + "loss": 0.4125, + "step": 2780 + }, + { + "epoch": 0.6876854599406528, + "grad_norm": 0.7836784844389106, + "learning_rate": 4.849663564657049e-06, + "loss": 0.4231, + "step": 2781 + }, + { + "epoch": 0.6879327398615233, + "grad_norm": 0.8058795307766686, + "learning_rate": 4.849552533042472e-06, + "loss": 0.4194, + "step": 2782 + }, + { + "epoch": 0.6881800197823936, + "grad_norm": 0.7907983381026115, + "learning_rate": 4.8494414617135635e-06, + "loss": 0.4243, + "step": 2783 + }, + { + "epoch": 0.6884272997032641, + "grad_norm": 0.8183733832559971, + "learning_rate": 4.8493303506722025e-06, + "loss": 0.4179, + "step": 2784 + }, + { + "epoch": 0.6886745796241345, + "grad_norm": 0.8188930609183032, + "learning_rate": 4.849219199920266e-06, + "loss": 0.4405, + "step": 2785 + }, + { + "epoch": 0.688921859545005, + "grad_norm": 0.8305368966836002, + "learning_rate": 4.849108009459632e-06, + "loss": 0.4278, + "step": 2786 + }, + { + "epoch": 0.6891691394658753, + "grad_norm": 0.8366002879385498, + "learning_rate": 4.8489967792921806e-06, + "loss": 0.4391, + "step": 2787 + }, + { + "epoch": 0.6894164193867458, + "grad_norm": 0.7871906369909654, + "learning_rate": 4.848885509419793e-06, + "loss": 0.4691, + "step": 2788 + }, + { + "epoch": 0.6896636993076162, + "grad_norm": 0.8251697333494993, + "learning_rate": 4.848774199844348e-06, + "loss": 0.4225, + "step": 2789 + }, + { + "epoch": 0.6899109792284867, + "grad_norm": 0.8012321325556169, + "learning_rate": 4.848662850567729e-06, + "loss": 0.3791, + "step": 2790 + }, + { + "epoch": 0.690158259149357, + "grad_norm": 0.7895786794853111, + "learning_rate": 4.848551461591817e-06, + "loss": 0.4197, + "step": 2791 + }, + { + "epoch": 0.6904055390702275, + "grad_norm": 0.8122224358345084, + "learning_rate": 4.848440032918496e-06, + "loss": 0.4122, + "step": 2792 + }, + { + "epoch": 0.6906528189910979, + "grad_norm": 0.8011124583088066, + "learning_rate": 4.848328564549648e-06, + "loss": 0.3933, + "step": 2793 + }, + { + "epoch": 0.6909000989119684, + "grad_norm": 0.8069567267692628, + "learning_rate": 4.848217056487158e-06, + "loss": 0.4225, + "step": 2794 + }, + { + "epoch": 0.6911473788328387, + "grad_norm": 0.8025647116960141, + "learning_rate": 4.84810550873291e-06, + "loss": 0.4056, + "step": 2795 + }, + { + "epoch": 0.6913946587537092, + "grad_norm": 0.7795941271097871, + "learning_rate": 4.84799392128879e-06, + "loss": 0.4069, + "step": 2796 + }, + { + "epoch": 0.6916419386745796, + "grad_norm": 0.8206549077084073, + "learning_rate": 4.847882294156684e-06, + "loss": 0.4144, + "step": 2797 + }, + { + "epoch": 0.69188921859545, + "grad_norm": 0.7993709593343162, + "learning_rate": 4.847770627338479e-06, + "loss": 0.4189, + "step": 2798 + }, + { + "epoch": 0.6921364985163204, + "grad_norm": 0.7983736260715664, + "learning_rate": 4.847658920836063e-06, + "loss": 0.4471, + "step": 2799 + }, + { + "epoch": 0.6923837784371909, + "grad_norm": 0.8042491425514566, + "learning_rate": 4.847547174651325e-06, + "loss": 0.4155, + "step": 2800 + }, + { + "epoch": 0.6926310583580614, + "grad_norm": 0.8234388367328737, + "learning_rate": 4.84743538878615e-06, + "loss": 0.4272, + "step": 2801 + }, + { + "epoch": 0.6928783382789317, + "grad_norm": 0.7744695018229808, + "learning_rate": 4.847323563242431e-06, + "loss": 0.4224, + "step": 2802 + }, + { + "epoch": 0.6931256181998022, + "grad_norm": 0.7872893346801324, + "learning_rate": 4.847211698022058e-06, + "loss": 0.4265, + "step": 2803 + }, + { + "epoch": 0.6933728981206726, + "grad_norm": 0.8187757340572908, + "learning_rate": 4.84709979312692e-06, + "loss": 0.44, + "step": 2804 + }, + { + "epoch": 0.6936201780415431, + "grad_norm": 0.801267739135154, + "learning_rate": 4.84698784855891e-06, + "loss": 0.4532, + "step": 2805 + }, + { + "epoch": 0.6938674579624134, + "grad_norm": 0.8573956823059902, + "learning_rate": 4.84687586431992e-06, + "loss": 0.4569, + "step": 2806 + }, + { + "epoch": 0.6941147378832839, + "grad_norm": 0.810081502663985, + "learning_rate": 4.846763840411842e-06, + "loss": 0.4453, + "step": 2807 + }, + { + "epoch": 0.6943620178041543, + "grad_norm": 0.7970434492165938, + "learning_rate": 4.8466517768365705e-06, + "loss": 0.4149, + "step": 2808 + }, + { + "epoch": 0.6946092977250248, + "grad_norm": 0.7767778121773069, + "learning_rate": 4.846539673595999e-06, + "loss": 0.4063, + "step": 2809 + }, + { + "epoch": 0.6948565776458951, + "grad_norm": 0.8376570930059907, + "learning_rate": 4.846427530692023e-06, + "loss": 0.4213, + "step": 2810 + }, + { + "epoch": 0.6951038575667656, + "grad_norm": 0.8110481108261645, + "learning_rate": 4.846315348126538e-06, + "loss": 0.4211, + "step": 2811 + }, + { + "epoch": 0.695351137487636, + "grad_norm": 0.8193719014283674, + "learning_rate": 4.84620312590144e-06, + "loss": 0.4341, + "step": 2812 + }, + { + "epoch": 0.6955984174085065, + "grad_norm": 0.7941431266613678, + "learning_rate": 4.846090864018625e-06, + "loss": 0.4064, + "step": 2813 + }, + { + "epoch": 0.6958456973293768, + "grad_norm": 0.7931169995062347, + "learning_rate": 4.845978562479993e-06, + "loss": 0.4086, + "step": 2814 + }, + { + "epoch": 0.6960929772502473, + "grad_norm": 0.831336166924252, + "learning_rate": 4.84586622128744e-06, + "loss": 0.4216, + "step": 2815 + }, + { + "epoch": 0.6963402571711177, + "grad_norm": 0.7731150224481598, + "learning_rate": 4.845753840442865e-06, + "loss": 0.4164, + "step": 2816 + }, + { + "epoch": 0.6965875370919882, + "grad_norm": 0.7920674915282415, + "learning_rate": 4.845641419948168e-06, + "loss": 0.4356, + "step": 2817 + }, + { + "epoch": 0.6968348170128585, + "grad_norm": 0.875976281556326, + "learning_rate": 4.84552895980525e-06, + "loss": 0.4334, + "step": 2818 + }, + { + "epoch": 0.697082096933729, + "grad_norm": 0.8001556368006945, + "learning_rate": 4.845416460016011e-06, + "loss": 0.4052, + "step": 2819 + }, + { + "epoch": 0.6973293768545994, + "grad_norm": 0.7875811128771384, + "learning_rate": 4.845303920582353e-06, + "loss": 0.4402, + "step": 2820 + }, + { + "epoch": 0.6975766567754699, + "grad_norm": 0.8171665238112196, + "learning_rate": 4.845191341506178e-06, + "loss": 0.4224, + "step": 2821 + }, + { + "epoch": 0.6978239366963402, + "grad_norm": 0.7908125499885357, + "learning_rate": 4.845078722789388e-06, + "loss": 0.4352, + "step": 2822 + }, + { + "epoch": 0.6980712166172107, + "grad_norm": 0.823467984833555, + "learning_rate": 4.844966064433889e-06, + "loss": 0.4124, + "step": 2823 + }, + { + "epoch": 0.6983184965380811, + "grad_norm": 0.7508391242666943, + "learning_rate": 4.844853366441583e-06, + "loss": 0.4066, + "step": 2824 + }, + { + "epoch": 0.6985657764589516, + "grad_norm": 0.7815866665699481, + "learning_rate": 4.844740628814376e-06, + "loss": 0.4236, + "step": 2825 + }, + { + "epoch": 0.6988130563798219, + "grad_norm": 0.797117432450336, + "learning_rate": 4.8446278515541735e-06, + "loss": 0.3852, + "step": 2826 + }, + { + "epoch": 0.6990603363006924, + "grad_norm": 0.7708383132040468, + "learning_rate": 4.844515034662882e-06, + "loss": 0.4233, + "step": 2827 + }, + { + "epoch": 0.6993076162215628, + "grad_norm": 0.8234459455209941, + "learning_rate": 4.844402178142408e-06, + "loss": 0.4306, + "step": 2828 + }, + { + "epoch": 0.6995548961424333, + "grad_norm": 0.7891526183943096, + "learning_rate": 4.844289281994659e-06, + "loss": 0.43, + "step": 2829 + }, + { + "epoch": 0.6998021760633036, + "grad_norm": 0.8269880587344088, + "learning_rate": 4.844176346221543e-06, + "loss": 0.382, + "step": 2830 + }, + { + "epoch": 0.7000494559841741, + "grad_norm": 0.7979649115794843, + "learning_rate": 4.844063370824969e-06, + "loss": 0.4233, + "step": 2831 + }, + { + "epoch": 0.7002967359050445, + "grad_norm": 0.7802581744052987, + "learning_rate": 4.843950355806848e-06, + "loss": 0.4148, + "step": 2832 + }, + { + "epoch": 0.700544015825915, + "grad_norm": 0.7770547238028475, + "learning_rate": 4.84383730116909e-06, + "loss": 0.4468, + "step": 2833 + }, + { + "epoch": 0.7007912957467853, + "grad_norm": 0.79419809474251, + "learning_rate": 4.843724206913604e-06, + "loss": 0.4328, + "step": 2834 + }, + { + "epoch": 0.7010385756676558, + "grad_norm": 0.8038231205976583, + "learning_rate": 4.843611073042303e-06, + "loss": 0.4266, + "step": 2835 + }, + { + "epoch": 0.7012858555885262, + "grad_norm": 0.7842622977893237, + "learning_rate": 4.843497899557099e-06, + "loss": 0.4553, + "step": 2836 + }, + { + "epoch": 0.7015331355093967, + "grad_norm": 0.8707104909836916, + "learning_rate": 4.843384686459906e-06, + "loss": 0.4047, + "step": 2837 + }, + { + "epoch": 0.701780415430267, + "grad_norm": 0.8075300737210989, + "learning_rate": 4.843271433752635e-06, + "loss": 0.3966, + "step": 2838 + }, + { + "epoch": 0.7020276953511375, + "grad_norm": 0.7771082005270675, + "learning_rate": 4.843158141437204e-06, + "loss": 0.4482, + "step": 2839 + }, + { + "epoch": 0.7022749752720079, + "grad_norm": 0.826855389864659, + "learning_rate": 4.843044809515525e-06, + "loss": 0.4077, + "step": 2840 + }, + { + "epoch": 0.7025222551928784, + "grad_norm": 0.8162412200875937, + "learning_rate": 4.842931437989515e-06, + "loss": 0.4583, + "step": 2841 + }, + { + "epoch": 0.7027695351137487, + "grad_norm": 0.8050089615685367, + "learning_rate": 4.84281802686109e-06, + "loss": 0.3962, + "step": 2842 + }, + { + "epoch": 0.7030168150346192, + "grad_norm": 0.8163750044094787, + "learning_rate": 4.8427045761321675e-06, + "loss": 0.4327, + "step": 2843 + }, + { + "epoch": 0.7032640949554896, + "grad_norm": 0.8563912067654783, + "learning_rate": 4.842591085804664e-06, + "loss": 0.4566, + "step": 2844 + }, + { + "epoch": 0.7035113748763601, + "grad_norm": 0.8223661537798799, + "learning_rate": 4.842477555880498e-06, + "loss": 0.4242, + "step": 2845 + }, + { + "epoch": 0.7037586547972304, + "grad_norm": 0.7730639792659905, + "learning_rate": 4.84236398636159e-06, + "loss": 0.4461, + "step": 2846 + }, + { + "epoch": 0.7040059347181009, + "grad_norm": 0.7769243044865499, + "learning_rate": 4.842250377249858e-06, + "loss": 0.4306, + "step": 2847 + }, + { + "epoch": 0.7042532146389713, + "grad_norm": 0.8167018892846426, + "learning_rate": 4.842136728547223e-06, + "loss": 0.4237, + "step": 2848 + }, + { + "epoch": 0.7045004945598418, + "grad_norm": 0.8385873076201484, + "learning_rate": 4.842023040255606e-06, + "loss": 0.41, + "step": 2849 + }, + { + "epoch": 0.7047477744807121, + "grad_norm": 0.794449770554456, + "learning_rate": 4.841909312376928e-06, + "loss": 0.3948, + "step": 2850 + }, + { + "epoch": 0.7049950544015826, + "grad_norm": 0.8085818080942666, + "learning_rate": 4.841795544913112e-06, + "loss": 0.4333, + "step": 2851 + }, + { + "epoch": 0.705242334322453, + "grad_norm": 0.8347190989084778, + "learning_rate": 4.841681737866082e-06, + "loss": 0.4061, + "step": 2852 + }, + { + "epoch": 0.7054896142433235, + "grad_norm": 0.7808124598357045, + "learning_rate": 4.84156789123776e-06, + "loss": 0.4412, + "step": 2853 + }, + { + "epoch": 0.7057368941641938, + "grad_norm": 0.8393493189297654, + "learning_rate": 4.841454005030071e-06, + "loss": 0.415, + "step": 2854 + }, + { + "epoch": 0.7059841740850643, + "grad_norm": 0.786373432859627, + "learning_rate": 4.84134007924494e-06, + "loss": 0.4337, + "step": 2855 + }, + { + "epoch": 0.7062314540059347, + "grad_norm": 0.7791712896571225, + "learning_rate": 4.841226113884292e-06, + "loss": 0.4308, + "step": 2856 + }, + { + "epoch": 0.7064787339268052, + "grad_norm": 0.7758379505569559, + "learning_rate": 4.841112108950055e-06, + "loss": 0.4105, + "step": 2857 + }, + { + "epoch": 0.7067260138476755, + "grad_norm": 0.7336906709851186, + "learning_rate": 4.840998064444154e-06, + "loss": 0.4304, + "step": 2858 + }, + { + "epoch": 0.706973293768546, + "grad_norm": 0.7709631481004076, + "learning_rate": 4.840883980368518e-06, + "loss": 0.4417, + "step": 2859 + }, + { + "epoch": 0.7072205736894164, + "grad_norm": 0.861736396380686, + "learning_rate": 4.840769856725076e-06, + "loss": 0.426, + "step": 2860 + }, + { + "epoch": 0.7074678536102869, + "grad_norm": 0.8644517295452442, + "learning_rate": 4.840655693515754e-06, + "loss": 0.3996, + "step": 2861 + }, + { + "epoch": 0.7077151335311572, + "grad_norm": 0.7940413003498069, + "learning_rate": 4.840541490742485e-06, + "loss": 0.4244, + "step": 2862 + }, + { + "epoch": 0.7079624134520277, + "grad_norm": 0.8098229584210634, + "learning_rate": 4.840427248407199e-06, + "loss": 0.4284, + "step": 2863 + }, + { + "epoch": 0.7082096933728981, + "grad_norm": 0.8133450773470225, + "learning_rate": 4.840312966511825e-06, + "loss": 0.4395, + "step": 2864 + }, + { + "epoch": 0.7084569732937686, + "grad_norm": 0.7919331528428261, + "learning_rate": 4.840198645058296e-06, + "loss": 0.447, + "step": 2865 + }, + { + "epoch": 0.7087042532146389, + "grad_norm": 0.7965762781992587, + "learning_rate": 4.840084284048544e-06, + "loss": 0.4315, + "step": 2866 + }, + { + "epoch": 0.7089515331355094, + "grad_norm": 0.8215063582230092, + "learning_rate": 4.839969883484502e-06, + "loss": 0.4236, + "step": 2867 + }, + { + "epoch": 0.7091988130563798, + "grad_norm": 0.8198839600468166, + "learning_rate": 4.8398554433681056e-06, + "loss": 0.4358, + "step": 2868 + }, + { + "epoch": 0.7094460929772503, + "grad_norm": 0.7736256143569525, + "learning_rate": 4.839740963701286e-06, + "loss": 0.4558, + "step": 2869 + }, + { + "epoch": 0.7096933728981206, + "grad_norm": 0.7829134148515651, + "learning_rate": 4.83962644448598e-06, + "loss": 0.4205, + "step": 2870 + }, + { + "epoch": 0.7099406528189911, + "grad_norm": 0.8252334612510036, + "learning_rate": 4.839511885724123e-06, + "loss": 0.4337, + "step": 2871 + }, + { + "epoch": 0.7101879327398615, + "grad_norm": 0.8022554245927136, + "learning_rate": 4.839397287417652e-06, + "loss": 0.457, + "step": 2872 + }, + { + "epoch": 0.710435212660732, + "grad_norm": 0.8192708918207695, + "learning_rate": 4.8392826495685036e-06, + "loss": 0.4167, + "step": 2873 + }, + { + "epoch": 0.7106824925816023, + "grad_norm": 0.803077397303975, + "learning_rate": 4.839167972178615e-06, + "loss": 0.4098, + "step": 2874 + }, + { + "epoch": 0.7109297725024728, + "grad_norm": 0.8468082968841901, + "learning_rate": 4.839053255249925e-06, + "loss": 0.4242, + "step": 2875 + }, + { + "epoch": 0.7111770524233432, + "grad_norm": 0.8148986206105447, + "learning_rate": 4.838938498784373e-06, + "loss": 0.4286, + "step": 2876 + }, + { + "epoch": 0.7114243323442137, + "grad_norm": 0.8170745630142042, + "learning_rate": 4.838823702783898e-06, + "loss": 0.423, + "step": 2877 + }, + { + "epoch": 0.7116716122650841, + "grad_norm": 0.7996079185522241, + "learning_rate": 4.838708867250441e-06, + "loss": 0.4333, + "step": 2878 + }, + { + "epoch": 0.7119188921859545, + "grad_norm": 0.7439191783301492, + "learning_rate": 4.838593992185942e-06, + "loss": 0.4235, + "step": 2879 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 0.7948869447365353, + "learning_rate": 4.838479077592345e-06, + "loss": 0.4185, + "step": 2880 + }, + { + "epoch": 0.7124134520276953, + "grad_norm": 0.7963172123490325, + "learning_rate": 4.83836412347159e-06, + "loss": 0.4368, + "step": 2881 + }, + { + "epoch": 0.7126607319485658, + "grad_norm": 0.7621389475987713, + "learning_rate": 4.838249129825622e-06, + "loss": 0.4544, + "step": 2882 + }, + { + "epoch": 0.7129080118694362, + "grad_norm": 0.8008573013469324, + "learning_rate": 4.838134096656383e-06, + "loss": 0.4303, + "step": 2883 + }, + { + "epoch": 0.7131552917903067, + "grad_norm": 0.7845189586215126, + "learning_rate": 4.838019023965818e-06, + "loss": 0.4131, + "step": 2884 + }, + { + "epoch": 0.713402571711177, + "grad_norm": 0.8049398377356689, + "learning_rate": 4.837903911755872e-06, + "loss": 0.4288, + "step": 2885 + }, + { + "epoch": 0.7136498516320475, + "grad_norm": 0.804018737817397, + "learning_rate": 4.837788760028491e-06, + "loss": 0.422, + "step": 2886 + }, + { + "epoch": 0.7138971315529179, + "grad_norm": 0.8148238252599712, + "learning_rate": 4.8376735687856215e-06, + "loss": 0.4527, + "step": 2887 + }, + { + "epoch": 0.7141444114737884, + "grad_norm": 0.8173379766329714, + "learning_rate": 4.837558338029211e-06, + "loss": 0.4449, + "step": 2888 + }, + { + "epoch": 0.7143916913946587, + "grad_norm": 0.8174244199346332, + "learning_rate": 4.837443067761206e-06, + "loss": 0.4345, + "step": 2889 + }, + { + "epoch": 0.7146389713155292, + "grad_norm": 0.7358219327556542, + "learning_rate": 4.837327757983556e-06, + "loss": 0.4379, + "step": 2890 + }, + { + "epoch": 0.7148862512363996, + "grad_norm": 0.7666685418017337, + "learning_rate": 4.837212408698209e-06, + "loss": 0.4546, + "step": 2891 + }, + { + "epoch": 0.7151335311572701, + "grad_norm": 0.7880217194687349, + "learning_rate": 4.837097019907116e-06, + "loss": 0.3937, + "step": 2892 + }, + { + "epoch": 0.7153808110781404, + "grad_norm": 0.7914607948486829, + "learning_rate": 4.836981591612226e-06, + "loss": 0.4528, + "step": 2893 + }, + { + "epoch": 0.7156280909990109, + "grad_norm": 0.8349074820163436, + "learning_rate": 4.836866123815492e-06, + "loss": 0.4242, + "step": 2894 + }, + { + "epoch": 0.7158753709198813, + "grad_norm": 0.7726278159771962, + "learning_rate": 4.836750616518864e-06, + "loss": 0.4623, + "step": 2895 + }, + { + "epoch": 0.7161226508407518, + "grad_norm": 0.8124863076626552, + "learning_rate": 4.836635069724295e-06, + "loss": 0.3905, + "step": 2896 + }, + { + "epoch": 0.7163699307616221, + "grad_norm": 0.7841970322379082, + "learning_rate": 4.836519483433738e-06, + "loss": 0.42, + "step": 2897 + }, + { + "epoch": 0.7166172106824926, + "grad_norm": 0.8169770200971669, + "learning_rate": 4.8364038576491465e-06, + "loss": 0.4281, + "step": 2898 + }, + { + "epoch": 0.716864490603363, + "grad_norm": 0.7934550788486586, + "learning_rate": 4.836288192372476e-06, + "loss": 0.4258, + "step": 2899 + }, + { + "epoch": 0.7171117705242335, + "grad_norm": 0.7811046579681694, + "learning_rate": 4.8361724876056804e-06, + "loss": 0.4334, + "step": 2900 + }, + { + "epoch": 0.7173590504451038, + "grad_norm": 0.8288956689415943, + "learning_rate": 4.836056743350717e-06, + "loss": 0.4114, + "step": 2901 + }, + { + "epoch": 0.7176063303659743, + "grad_norm": 0.8100905415131514, + "learning_rate": 4.83594095960954e-06, + "loss": 0.4193, + "step": 2902 + }, + { + "epoch": 0.7178536102868447, + "grad_norm": 0.8349181394617355, + "learning_rate": 4.835825136384107e-06, + "loss": 0.4089, + "step": 2903 + }, + { + "epoch": 0.7181008902077152, + "grad_norm": 0.8189708968152819, + "learning_rate": 4.835709273676377e-06, + "loss": 0.3892, + "step": 2904 + }, + { + "epoch": 0.7183481701285855, + "grad_norm": 0.8103038676070768, + "learning_rate": 4.835593371488308e-06, + "loss": 0.4453, + "step": 2905 + }, + { + "epoch": 0.718595450049456, + "grad_norm": 0.8690922003528119, + "learning_rate": 4.835477429821859e-06, + "loss": 0.4004, + "step": 2906 + }, + { + "epoch": 0.7188427299703264, + "grad_norm": 0.8919261323122275, + "learning_rate": 4.835361448678989e-06, + "loss": 0.3887, + "step": 2907 + }, + { + "epoch": 0.7190900098911969, + "grad_norm": 0.8034616323291977, + "learning_rate": 4.835245428061659e-06, + "loss": 0.4287, + "step": 2908 + }, + { + "epoch": 0.7193372898120672, + "grad_norm": 0.8147872678673408, + "learning_rate": 4.8351293679718305e-06, + "loss": 0.4226, + "step": 2909 + }, + { + "epoch": 0.7195845697329377, + "grad_norm": 0.819089910267115, + "learning_rate": 4.835013268411465e-06, + "loss": 0.4631, + "step": 2910 + }, + { + "epoch": 0.7198318496538081, + "grad_norm": 0.7978691069618967, + "learning_rate": 4.8348971293825245e-06, + "loss": 0.4237, + "step": 2911 + }, + { + "epoch": 0.7200791295746786, + "grad_norm": 0.8174621419121835, + "learning_rate": 4.834780950886973e-06, + "loss": 0.4376, + "step": 2912 + }, + { + "epoch": 0.7203264094955489, + "grad_norm": 0.8309583432333282, + "learning_rate": 4.834664732926773e-06, + "loss": 0.4429, + "step": 2913 + }, + { + "epoch": 0.7205736894164194, + "grad_norm": 0.8000154938904277, + "learning_rate": 4.8345484755038895e-06, + "loss": 0.4185, + "step": 2914 + }, + { + "epoch": 0.7208209693372898, + "grad_norm": 0.7947941622328786, + "learning_rate": 4.834432178620288e-06, + "loss": 0.414, + "step": 2915 + }, + { + "epoch": 0.7210682492581603, + "grad_norm": 0.7923969199565712, + "learning_rate": 4.834315842277934e-06, + "loss": 0.4357, + "step": 2916 + }, + { + "epoch": 0.7213155291790306, + "grad_norm": 0.7960039620189656, + "learning_rate": 4.834199466478793e-06, + "loss": 0.438, + "step": 2917 + }, + { + "epoch": 0.7215628090999011, + "grad_norm": 0.8361359088575793, + "learning_rate": 4.8340830512248335e-06, + "loss": 0.3967, + "step": 2918 + }, + { + "epoch": 0.7218100890207715, + "grad_norm": 0.8108901771349343, + "learning_rate": 4.833966596518023e-06, + "loss": 0.4363, + "step": 2919 + }, + { + "epoch": 0.722057368941642, + "grad_norm": 0.8549440580224905, + "learning_rate": 4.833850102360329e-06, + "loss": 0.41, + "step": 2920 + }, + { + "epoch": 0.7223046488625123, + "grad_norm": 0.8003186776319219, + "learning_rate": 4.833733568753721e-06, + "loss": 0.4216, + "step": 2921 + }, + { + "epoch": 0.7225519287833828, + "grad_norm": 0.8035872812352234, + "learning_rate": 4.83361699570017e-06, + "loss": 0.4182, + "step": 2922 + }, + { + "epoch": 0.7227992087042532, + "grad_norm": 0.8297400236558385, + "learning_rate": 4.8335003832016444e-06, + "loss": 0.4316, + "step": 2923 + }, + { + "epoch": 0.7230464886251237, + "grad_norm": 0.8790681235417003, + "learning_rate": 4.833383731260118e-06, + "loss": 0.4063, + "step": 2924 + }, + { + "epoch": 0.723293768545994, + "grad_norm": 0.8268063083338992, + "learning_rate": 4.833267039877559e-06, + "loss": 0.3856, + "step": 2925 + }, + { + "epoch": 0.7235410484668645, + "grad_norm": 0.8406419166486392, + "learning_rate": 4.833150309055942e-06, + "loss": 0.4236, + "step": 2926 + }, + { + "epoch": 0.7237883283877349, + "grad_norm": 0.7926519696834897, + "learning_rate": 4.833033538797241e-06, + "loss": 0.4174, + "step": 2927 + }, + { + "epoch": 0.7240356083086054, + "grad_norm": 0.8187942841023771, + "learning_rate": 4.832916729103427e-06, + "loss": 0.4387, + "step": 2928 + }, + { + "epoch": 0.7242828882294757, + "grad_norm": 0.8221501951469953, + "learning_rate": 4.832799879976476e-06, + "loss": 0.3999, + "step": 2929 + }, + { + "epoch": 0.7245301681503462, + "grad_norm": 0.7934800047804927, + "learning_rate": 4.832682991418364e-06, + "loss": 0.4058, + "step": 2930 + }, + { + "epoch": 0.7247774480712166, + "grad_norm": 0.8330724261819362, + "learning_rate": 4.832566063431066e-06, + "loss": 0.4321, + "step": 2931 + }, + { + "epoch": 0.7250247279920871, + "grad_norm": 0.8228536121668583, + "learning_rate": 4.832449096016557e-06, + "loss": 0.4173, + "step": 2932 + }, + { + "epoch": 0.7252720079129574, + "grad_norm": 0.837175853076657, + "learning_rate": 4.8323320891768166e-06, + "loss": 0.4395, + "step": 2933 + }, + { + "epoch": 0.7255192878338279, + "grad_norm": 0.8578406462579389, + "learning_rate": 4.83221504291382e-06, + "loss": 0.4025, + "step": 2934 + }, + { + "epoch": 0.7257665677546983, + "grad_norm": 0.8276438543678113, + "learning_rate": 4.832097957229548e-06, + "loss": 0.4128, + "step": 2935 + }, + { + "epoch": 0.7260138476755688, + "grad_norm": 0.7945420798891535, + "learning_rate": 4.831980832125978e-06, + "loss": 0.4318, + "step": 2936 + }, + { + "epoch": 0.7262611275964391, + "grad_norm": 0.8624450431948255, + "learning_rate": 4.8318636676050906e-06, + "loss": 0.4077, + "step": 2937 + }, + { + "epoch": 0.7265084075173096, + "grad_norm": 0.7943945249796411, + "learning_rate": 4.831746463668866e-06, + "loss": 0.4147, + "step": 2938 + }, + { + "epoch": 0.72675568743818, + "grad_norm": 0.7865076359278911, + "learning_rate": 4.831629220319285e-06, + "loss": 0.4437, + "step": 2939 + }, + { + "epoch": 0.7270029673590505, + "grad_norm": 0.8245008586056316, + "learning_rate": 4.83151193755833e-06, + "loss": 0.4172, + "step": 2940 + }, + { + "epoch": 0.7272502472799208, + "grad_norm": 0.8682372634570225, + "learning_rate": 4.831394615387983e-06, + "loss": 0.423, + "step": 2941 + }, + { + "epoch": 0.7274975272007913, + "grad_norm": 0.8425489215333106, + "learning_rate": 4.831277253810227e-06, + "loss": 0.4299, + "step": 2942 + }, + { + "epoch": 0.7277448071216617, + "grad_norm": 0.8057793479253231, + "learning_rate": 4.831159852827046e-06, + "loss": 0.4122, + "step": 2943 + }, + { + "epoch": 0.7279920870425322, + "grad_norm": 0.7887003730131898, + "learning_rate": 4.831042412440424e-06, + "loss": 0.4345, + "step": 2944 + }, + { + "epoch": 0.7282393669634025, + "grad_norm": 0.8003641551533328, + "learning_rate": 4.8309249326523475e-06, + "loss": 0.4256, + "step": 2945 + }, + { + "epoch": 0.728486646884273, + "grad_norm": 0.8050096172563999, + "learning_rate": 4.8308074134648e-06, + "loss": 0.4363, + "step": 2946 + }, + { + "epoch": 0.7287339268051434, + "grad_norm": 0.8501449867311948, + "learning_rate": 4.83068985487977e-06, + "loss": 0.4167, + "step": 2947 + }, + { + "epoch": 0.7289812067260139, + "grad_norm": 0.7875128051256901, + "learning_rate": 4.830572256899243e-06, + "loss": 0.4462, + "step": 2948 + }, + { + "epoch": 0.7292284866468842, + "grad_norm": 0.8374013219223678, + "learning_rate": 4.830454619525207e-06, + "loss": 0.4518, + "step": 2949 + }, + { + "epoch": 0.7294757665677547, + "grad_norm": 0.817008859188431, + "learning_rate": 4.830336942759651e-06, + "loss": 0.3903, + "step": 2950 + }, + { + "epoch": 0.7297230464886251, + "grad_norm": 0.8014171810642367, + "learning_rate": 4.830219226604565e-06, + "loss": 0.4285, + "step": 2951 + }, + { + "epoch": 0.7299703264094956, + "grad_norm": 0.7856834039638679, + "learning_rate": 4.830101471061936e-06, + "loss": 0.3934, + "step": 2952 + }, + { + "epoch": 0.7302176063303659, + "grad_norm": 0.7748087866010082, + "learning_rate": 4.829983676133758e-06, + "loss": 0.4545, + "step": 2953 + }, + { + "epoch": 0.7304648862512364, + "grad_norm": 0.7989513159267758, + "learning_rate": 4.829865841822019e-06, + "loss": 0.4253, + "step": 2954 + }, + { + "epoch": 0.7307121661721068, + "grad_norm": 0.7764358392610177, + "learning_rate": 4.829747968128712e-06, + "loss": 0.4249, + "step": 2955 + }, + { + "epoch": 0.7309594460929772, + "grad_norm": 0.8279849821092882, + "learning_rate": 4.829630055055829e-06, + "loss": 0.4512, + "step": 2956 + }, + { + "epoch": 0.7312067260138477, + "grad_norm": 0.8260366275350098, + "learning_rate": 4.8295121026053644e-06, + "loss": 0.4295, + "step": 2957 + }, + { + "epoch": 0.7314540059347181, + "grad_norm": 0.8117838736807556, + "learning_rate": 4.82939411077931e-06, + "loss": 0.4171, + "step": 2958 + }, + { + "epoch": 0.7317012858555886, + "grad_norm": 0.7806874920384149, + "learning_rate": 4.829276079579662e-06, + "loss": 0.4076, + "step": 2959 + }, + { + "epoch": 0.731948565776459, + "grad_norm": 0.8156644012373938, + "learning_rate": 4.829158009008414e-06, + "loss": 0.4367, + "step": 2960 + }, + { + "epoch": 0.7321958456973294, + "grad_norm": 0.8361228747154713, + "learning_rate": 4.829039899067563e-06, + "loss": 0.4495, + "step": 2961 + }, + { + "epoch": 0.7324431256181998, + "grad_norm": 0.7970591862132284, + "learning_rate": 4.828921749759104e-06, + "loss": 0.4057, + "step": 2962 + }, + { + "epoch": 0.7326904055390703, + "grad_norm": 0.799202037507185, + "learning_rate": 4.828803561085034e-06, + "loss": 0.4048, + "step": 2963 + }, + { + "epoch": 0.7329376854599406, + "grad_norm": 0.7759952798464693, + "learning_rate": 4.8286853330473535e-06, + "loss": 0.4005, + "step": 2964 + }, + { + "epoch": 0.7331849653808111, + "grad_norm": 0.8369606917142974, + "learning_rate": 4.828567065648057e-06, + "loss": 0.4266, + "step": 2965 + }, + { + "epoch": 0.7334322453016815, + "grad_norm": 0.8127543973780628, + "learning_rate": 4.828448758889147e-06, + "loss": 0.3879, + "step": 2966 + }, + { + "epoch": 0.733679525222552, + "grad_norm": 0.834168778505788, + "learning_rate": 4.828330412772622e-06, + "loss": 0.4255, + "step": 2967 + }, + { + "epoch": 0.7339268051434223, + "grad_norm": 0.7851856770790845, + "learning_rate": 4.828212027300481e-06, + "loss": 0.4608, + "step": 2968 + }, + { + "epoch": 0.7341740850642928, + "grad_norm": 0.7632714078688472, + "learning_rate": 4.828093602474727e-06, + "loss": 0.4416, + "step": 2969 + }, + { + "epoch": 0.7344213649851632, + "grad_norm": 0.8347666891173107, + "learning_rate": 4.827975138297361e-06, + "loss": 0.4122, + "step": 2970 + }, + { + "epoch": 0.7346686449060337, + "grad_norm": 0.8067829929634867, + "learning_rate": 4.827856634770385e-06, + "loss": 0.4411, + "step": 2971 + }, + { + "epoch": 0.734915924826904, + "grad_norm": 0.8017276159688024, + "learning_rate": 4.8277380918958015e-06, + "loss": 0.3973, + "step": 2972 + }, + { + "epoch": 0.7351632047477745, + "grad_norm": 0.812076660288236, + "learning_rate": 4.827619509675616e-06, + "loss": 0.4197, + "step": 2973 + }, + { + "epoch": 0.7354104846686449, + "grad_norm": 0.7788714236767956, + "learning_rate": 4.827500888111833e-06, + "loss": 0.4337, + "step": 2974 + }, + { + "epoch": 0.7356577645895154, + "grad_norm": 0.8090103487550155, + "learning_rate": 4.8273822272064555e-06, + "loss": 0.419, + "step": 2975 + }, + { + "epoch": 0.7359050445103857, + "grad_norm": 0.8265966587257293, + "learning_rate": 4.8272635269614895e-06, + "loss": 0.4082, + "step": 2976 + }, + { + "epoch": 0.7361523244312562, + "grad_norm": 0.7846445990321484, + "learning_rate": 4.827144787378944e-06, + "loss": 0.437, + "step": 2977 + }, + { + "epoch": 0.7363996043521266, + "grad_norm": 0.8312527404300895, + "learning_rate": 4.827026008460823e-06, + "loss": 0.4016, + "step": 2978 + }, + { + "epoch": 0.7366468842729971, + "grad_norm": 0.7910678678346987, + "learning_rate": 4.826907190209136e-06, + "loss": 0.427, + "step": 2979 + }, + { + "epoch": 0.7368941641938674, + "grad_norm": 0.8081223109051165, + "learning_rate": 4.82678833262589e-06, + "loss": 0.3933, + "step": 2980 + }, + { + "epoch": 0.7371414441147379, + "grad_norm": 0.8047433161446556, + "learning_rate": 4.826669435713096e-06, + "loss": 0.4079, + "step": 2981 + }, + { + "epoch": 0.7373887240356083, + "grad_norm": 0.7999842935873025, + "learning_rate": 4.826550499472761e-06, + "loss": 0.4615, + "step": 2982 + }, + { + "epoch": 0.7376360039564788, + "grad_norm": 0.792716091172535, + "learning_rate": 4.826431523906898e-06, + "loss": 0.391, + "step": 2983 + }, + { + "epoch": 0.7378832838773491, + "grad_norm": 0.8303626433497433, + "learning_rate": 4.826312509017517e-06, + "loss": 0.404, + "step": 2984 + }, + { + "epoch": 0.7381305637982196, + "grad_norm": 0.8374450240264267, + "learning_rate": 4.826193454806629e-06, + "loss": 0.4143, + "step": 2985 + }, + { + "epoch": 0.73837784371909, + "grad_norm": 0.7785793171820266, + "learning_rate": 4.826074361276247e-06, + "loss": 0.4179, + "step": 2986 + }, + { + "epoch": 0.7386251236399605, + "grad_norm": 0.7920577899878737, + "learning_rate": 4.825955228428385e-06, + "loss": 0.4276, + "step": 2987 + }, + { + "epoch": 0.7388724035608308, + "grad_norm": 0.8200588077966792, + "learning_rate": 4.825836056265055e-06, + "loss": 0.4317, + "step": 2988 + }, + { + "epoch": 0.7391196834817013, + "grad_norm": 0.8116664468428142, + "learning_rate": 4.8257168447882725e-06, + "loss": 0.4042, + "step": 2989 + }, + { + "epoch": 0.7393669634025717, + "grad_norm": 0.776120292987068, + "learning_rate": 4.825597594000052e-06, + "loss": 0.4418, + "step": 2990 + }, + { + "epoch": 0.7396142433234422, + "grad_norm": 0.8081847545003181, + "learning_rate": 4.825478303902409e-06, + "loss": 0.412, + "step": 2991 + }, + { + "epoch": 0.7398615232443125, + "grad_norm": 0.7950133188013835, + "learning_rate": 4.825358974497361e-06, + "loss": 0.4246, + "step": 2992 + }, + { + "epoch": 0.740108803165183, + "grad_norm": 0.7993263391124805, + "learning_rate": 4.825239605786924e-06, + "loss": 0.4061, + "step": 2993 + }, + { + "epoch": 0.7403560830860534, + "grad_norm": 0.7997981436722923, + "learning_rate": 4.825120197773114e-06, + "loss": 0.4563, + "step": 2994 + }, + { + "epoch": 0.7406033630069239, + "grad_norm": 0.7705130115086049, + "learning_rate": 4.825000750457953e-06, + "loss": 0.4198, + "step": 2995 + }, + { + "epoch": 0.7408506429277942, + "grad_norm": 0.8002552475075952, + "learning_rate": 4.824881263843458e-06, + "loss": 0.4432, + "step": 2996 + }, + { + "epoch": 0.7410979228486647, + "grad_norm": 0.8018310738296338, + "learning_rate": 4.824761737931649e-06, + "loss": 0.4402, + "step": 2997 + }, + { + "epoch": 0.7413452027695351, + "grad_norm": 0.8593903578168341, + "learning_rate": 4.8246421727245465e-06, + "loss": 0.4054, + "step": 2998 + }, + { + "epoch": 0.7415924826904056, + "grad_norm": 0.7889330438300733, + "learning_rate": 4.8245225682241705e-06, + "loss": 0.4422, + "step": 2999 + }, + { + "epoch": 0.7418397626112759, + "grad_norm": 0.7938169220464205, + "learning_rate": 4.824402924432543e-06, + "loss": 0.441, + "step": 3000 + }, + { + "epoch": 0.7420870425321464, + "grad_norm": 0.7844611533247873, + "learning_rate": 4.8242832413516874e-06, + "loss": 0.4336, + "step": 3001 + }, + { + "epoch": 0.7423343224530168, + "grad_norm": 0.7749679668683314, + "learning_rate": 4.824163518983627e-06, + "loss": 0.4142, + "step": 3002 + }, + { + "epoch": 0.7425816023738873, + "grad_norm": 0.8197901469393682, + "learning_rate": 4.824043757330384e-06, + "loss": 0.4245, + "step": 3003 + }, + { + "epoch": 0.7428288822947576, + "grad_norm": 0.8261013183178937, + "learning_rate": 4.823923956393982e-06, + "loss": 0.4037, + "step": 3004 + }, + { + "epoch": 0.7430761622156281, + "grad_norm": 0.7558923203936677, + "learning_rate": 4.8238041161764475e-06, + "loss": 0.4175, + "step": 3005 + }, + { + "epoch": 0.7433234421364985, + "grad_norm": 0.8320092925247135, + "learning_rate": 4.823684236679807e-06, + "loss": 0.4256, + "step": 3006 + }, + { + "epoch": 0.743570722057369, + "grad_norm": 0.8217439102346995, + "learning_rate": 4.823564317906085e-06, + "loss": 0.427, + "step": 3007 + }, + { + "epoch": 0.7438180019782393, + "grad_norm": 0.8199819751186989, + "learning_rate": 4.823444359857308e-06, + "loss": 0.382, + "step": 3008 + }, + { + "epoch": 0.7440652818991098, + "grad_norm": 0.817959773474465, + "learning_rate": 4.823324362535506e-06, + "loss": 0.4329, + "step": 3009 + }, + { + "epoch": 0.7443125618199802, + "grad_norm": 0.8187584290387997, + "learning_rate": 4.823204325942706e-06, + "loss": 0.4485, + "step": 3010 + }, + { + "epoch": 0.7445598417408507, + "grad_norm": 0.7898625944514868, + "learning_rate": 4.823084250080937e-06, + "loss": 0.4069, + "step": 3011 + }, + { + "epoch": 0.744807121661721, + "grad_norm": 0.7940276326242832, + "learning_rate": 4.822964134952229e-06, + "loss": 0.4197, + "step": 3012 + }, + { + "epoch": 0.7450544015825915, + "grad_norm": 0.7927876642320121, + "learning_rate": 4.822843980558611e-06, + "loss": 0.4299, + "step": 3013 + }, + { + "epoch": 0.7453016815034619, + "grad_norm": 0.7994978248822859, + "learning_rate": 4.8227237869021165e-06, + "loss": 0.4234, + "step": 3014 + }, + { + "epoch": 0.7455489614243324, + "grad_norm": 0.7543327971185407, + "learning_rate": 4.822603553984775e-06, + "loss": 0.4427, + "step": 3015 + }, + { + "epoch": 0.7457962413452027, + "grad_norm": 0.8259426315239943, + "learning_rate": 4.822483281808619e-06, + "loss": 0.4323, + "step": 3016 + }, + { + "epoch": 0.7460435212660732, + "grad_norm": 0.7824668111245527, + "learning_rate": 4.822362970375682e-06, + "loss": 0.4243, + "step": 3017 + }, + { + "epoch": 0.7462908011869436, + "grad_norm": 0.8090132078040282, + "learning_rate": 4.822242619687997e-06, + "loss": 0.4681, + "step": 3018 + }, + { + "epoch": 0.746538081107814, + "grad_norm": 0.7841620643031427, + "learning_rate": 4.8221222297476e-06, + "loss": 0.4296, + "step": 3019 + }, + { + "epoch": 0.7467853610286844, + "grad_norm": 0.7995190592508252, + "learning_rate": 4.822001800556523e-06, + "loss": 0.4018, + "step": 3020 + }, + { + "epoch": 0.7470326409495549, + "grad_norm": 0.7860214365572894, + "learning_rate": 4.821881332116804e-06, + "loss": 0.4409, + "step": 3021 + }, + { + "epoch": 0.7472799208704253, + "grad_norm": 0.8040174870471135, + "learning_rate": 4.8217608244304794e-06, + "loss": 0.4187, + "step": 3022 + }, + { + "epoch": 0.7475272007912958, + "grad_norm": 0.8042489812626473, + "learning_rate": 4.821640277499584e-06, + "loss": 0.4127, + "step": 3023 + }, + { + "epoch": 0.7477744807121661, + "grad_norm": 0.834905466029092, + "learning_rate": 4.8215196913261575e-06, + "loss": 0.4198, + "step": 3024 + }, + { + "epoch": 0.7480217606330366, + "grad_norm": 0.7667416755370126, + "learning_rate": 4.821399065912237e-06, + "loss": 0.4247, + "step": 3025 + }, + { + "epoch": 0.748269040553907, + "grad_norm": 0.7910830801831669, + "learning_rate": 4.821278401259861e-06, + "loss": 0.4077, + "step": 3026 + }, + { + "epoch": 0.7485163204747775, + "grad_norm": 0.7382810269836959, + "learning_rate": 4.8211576973710714e-06, + "loss": 0.4525, + "step": 3027 + }, + { + "epoch": 0.7487636003956478, + "grad_norm": 0.8121105617204082, + "learning_rate": 4.8210369542479055e-06, + "loss": 0.4052, + "step": 3028 + }, + { + "epoch": 0.7490108803165183, + "grad_norm": 0.8058548126440102, + "learning_rate": 4.820916171892407e-06, + "loss": 0.4351, + "step": 3029 + }, + { + "epoch": 0.7492581602373887, + "grad_norm": 0.8577264933262061, + "learning_rate": 4.820795350306615e-06, + "loss": 0.3981, + "step": 3030 + }, + { + "epoch": 0.7495054401582592, + "grad_norm": 0.7453444539845234, + "learning_rate": 4.820674489492573e-06, + "loss": 0.4223, + "step": 3031 + }, + { + "epoch": 0.7497527200791295, + "grad_norm": 0.8466748330823123, + "learning_rate": 4.820553589452323e-06, + "loss": 0.3995, + "step": 3032 + }, + { + "epoch": 0.75, + "grad_norm": 0.7790821202863654, + "learning_rate": 4.820432650187911e-06, + "loss": 0.404, + "step": 3033 + }, + { + "epoch": 0.7502472799208705, + "grad_norm": 0.8153973710234346, + "learning_rate": 4.820311671701379e-06, + "loss": 0.4276, + "step": 3034 + }, + { + "epoch": 0.7504945598417408, + "grad_norm": 0.7753722323128266, + "learning_rate": 4.8201906539947715e-06, + "loss": 0.4306, + "step": 3035 + }, + { + "epoch": 0.7507418397626113, + "grad_norm": 0.7744896340611823, + "learning_rate": 4.8200695970701356e-06, + "loss": 0.4356, + "step": 3036 + }, + { + "epoch": 0.7509891196834817, + "grad_norm": 0.7690189042676627, + "learning_rate": 4.8199485009295166e-06, + "loss": 0.44, + "step": 3037 + }, + { + "epoch": 0.7512363996043522, + "grad_norm": 0.8033947179211088, + "learning_rate": 4.819827365574963e-06, + "loss": 0.4139, + "step": 3038 + }, + { + "epoch": 0.7514836795252225, + "grad_norm": 0.7576932851777864, + "learning_rate": 4.819706191008519e-06, + "loss": 0.4081, + "step": 3039 + }, + { + "epoch": 0.751730959446093, + "grad_norm": 0.7967928320460952, + "learning_rate": 4.819584977232236e-06, + "loss": 0.4257, + "step": 3040 + }, + { + "epoch": 0.7519782393669634, + "grad_norm": 0.8392913645000412, + "learning_rate": 4.8194637242481615e-06, + "loss": 0.4197, + "step": 3041 + }, + { + "epoch": 0.7522255192878339, + "grad_norm": 0.7730924432116476, + "learning_rate": 4.819342432058345e-06, + "loss": 0.4321, + "step": 3042 + }, + { + "epoch": 0.7524727992087042, + "grad_norm": 0.8068109563132863, + "learning_rate": 4.819221100664836e-06, + "loss": 0.3994, + "step": 3043 + }, + { + "epoch": 0.7527200791295747, + "grad_norm": 0.8260660558085546, + "learning_rate": 4.819099730069688e-06, + "loss": 0.4048, + "step": 3044 + }, + { + "epoch": 0.7529673590504451, + "grad_norm": 0.770330180817139, + "learning_rate": 4.8189783202749495e-06, + "loss": 0.4362, + "step": 3045 + }, + { + "epoch": 0.7532146389713156, + "grad_norm": 0.831087174442937, + "learning_rate": 4.818856871282674e-06, + "loss": 0.402, + "step": 3046 + }, + { + "epoch": 0.753461918892186, + "grad_norm": 0.8055486831247699, + "learning_rate": 4.818735383094915e-06, + "loss": 0.4169, + "step": 3047 + }, + { + "epoch": 0.7537091988130564, + "grad_norm": 0.8349348167930836, + "learning_rate": 4.818613855713725e-06, + "loss": 0.4063, + "step": 3048 + }, + { + "epoch": 0.7539564787339268, + "grad_norm": 0.7832541815851745, + "learning_rate": 4.818492289141159e-06, + "loss": 0.42, + "step": 3049 + }, + { + "epoch": 0.7542037586547973, + "grad_norm": 0.8058628554987075, + "learning_rate": 4.818370683379271e-06, + "loss": 0.4038, + "step": 3050 + }, + { + "epoch": 0.7544510385756676, + "grad_norm": 0.7869318129819252, + "learning_rate": 4.818249038430117e-06, + "loss": 0.4022, + "step": 3051 + }, + { + "epoch": 0.7546983184965381, + "grad_norm": 0.7429522741807416, + "learning_rate": 4.818127354295752e-06, + "loss": 0.433, + "step": 3052 + }, + { + "epoch": 0.7549455984174085, + "grad_norm": 0.8058306975517127, + "learning_rate": 4.818005630978235e-06, + "loss": 0.4429, + "step": 3053 + }, + { + "epoch": 0.755192878338279, + "grad_norm": 0.8114294962955277, + "learning_rate": 4.817883868479622e-06, + "loss": 0.4374, + "step": 3054 + }, + { + "epoch": 0.7554401582591493, + "grad_norm": 0.7666783664854765, + "learning_rate": 4.817762066801971e-06, + "loss": 0.4114, + "step": 3055 + }, + { + "epoch": 0.7556874381800198, + "grad_norm": 0.8336654948850036, + "learning_rate": 4.817640225947341e-06, + "loss": 0.3976, + "step": 3056 + }, + { + "epoch": 0.7559347181008902, + "grad_norm": 0.8036876409711966, + "learning_rate": 4.817518345917792e-06, + "loss": 0.4335, + "step": 3057 + }, + { + "epoch": 0.7561819980217607, + "grad_norm": 0.8720285437742171, + "learning_rate": 4.817396426715384e-06, + "loss": 0.4038, + "step": 3058 + }, + { + "epoch": 0.756429277942631, + "grad_norm": 0.8182607660403057, + "learning_rate": 4.8172744683421765e-06, + "loss": 0.427, + "step": 3059 + }, + { + "epoch": 0.7566765578635015, + "grad_norm": 0.8027583929885022, + "learning_rate": 4.8171524708002335e-06, + "loss": 0.4123, + "step": 3060 + }, + { + "epoch": 0.7569238377843719, + "grad_norm": 0.8232173834519446, + "learning_rate": 4.817030434091615e-06, + "loss": 0.413, + "step": 3061 + }, + { + "epoch": 0.7571711177052424, + "grad_norm": 0.805678272221054, + "learning_rate": 4.816908358218384e-06, + "loss": 0.4007, + "step": 3062 + }, + { + "epoch": 0.7574183976261127, + "grad_norm": 0.7726560827200997, + "learning_rate": 4.8167862431826054e-06, + "loss": 0.3941, + "step": 3063 + }, + { + "epoch": 0.7576656775469832, + "grad_norm": 0.8068589530095359, + "learning_rate": 4.816664088986342e-06, + "loss": 0.4074, + "step": 3064 + }, + { + "epoch": 0.7579129574678536, + "grad_norm": 0.8193830699266273, + "learning_rate": 4.816541895631659e-06, + "loss": 0.4048, + "step": 3065 + }, + { + "epoch": 0.7581602373887241, + "grad_norm": 0.802439000408714, + "learning_rate": 4.816419663120621e-06, + "loss": 0.4228, + "step": 3066 + }, + { + "epoch": 0.7584075173095944, + "grad_norm": 0.7831520315560794, + "learning_rate": 4.816297391455296e-06, + "loss": 0.4416, + "step": 3067 + }, + { + "epoch": 0.7586547972304649, + "grad_norm": 0.8013772553913207, + "learning_rate": 4.816175080637748e-06, + "loss": 0.4063, + "step": 3068 + }, + { + "epoch": 0.7589020771513353, + "grad_norm": 0.8069423659312832, + "learning_rate": 4.816052730670047e-06, + "loss": 0.4101, + "step": 3069 + }, + { + "epoch": 0.7591493570722058, + "grad_norm": 0.8042760358148239, + "learning_rate": 4.815930341554259e-06, + "loss": 0.437, + "step": 3070 + }, + { + "epoch": 0.7593966369930761, + "grad_norm": 0.8528508748773457, + "learning_rate": 4.815807913292454e-06, + "loss": 0.4203, + "step": 3071 + }, + { + "epoch": 0.7596439169139466, + "grad_norm": 0.7788038832278472, + "learning_rate": 4.815685445886702e-06, + "loss": 0.4122, + "step": 3072 + }, + { + "epoch": 0.759891196834817, + "grad_norm": 0.7895557689997091, + "learning_rate": 4.815562939339072e-06, + "loss": 0.439, + "step": 3073 + }, + { + "epoch": 0.7601384767556875, + "grad_norm": 0.7665727269321336, + "learning_rate": 4.815440393651635e-06, + "loss": 0.4272, + "step": 3074 + }, + { + "epoch": 0.7603857566765578, + "grad_norm": 0.8193909964159897, + "learning_rate": 4.815317808826462e-06, + "loss": 0.3975, + "step": 3075 + }, + { + "epoch": 0.7606330365974283, + "grad_norm": 0.7904789376121926, + "learning_rate": 4.815195184865625e-06, + "loss": 0.4269, + "step": 3076 + }, + { + "epoch": 0.7608803165182987, + "grad_norm": 0.8179670797014685, + "learning_rate": 4.815072521771197e-06, + "loss": 0.3968, + "step": 3077 + }, + { + "epoch": 0.7611275964391692, + "grad_norm": 0.7733754206107383, + "learning_rate": 4.814949819545252e-06, + "loss": 0.4223, + "step": 3078 + }, + { + "epoch": 0.7613748763600395, + "grad_norm": 0.7996203606510405, + "learning_rate": 4.8148270781898635e-06, + "loss": 0.4292, + "step": 3079 + }, + { + "epoch": 0.76162215628091, + "grad_norm": 0.8190133067708448, + "learning_rate": 4.814704297707105e-06, + "loss": 0.4145, + "step": 3080 + }, + { + "epoch": 0.7618694362017804, + "grad_norm": 0.7937567763174175, + "learning_rate": 4.814581478099054e-06, + "loss": 0.4103, + "step": 3081 + }, + { + "epoch": 0.7621167161226509, + "grad_norm": 0.7713599014763641, + "learning_rate": 4.814458619367785e-06, + "loss": 0.4263, + "step": 3082 + }, + { + "epoch": 0.7623639960435212, + "grad_norm": 0.819891250654944, + "learning_rate": 4.814335721515376e-06, + "loss": 0.4097, + "step": 3083 + }, + { + "epoch": 0.7626112759643917, + "grad_norm": 0.7885235017605721, + "learning_rate": 4.814212784543902e-06, + "loss": 0.449, + "step": 3084 + }, + { + "epoch": 0.7628585558852621, + "grad_norm": 0.7893477319871918, + "learning_rate": 4.814089808455444e-06, + "loss": 0.4257, + "step": 3085 + }, + { + "epoch": 0.7631058358061326, + "grad_norm": 0.7928417601407314, + "learning_rate": 4.813966793252079e-06, + "loss": 0.4282, + "step": 3086 + }, + { + "epoch": 0.7633531157270029, + "grad_norm": 0.7930496478110192, + "learning_rate": 4.813843738935886e-06, + "loss": 0.4129, + "step": 3087 + }, + { + "epoch": 0.7636003956478734, + "grad_norm": 0.7492915045783554, + "learning_rate": 4.813720645508946e-06, + "loss": 0.424, + "step": 3088 + }, + { + "epoch": 0.7638476755687438, + "grad_norm": 0.7939923318098281, + "learning_rate": 4.8135975129733385e-06, + "loss": 0.4203, + "step": 3089 + }, + { + "epoch": 0.7640949554896143, + "grad_norm": 0.8054567000276287, + "learning_rate": 4.813474341331145e-06, + "loss": 0.4082, + "step": 3090 + }, + { + "epoch": 0.7643422354104846, + "grad_norm": 0.7615495246999505, + "learning_rate": 4.813351130584448e-06, + "loss": 0.4363, + "step": 3091 + }, + { + "epoch": 0.7645895153313551, + "grad_norm": 0.7487931230093278, + "learning_rate": 4.813227880735331e-06, + "loss": 0.4208, + "step": 3092 + }, + { + "epoch": 0.7648367952522255, + "grad_norm": 0.7989302908264841, + "learning_rate": 4.8131045917858754e-06, + "loss": 0.4568, + "step": 3093 + }, + { + "epoch": 0.765084075173096, + "grad_norm": 0.8150902813166511, + "learning_rate": 4.812981263738165e-06, + "loss": 0.4299, + "step": 3094 + }, + { + "epoch": 0.7653313550939663, + "grad_norm": 0.795813383219662, + "learning_rate": 4.8128578965942875e-06, + "loss": 0.444, + "step": 3095 + }, + { + "epoch": 0.7655786350148368, + "grad_norm": 0.768998047077691, + "learning_rate": 4.812734490356326e-06, + "loss": 0.411, + "step": 3096 + }, + { + "epoch": 0.7658259149357072, + "grad_norm": 0.7876009500354682, + "learning_rate": 4.812611045026365e-06, + "loss": 0.4065, + "step": 3097 + }, + { + "epoch": 0.7660731948565777, + "grad_norm": 0.8228181027521226, + "learning_rate": 4.812487560606493e-06, + "loss": 0.4257, + "step": 3098 + }, + { + "epoch": 0.766320474777448, + "grad_norm": 0.7899654098672674, + "learning_rate": 4.812364037098798e-06, + "loss": 0.4281, + "step": 3099 + }, + { + "epoch": 0.7665677546983185, + "grad_norm": 0.8263770103922008, + "learning_rate": 4.812240474505366e-06, + "loss": 0.4172, + "step": 3100 + }, + { + "epoch": 0.7668150346191889, + "grad_norm": 0.8223565750218941, + "learning_rate": 4.812116872828285e-06, + "loss": 0.4094, + "step": 3101 + }, + { + "epoch": 0.7670623145400594, + "grad_norm": 0.7882256242313526, + "learning_rate": 4.811993232069647e-06, + "loss": 0.4405, + "step": 3102 + }, + { + "epoch": 0.7673095944609297, + "grad_norm": 0.7673647038001873, + "learning_rate": 4.81186955223154e-06, + "loss": 0.4043, + "step": 3103 + }, + { + "epoch": 0.7675568743818002, + "grad_norm": 0.7748167909851514, + "learning_rate": 4.811745833316056e-06, + "loss": 0.4149, + "step": 3104 + }, + { + "epoch": 0.7678041543026706, + "grad_norm": 0.7804500490847732, + "learning_rate": 4.811622075325284e-06, + "loss": 0.4149, + "step": 3105 + }, + { + "epoch": 0.768051434223541, + "grad_norm": 0.7692586391443403, + "learning_rate": 4.811498278261318e-06, + "loss": 0.4366, + "step": 3106 + }, + { + "epoch": 0.7682987141444114, + "grad_norm": 0.7842598306966564, + "learning_rate": 4.811374442126248e-06, + "loss": 0.4212, + "step": 3107 + }, + { + "epoch": 0.7685459940652819, + "grad_norm": 0.7708815469931595, + "learning_rate": 4.8112505669221695e-06, + "loss": 0.4464, + "step": 3108 + }, + { + "epoch": 0.7687932739861523, + "grad_norm": 0.8253602424248783, + "learning_rate": 4.811126652651177e-06, + "loss": 0.4068, + "step": 3109 + }, + { + "epoch": 0.7690405539070228, + "grad_norm": 0.8309488561255076, + "learning_rate": 4.811002699315362e-06, + "loss": 0.4139, + "step": 3110 + }, + { + "epoch": 0.7692878338278932, + "grad_norm": 0.8085658208306157, + "learning_rate": 4.810878706916823e-06, + "loss": 0.4377, + "step": 3111 + }, + { + "epoch": 0.7695351137487636, + "grad_norm": 0.7741149699878886, + "learning_rate": 4.8107546754576525e-06, + "loss": 0.4274, + "step": 3112 + }, + { + "epoch": 0.7697823936696341, + "grad_norm": 0.7663408982316708, + "learning_rate": 4.81063060493995e-06, + "loss": 0.4153, + "step": 3113 + }, + { + "epoch": 0.7700296735905044, + "grad_norm": 0.8112801558552337, + "learning_rate": 4.81050649536581e-06, + "loss": 0.4095, + "step": 3114 + }, + { + "epoch": 0.7702769535113749, + "grad_norm": 0.775543508476325, + "learning_rate": 4.810382346737333e-06, + "loss": 0.4495, + "step": 3115 + }, + { + "epoch": 0.7705242334322453, + "grad_norm": 0.7901145505532202, + "learning_rate": 4.8102581590566156e-06, + "loss": 0.4322, + "step": 3116 + }, + { + "epoch": 0.7707715133531158, + "grad_norm": 0.7821623933520957, + "learning_rate": 4.810133932325758e-06, + "loss": 0.4069, + "step": 3117 + }, + { + "epoch": 0.7710187932739861, + "grad_norm": 0.8265570200176376, + "learning_rate": 4.810009666546858e-06, + "loss": 0.402, + "step": 3118 + }, + { + "epoch": 0.7712660731948566, + "grad_norm": 0.8154505625826904, + "learning_rate": 4.8098853617220186e-06, + "loss": 0.4606, + "step": 3119 + }, + { + "epoch": 0.771513353115727, + "grad_norm": 0.8235703743873171, + "learning_rate": 4.8097610178533396e-06, + "loss": 0.4021, + "step": 3120 + }, + { + "epoch": 0.7717606330365975, + "grad_norm": 0.7884753518627691, + "learning_rate": 4.809636634942923e-06, + "loss": 0.4118, + "step": 3121 + }, + { + "epoch": 0.7720079129574678, + "grad_norm": 0.8084593721515126, + "learning_rate": 4.809512212992872e-06, + "loss": 0.431, + "step": 3122 + }, + { + "epoch": 0.7722551928783383, + "grad_norm": 0.779666053910998, + "learning_rate": 4.809387752005288e-06, + "loss": 0.4187, + "step": 3123 + }, + { + "epoch": 0.7725024727992087, + "grad_norm": 0.82077016879112, + "learning_rate": 4.809263251982276e-06, + "loss": 0.4131, + "step": 3124 + }, + { + "epoch": 0.7727497527200792, + "grad_norm": 0.8032424551504816, + "learning_rate": 4.80913871292594e-06, + "loss": 0.3919, + "step": 3125 + }, + { + "epoch": 0.7729970326409495, + "grad_norm": 0.7832784986154248, + "learning_rate": 4.8090141348383854e-06, + "loss": 0.4286, + "step": 3126 + }, + { + "epoch": 0.77324431256182, + "grad_norm": 0.8535658999730766, + "learning_rate": 4.808889517721718e-06, + "loss": 0.4152, + "step": 3127 + }, + { + "epoch": 0.7734915924826904, + "grad_norm": 0.8068467344043826, + "learning_rate": 4.808764861578043e-06, + "loss": 0.4368, + "step": 3128 + }, + { + "epoch": 0.7737388724035609, + "grad_norm": 0.8411797207814421, + "learning_rate": 4.808640166409469e-06, + "loss": 0.4149, + "step": 3129 + }, + { + "epoch": 0.7739861523244312, + "grad_norm": 0.797745138292174, + "learning_rate": 4.808515432218102e-06, + "loss": 0.4046, + "step": 3130 + }, + { + "epoch": 0.7742334322453017, + "grad_norm": 0.7810552927361032, + "learning_rate": 4.808390659006053e-06, + "loss": 0.4298, + "step": 3131 + }, + { + "epoch": 0.7744807121661721, + "grad_norm": 0.7977546221024441, + "learning_rate": 4.808265846775429e-06, + "loss": 0.4287, + "step": 3132 + }, + { + "epoch": 0.7747279920870426, + "grad_norm": 0.7827566413934264, + "learning_rate": 4.8081409955283405e-06, + "loss": 0.4265, + "step": 3133 + }, + { + "epoch": 0.7749752720079129, + "grad_norm": 0.7900617509765616, + "learning_rate": 4.808016105266897e-06, + "loss": 0.4154, + "step": 3134 + }, + { + "epoch": 0.7752225519287834, + "grad_norm": 0.7925920557238082, + "learning_rate": 4.80789117599321e-06, + "loss": 0.4282, + "step": 3135 + }, + { + "epoch": 0.7754698318496538, + "grad_norm": 0.8133811417479949, + "learning_rate": 4.807766207709392e-06, + "loss": 0.4471, + "step": 3136 + }, + { + "epoch": 0.7757171117705243, + "grad_norm": 0.7828796782998949, + "learning_rate": 4.807641200417554e-06, + "loss": 0.4132, + "step": 3137 + }, + { + "epoch": 0.7759643916913946, + "grad_norm": 0.8190662126335538, + "learning_rate": 4.807516154119809e-06, + "loss": 0.4192, + "step": 3138 + }, + { + "epoch": 0.7762116716122651, + "grad_norm": 0.7894618621921312, + "learning_rate": 4.807391068818272e-06, + "loss": 0.4287, + "step": 3139 + }, + { + "epoch": 0.7764589515331355, + "grad_norm": 0.7990613399601455, + "learning_rate": 4.807265944515056e-06, + "loss": 0.4185, + "step": 3140 + }, + { + "epoch": 0.776706231454006, + "grad_norm": 0.7940117259476284, + "learning_rate": 4.807140781212277e-06, + "loss": 0.4296, + "step": 3141 + }, + { + "epoch": 0.7769535113748763, + "grad_norm": 0.8090210021535224, + "learning_rate": 4.80701557891205e-06, + "loss": 0.4166, + "step": 3142 + }, + { + "epoch": 0.7772007912957468, + "grad_norm": 0.798970265827542, + "learning_rate": 4.806890337616491e-06, + "loss": 0.4122, + "step": 3143 + }, + { + "epoch": 0.7774480712166172, + "grad_norm": 0.8108410945027389, + "learning_rate": 4.806765057327718e-06, + "loss": 0.4216, + "step": 3144 + }, + { + "epoch": 0.7776953511374877, + "grad_norm": 0.7695340798568441, + "learning_rate": 4.806639738047847e-06, + "loss": 0.4478, + "step": 3145 + }, + { + "epoch": 0.777942631058358, + "grad_norm": 0.8425189435774728, + "learning_rate": 4.806514379778998e-06, + "loss": 0.3915, + "step": 3146 + }, + { + "epoch": 0.7781899109792285, + "grad_norm": 0.7909871752665145, + "learning_rate": 4.806388982523289e-06, + "loss": 0.4145, + "step": 3147 + }, + { + "epoch": 0.7784371909000989, + "grad_norm": 0.8395535395280559, + "learning_rate": 4.806263546282839e-06, + "loss": 0.4229, + "step": 3148 + }, + { + "epoch": 0.7786844708209694, + "grad_norm": 0.7946048118578783, + "learning_rate": 4.806138071059769e-06, + "loss": 0.416, + "step": 3149 + }, + { + "epoch": 0.7789317507418397, + "grad_norm": 0.7696555887999623, + "learning_rate": 4.806012556856201e-06, + "loss": 0.3952, + "step": 3150 + }, + { + "epoch": 0.7791790306627102, + "grad_norm": 0.8071199984857714, + "learning_rate": 4.805887003674255e-06, + "loss": 0.415, + "step": 3151 + }, + { + "epoch": 0.7794263105835806, + "grad_norm": 0.7989656420229575, + "learning_rate": 4.805761411516054e-06, + "loss": 0.403, + "step": 3152 + }, + { + "epoch": 0.7796735905044511, + "grad_norm": 0.8035327928385132, + "learning_rate": 4.805635780383719e-06, + "loss": 0.387, + "step": 3153 + }, + { + "epoch": 0.7799208704253214, + "grad_norm": 0.8139828219701931, + "learning_rate": 4.805510110279376e-06, + "loss": 0.4157, + "step": 3154 + }, + { + "epoch": 0.7801681503461919, + "grad_norm": 0.8595116770693756, + "learning_rate": 4.805384401205147e-06, + "loss": 0.42, + "step": 3155 + }, + { + "epoch": 0.7804154302670623, + "grad_norm": 0.7638842723959627, + "learning_rate": 4.80525865316316e-06, + "loss": 0.4054, + "step": 3156 + }, + { + "epoch": 0.7806627101879328, + "grad_norm": 0.7788059644498618, + "learning_rate": 4.805132866155538e-06, + "loss": 0.4065, + "step": 3157 + }, + { + "epoch": 0.7809099901088031, + "grad_norm": 0.7917946328097877, + "learning_rate": 4.805007040184407e-06, + "loss": 0.4034, + "step": 3158 + }, + { + "epoch": 0.7811572700296736, + "grad_norm": 0.794606043110883, + "learning_rate": 4.804881175251895e-06, + "loss": 0.4459, + "step": 3159 + }, + { + "epoch": 0.781404549950544, + "grad_norm": 0.7855478903283287, + "learning_rate": 4.804755271360129e-06, + "loss": 0.4043, + "step": 3160 + }, + { + "epoch": 0.7816518298714145, + "grad_norm": 0.8003629127364914, + "learning_rate": 4.804629328511238e-06, + "loss": 0.4148, + "step": 3161 + }, + { + "epoch": 0.7818991097922848, + "grad_norm": 0.7983155545329887, + "learning_rate": 4.804503346707349e-06, + "loss": 0.4337, + "step": 3162 + }, + { + "epoch": 0.7821463897131553, + "grad_norm": 0.8021511440132186, + "learning_rate": 4.804377325950593e-06, + "loss": 0.409, + "step": 3163 + }, + { + "epoch": 0.7823936696340257, + "grad_norm": 0.760133073100139, + "learning_rate": 4.804251266243099e-06, + "loss": 0.423, + "step": 3164 + }, + { + "epoch": 0.7826409495548962, + "grad_norm": 0.7848135629022405, + "learning_rate": 4.8041251675869996e-06, + "loss": 0.4158, + "step": 3165 + }, + { + "epoch": 0.7828882294757665, + "grad_norm": 0.7925730082390904, + "learning_rate": 4.803999029984423e-06, + "loss": 0.4211, + "step": 3166 + }, + { + "epoch": 0.783135509396637, + "grad_norm": 0.7764546739610668, + "learning_rate": 4.803872853437506e-06, + "loss": 0.4444, + "step": 3167 + }, + { + "epoch": 0.7833827893175074, + "grad_norm": 0.8499268393363937, + "learning_rate": 4.803746637948377e-06, + "loss": 0.4121, + "step": 3168 + }, + { + "epoch": 0.7836300692383779, + "grad_norm": 0.796267303570014, + "learning_rate": 4.803620383519171e-06, + "loss": 0.4579, + "step": 3169 + }, + { + "epoch": 0.7838773491592482, + "grad_norm": 0.8586483960136988, + "learning_rate": 4.803494090152022e-06, + "loss": 0.4167, + "step": 3170 + }, + { + "epoch": 0.7841246290801187, + "grad_norm": 0.7846703688927068, + "learning_rate": 4.803367757849065e-06, + "loss": 0.4256, + "step": 3171 + }, + { + "epoch": 0.7843719090009891, + "grad_norm": 0.8152792507339273, + "learning_rate": 4.803241386612436e-06, + "loss": 0.3683, + "step": 3172 + }, + { + "epoch": 0.7846191889218596, + "grad_norm": 0.7998726293610626, + "learning_rate": 4.8031149764442695e-06, + "loss": 0.4073, + "step": 3173 + }, + { + "epoch": 0.7848664688427299, + "grad_norm": 0.8425199760576926, + "learning_rate": 4.802988527346703e-06, + "loss": 0.4096, + "step": 3174 + }, + { + "epoch": 0.7851137487636004, + "grad_norm": 0.8087625745415404, + "learning_rate": 4.802862039321875e-06, + "loss": 0.4172, + "step": 3175 + }, + { + "epoch": 0.7853610286844708, + "grad_norm": 0.8103640303916235, + "learning_rate": 4.802735512371922e-06, + "loss": 0.4016, + "step": 3176 + }, + { + "epoch": 0.7856083086053413, + "grad_norm": 0.7885395510106586, + "learning_rate": 4.8026089464989825e-06, + "loss": 0.4076, + "step": 3177 + }, + { + "epoch": 0.7858555885262116, + "grad_norm": 0.8642951165136358, + "learning_rate": 4.802482341705197e-06, + "loss": 0.4139, + "step": 3178 + }, + { + "epoch": 0.7861028684470821, + "grad_norm": 0.7819223306835482, + "learning_rate": 4.8023556979927045e-06, + "loss": 0.4341, + "step": 3179 + }, + { + "epoch": 0.7863501483679525, + "grad_norm": 0.8193735193209689, + "learning_rate": 4.802229015363646e-06, + "loss": 0.4271, + "step": 3180 + }, + { + "epoch": 0.786597428288823, + "grad_norm": 0.7692047832916445, + "learning_rate": 4.802102293820162e-06, + "loss": 0.4266, + "step": 3181 + }, + { + "epoch": 0.7868447082096933, + "grad_norm": 0.7580998926928701, + "learning_rate": 4.801975533364397e-06, + "loss": 0.4232, + "step": 3182 + }, + { + "epoch": 0.7870919881305638, + "grad_norm": 0.7952151355924447, + "learning_rate": 4.801848733998491e-06, + "loss": 0.4287, + "step": 3183 + }, + { + "epoch": 0.7873392680514342, + "grad_norm": 0.768566684447632, + "learning_rate": 4.801721895724588e-06, + "loss": 0.4352, + "step": 3184 + }, + { + "epoch": 0.7875865479723047, + "grad_norm": 0.8392183142924099, + "learning_rate": 4.801595018544834e-06, + "loss": 0.4161, + "step": 3185 + }, + { + "epoch": 0.787833827893175, + "grad_norm": 0.8229971702026877, + "learning_rate": 4.80146810246137e-06, + "loss": 0.4035, + "step": 3186 + }, + { + "epoch": 0.7880811078140455, + "grad_norm": 0.7669879130914552, + "learning_rate": 4.801341147476343e-06, + "loss": 0.3952, + "step": 3187 + }, + { + "epoch": 0.7883283877349159, + "grad_norm": 0.7891804474395242, + "learning_rate": 4.801214153591899e-06, + "loss": 0.3975, + "step": 3188 + }, + { + "epoch": 0.7885756676557863, + "grad_norm": 0.7986833755586805, + "learning_rate": 4.801087120810185e-06, + "loss": 0.413, + "step": 3189 + }, + { + "epoch": 0.7888229475766568, + "grad_norm": 0.7952880723142738, + "learning_rate": 4.800960049133347e-06, + "loss": 0.4372, + "step": 3190 + }, + { + "epoch": 0.7890702274975272, + "grad_norm": 0.7757455316227299, + "learning_rate": 4.800832938563534e-06, + "loss": 0.4249, + "step": 3191 + }, + { + "epoch": 0.7893175074183977, + "grad_norm": 0.7949907790570365, + "learning_rate": 4.800705789102894e-06, + "loss": 0.431, + "step": 3192 + }, + { + "epoch": 0.789564787339268, + "grad_norm": 0.7868569621757431, + "learning_rate": 4.800578600753577e-06, + "loss": 0.424, + "step": 3193 + }, + { + "epoch": 0.7898120672601385, + "grad_norm": 0.7864424676820765, + "learning_rate": 4.800451373517732e-06, + "loss": 0.3849, + "step": 3194 + }, + { + "epoch": 0.7900593471810089, + "grad_norm": 0.784595550816719, + "learning_rate": 4.800324107397509e-06, + "loss": 0.4098, + "step": 3195 + }, + { + "epoch": 0.7903066271018794, + "grad_norm": 0.830043984168889, + "learning_rate": 4.800196802395061e-06, + "loss": 0.3888, + "step": 3196 + }, + { + "epoch": 0.7905539070227497, + "grad_norm": 0.790118097460834, + "learning_rate": 4.800069458512538e-06, + "loss": 0.4102, + "step": 3197 + }, + { + "epoch": 0.7908011869436202, + "grad_norm": 0.8227085580927859, + "learning_rate": 4.799942075752093e-06, + "loss": 0.4375, + "step": 3198 + }, + { + "epoch": 0.7910484668644906, + "grad_norm": 0.7982196753808344, + "learning_rate": 4.799814654115879e-06, + "loss": 0.4529, + "step": 3199 + }, + { + "epoch": 0.7912957467853611, + "grad_norm": 0.7866905634175901, + "learning_rate": 4.799687193606052e-06, + "loss": 0.4251, + "step": 3200 + }, + { + "epoch": 0.7915430267062314, + "grad_norm": 0.7685080791035871, + "learning_rate": 4.799559694224763e-06, + "loss": 0.4215, + "step": 3201 + }, + { + "epoch": 0.7917903066271019, + "grad_norm": 0.7599904496578714, + "learning_rate": 4.799432155974168e-06, + "loss": 0.4219, + "step": 3202 + }, + { + "epoch": 0.7920375865479723, + "grad_norm": 0.8263878111249411, + "learning_rate": 4.799304578856425e-06, + "loss": 0.4235, + "step": 3203 + }, + { + "epoch": 0.7922848664688428, + "grad_norm": 0.7741646735638096, + "learning_rate": 4.799176962873689e-06, + "loss": 0.4094, + "step": 3204 + }, + { + "epoch": 0.7925321463897131, + "grad_norm": 0.7842075482726589, + "learning_rate": 4.799049308028116e-06, + "loss": 0.4265, + "step": 3205 + }, + { + "epoch": 0.7927794263105836, + "grad_norm": 0.8011953412570326, + "learning_rate": 4.7989216143218655e-06, + "loss": 0.4101, + "step": 3206 + }, + { + "epoch": 0.793026706231454, + "grad_norm": 0.8305619346107188, + "learning_rate": 4.798793881757095e-06, + "loss": 0.4166, + "step": 3207 + }, + { + "epoch": 0.7932739861523245, + "grad_norm": 0.8065231582621645, + "learning_rate": 4.798666110335963e-06, + "loss": 0.4233, + "step": 3208 + }, + { + "epoch": 0.7935212660731948, + "grad_norm": 0.7874595373516532, + "learning_rate": 4.798538300060631e-06, + "loss": 0.4178, + "step": 3209 + }, + { + "epoch": 0.7937685459940653, + "grad_norm": 0.7907591698271821, + "learning_rate": 4.798410450933258e-06, + "loss": 0.4226, + "step": 3210 + }, + { + "epoch": 0.7940158259149357, + "grad_norm": 0.7625097668467142, + "learning_rate": 4.798282562956005e-06, + "loss": 0.415, + "step": 3211 + }, + { + "epoch": 0.7942631058358062, + "grad_norm": 0.7561667754981604, + "learning_rate": 4.798154636131033e-06, + "loss": 0.4355, + "step": 3212 + }, + { + "epoch": 0.7945103857566765, + "grad_norm": 0.7809583776002582, + "learning_rate": 4.7980266704605064e-06, + "loss": 0.4113, + "step": 3213 + }, + { + "epoch": 0.794757665677547, + "grad_norm": 0.784211514845161, + "learning_rate": 4.797898665946587e-06, + "loss": 0.4288, + "step": 3214 + }, + { + "epoch": 0.7950049455984174, + "grad_norm": 0.7780178015749081, + "learning_rate": 4.797770622591439e-06, + "loss": 0.4187, + "step": 3215 + }, + { + "epoch": 0.7952522255192879, + "grad_norm": 0.8068492708723259, + "learning_rate": 4.797642540397226e-06, + "loss": 0.4221, + "step": 3216 + }, + { + "epoch": 0.7954995054401582, + "grad_norm": 0.8164724169733689, + "learning_rate": 4.797514419366112e-06, + "loss": 0.4121, + "step": 3217 + }, + { + "epoch": 0.7957467853610287, + "grad_norm": 0.7890501236318611, + "learning_rate": 4.7973862595002655e-06, + "loss": 0.4102, + "step": 3218 + }, + { + "epoch": 0.7959940652818991, + "grad_norm": 0.8065399895842733, + "learning_rate": 4.79725806080185e-06, + "loss": 0.4579, + "step": 3219 + }, + { + "epoch": 0.7962413452027696, + "grad_norm": 0.7813246830312838, + "learning_rate": 4.797129823273035e-06, + "loss": 0.4026, + "step": 3220 + }, + { + "epoch": 0.7964886251236399, + "grad_norm": 0.7719196802952721, + "learning_rate": 4.797001546915985e-06, + "loss": 0.4604, + "step": 3221 + }, + { + "epoch": 0.7967359050445104, + "grad_norm": 0.7951454051368005, + "learning_rate": 4.796873231732871e-06, + "loss": 0.4071, + "step": 3222 + }, + { + "epoch": 0.7969831849653808, + "grad_norm": 0.8307970839902238, + "learning_rate": 4.796744877725861e-06, + "loss": 0.4101, + "step": 3223 + }, + { + "epoch": 0.7972304648862513, + "grad_norm": 0.7701120556585122, + "learning_rate": 4.796616484897123e-06, + "loss": 0.4492, + "step": 3224 + }, + { + "epoch": 0.7974777448071216, + "grad_norm": 0.7773449035476312, + "learning_rate": 4.79648805324883e-06, + "loss": 0.378, + "step": 3225 + }, + { + "epoch": 0.7977250247279921, + "grad_norm": 0.8036758045319646, + "learning_rate": 4.796359582783151e-06, + "loss": 0.4119, + "step": 3226 + }, + { + "epoch": 0.7979723046488625, + "grad_norm": 0.8288205702589608, + "learning_rate": 4.796231073502258e-06, + "loss": 0.4325, + "step": 3227 + }, + { + "epoch": 0.798219584569733, + "grad_norm": 0.8388175569444808, + "learning_rate": 4.796102525408323e-06, + "loss": 0.3907, + "step": 3228 + }, + { + "epoch": 0.7984668644906033, + "grad_norm": 0.7803655660453329, + "learning_rate": 4.795973938503518e-06, + "loss": 0.3959, + "step": 3229 + }, + { + "epoch": 0.7987141444114738, + "grad_norm": 0.7853139273322179, + "learning_rate": 4.79584531279002e-06, + "loss": 0.3897, + "step": 3230 + }, + { + "epoch": 0.7989614243323442, + "grad_norm": 0.7871116216692523, + "learning_rate": 4.7957166482699985e-06, + "loss": 0.4291, + "step": 3231 + }, + { + "epoch": 0.7992087042532147, + "grad_norm": 0.7671489638405405, + "learning_rate": 4.795587944945631e-06, + "loss": 0.4324, + "step": 3232 + }, + { + "epoch": 0.799455984174085, + "grad_norm": 0.8063052507791727, + "learning_rate": 4.795459202819093e-06, + "loss": 0.3814, + "step": 3233 + }, + { + "epoch": 0.7997032640949555, + "grad_norm": 0.7942943583781706, + "learning_rate": 4.795330421892559e-06, + "loss": 0.4134, + "step": 3234 + }, + { + "epoch": 0.7999505440158259, + "grad_norm": 0.8014136850289755, + "learning_rate": 4.795201602168208e-06, + "loss": 0.4275, + "step": 3235 + }, + { + "epoch": 0.8001978239366964, + "grad_norm": 0.793107629545374, + "learning_rate": 4.795072743648216e-06, + "loss": 0.4141, + "step": 3236 + }, + { + "epoch": 0.8004451038575667, + "grad_norm": 0.8333140544980959, + "learning_rate": 4.794943846334761e-06, + "loss": 0.4016, + "step": 3237 + }, + { + "epoch": 0.8006923837784372, + "grad_norm": 0.7937306545181533, + "learning_rate": 4.7948149102300214e-06, + "loss": 0.3894, + "step": 3238 + }, + { + "epoch": 0.8009396636993076, + "grad_norm": 0.7752640899581088, + "learning_rate": 4.794685935336178e-06, + "loss": 0.4351, + "step": 3239 + }, + { + "epoch": 0.8011869436201781, + "grad_norm": 0.792963683311261, + "learning_rate": 4.79455692165541e-06, + "loss": 0.4454, + "step": 3240 + }, + { + "epoch": 0.8014342235410484, + "grad_norm": 0.8299219831294848, + "learning_rate": 4.794427869189898e-06, + "loss": 0.3952, + "step": 3241 + }, + { + "epoch": 0.8016815034619189, + "grad_norm": 0.8157943140762436, + "learning_rate": 4.7942987779418245e-06, + "loss": 0.4332, + "step": 3242 + }, + { + "epoch": 0.8019287833827893, + "grad_norm": 0.7986078907851436, + "learning_rate": 4.79416964791337e-06, + "loss": 0.4322, + "step": 3243 + }, + { + "epoch": 0.8021760633036598, + "grad_norm": 0.786261870460221, + "learning_rate": 4.794040479106718e-06, + "loss": 0.4235, + "step": 3244 + }, + { + "epoch": 0.8024233432245301, + "grad_norm": 0.8283195848120665, + "learning_rate": 4.7939112715240515e-06, + "loss": 0.4319, + "step": 3245 + }, + { + "epoch": 0.8026706231454006, + "grad_norm": 0.8287242798951121, + "learning_rate": 4.793782025167555e-06, + "loss": 0.4509, + "step": 3246 + }, + { + "epoch": 0.802917903066271, + "grad_norm": 0.7817295347972962, + "learning_rate": 4.793652740039412e-06, + "loss": 0.405, + "step": 3247 + }, + { + "epoch": 0.8031651829871415, + "grad_norm": 0.7838023242726265, + "learning_rate": 4.79352341614181e-06, + "loss": 0.4224, + "step": 3248 + }, + { + "epoch": 0.8034124629080118, + "grad_norm": 0.7761306457370105, + "learning_rate": 4.793394053476932e-06, + "loss": 0.4359, + "step": 3249 + }, + { + "epoch": 0.8036597428288823, + "grad_norm": 0.8218425977051614, + "learning_rate": 4.793264652046967e-06, + "loss": 0.4233, + "step": 3250 + }, + { + "epoch": 0.8039070227497527, + "grad_norm": 0.7921841515228916, + "learning_rate": 4.7931352118541e-06, + "loss": 0.4177, + "step": 3251 + }, + { + "epoch": 0.8041543026706232, + "grad_norm": 0.8773579262315243, + "learning_rate": 4.793005732900522e-06, + "loss": 0.3918, + "step": 3252 + }, + { + "epoch": 0.8044015825914935, + "grad_norm": 0.829400566392459, + "learning_rate": 4.792876215188419e-06, + "loss": 0.4072, + "step": 3253 + }, + { + "epoch": 0.804648862512364, + "grad_norm": 0.7823791901433861, + "learning_rate": 4.792746658719982e-06, + "loss": 0.4101, + "step": 3254 + }, + { + "epoch": 0.8048961424332344, + "grad_norm": 0.7881281107043909, + "learning_rate": 4.792617063497399e-06, + "loss": 0.4145, + "step": 3255 + }, + { + "epoch": 0.8051434223541049, + "grad_norm": 0.8476540235058267, + "learning_rate": 4.792487429522862e-06, + "loss": 0.4036, + "step": 3256 + }, + { + "epoch": 0.8053907022749752, + "grad_norm": 0.8152889870566801, + "learning_rate": 4.792357756798561e-06, + "loss": 0.418, + "step": 3257 + }, + { + "epoch": 0.8056379821958457, + "grad_norm": 0.8076974577039581, + "learning_rate": 4.79222804532669e-06, + "loss": 0.4021, + "step": 3258 + }, + { + "epoch": 0.8058852621167161, + "grad_norm": 0.8058805425863745, + "learning_rate": 4.792098295109439e-06, + "loss": 0.3917, + "step": 3259 + }, + { + "epoch": 0.8061325420375866, + "grad_norm": 0.8234389705832926, + "learning_rate": 4.791968506149003e-06, + "loss": 0.4173, + "step": 3260 + }, + { + "epoch": 0.8063798219584569, + "grad_norm": 0.7886416606971659, + "learning_rate": 4.791838678447574e-06, + "loss": 0.405, + "step": 3261 + }, + { + "epoch": 0.8066271018793274, + "grad_norm": 0.779637253269339, + "learning_rate": 4.7917088120073484e-06, + "loss": 0.4245, + "step": 3262 + }, + { + "epoch": 0.8068743818001978, + "grad_norm": 0.8095012054220153, + "learning_rate": 4.79157890683052e-06, + "loss": 0.4503, + "step": 3263 + }, + { + "epoch": 0.8071216617210683, + "grad_norm": 0.8194829000253359, + "learning_rate": 4.791448962919285e-06, + "loss": 0.4007, + "step": 3264 + }, + { + "epoch": 0.8073689416419386, + "grad_norm": 0.8104089346079402, + "learning_rate": 4.7913189802758405e-06, + "loss": 0.4139, + "step": 3265 + }, + { + "epoch": 0.8076162215628091, + "grad_norm": 0.8149788810165006, + "learning_rate": 4.791188958902382e-06, + "loss": 0.4001, + "step": 3266 + }, + { + "epoch": 0.8078635014836796, + "grad_norm": 0.7880461660692727, + "learning_rate": 4.791058898801109e-06, + "loss": 0.4242, + "step": 3267 + }, + { + "epoch": 0.80811078140455, + "grad_norm": 0.8440145345829424, + "learning_rate": 4.790928799974219e-06, + "loss": 0.4047, + "step": 3268 + }, + { + "epoch": 0.8083580613254204, + "grad_norm": 0.8081825088885769, + "learning_rate": 4.790798662423911e-06, + "loss": 0.4055, + "step": 3269 + }, + { + "epoch": 0.8086053412462908, + "grad_norm": 0.7968073833564706, + "learning_rate": 4.790668486152385e-06, + "loss": 0.4071, + "step": 3270 + }, + { + "epoch": 0.8088526211671613, + "grad_norm": 0.7841070913783371, + "learning_rate": 4.790538271161841e-06, + "loss": 0.4171, + "step": 3271 + }, + { + "epoch": 0.8090999010880316, + "grad_norm": 0.8286577863722515, + "learning_rate": 4.79040801745448e-06, + "loss": 0.3833, + "step": 3272 + }, + { + "epoch": 0.8093471810089021, + "grad_norm": 0.7644216257502693, + "learning_rate": 4.790277725032504e-06, + "loss": 0.4089, + "step": 3273 + }, + { + "epoch": 0.8095944609297725, + "grad_norm": 0.8298216917114288, + "learning_rate": 4.790147393898116e-06, + "loss": 0.4184, + "step": 3274 + }, + { + "epoch": 0.809841740850643, + "grad_norm": 0.8373160134004612, + "learning_rate": 4.790017024053517e-06, + "loss": 0.4097, + "step": 3275 + }, + { + "epoch": 0.8100890207715133, + "grad_norm": 0.8003961462949286, + "learning_rate": 4.789886615500912e-06, + "loss": 0.4208, + "step": 3276 + }, + { + "epoch": 0.8103363006923838, + "grad_norm": 0.8084367466040833, + "learning_rate": 4.789756168242506e-06, + "loss": 0.4008, + "step": 3277 + }, + { + "epoch": 0.8105835806132542, + "grad_norm": 0.8328646094691385, + "learning_rate": 4.789625682280503e-06, + "loss": 0.3932, + "step": 3278 + }, + { + "epoch": 0.8108308605341247, + "grad_norm": 0.817103508185311, + "learning_rate": 4.789495157617108e-06, + "loss": 0.4241, + "step": 3279 + }, + { + "epoch": 0.811078140454995, + "grad_norm": 0.7878968437266687, + "learning_rate": 4.789364594254529e-06, + "loss": 0.4245, + "step": 3280 + }, + { + "epoch": 0.8113254203758655, + "grad_norm": 0.7723691927519781, + "learning_rate": 4.78923399219497e-06, + "loss": 0.3915, + "step": 3281 + }, + { + "epoch": 0.8115727002967359, + "grad_norm": 0.8168370329750326, + "learning_rate": 4.789103351440641e-06, + "loss": 0.3639, + "step": 3282 + }, + { + "epoch": 0.8118199802176064, + "grad_norm": 0.7968216032656887, + "learning_rate": 4.788972671993751e-06, + "loss": 0.3938, + "step": 3283 + }, + { + "epoch": 0.8120672601384767, + "grad_norm": 0.8185040720148407, + "learning_rate": 4.788841953856506e-06, + "loss": 0.4247, + "step": 3284 + }, + { + "epoch": 0.8123145400593472, + "grad_norm": 0.7781045150914397, + "learning_rate": 4.788711197031118e-06, + "loss": 0.4036, + "step": 3285 + }, + { + "epoch": 0.8125618199802176, + "grad_norm": 0.7785731429716326, + "learning_rate": 4.788580401519794e-06, + "loss": 0.3986, + "step": 3286 + }, + { + "epoch": 0.8128090999010881, + "grad_norm": 0.8212032505997505, + "learning_rate": 4.7884495673247496e-06, + "loss": 0.4266, + "step": 3287 + }, + { + "epoch": 0.8130563798219584, + "grad_norm": 0.7484700477722402, + "learning_rate": 4.788318694448192e-06, + "loss": 0.4596, + "step": 3288 + }, + { + "epoch": 0.8133036597428289, + "grad_norm": 0.8209584736189361, + "learning_rate": 4.788187782892336e-06, + "loss": 0.3874, + "step": 3289 + }, + { + "epoch": 0.8135509396636993, + "grad_norm": 0.7842763948585224, + "learning_rate": 4.788056832659392e-06, + "loss": 0.4281, + "step": 3290 + }, + { + "epoch": 0.8137982195845698, + "grad_norm": 0.809014432481598, + "learning_rate": 4.787925843751576e-06, + "loss": 0.4021, + "step": 3291 + }, + { + "epoch": 0.8140454995054401, + "grad_norm": 0.8093191327442567, + "learning_rate": 4.787794816171101e-06, + "loss": 0.4046, + "step": 3292 + }, + { + "epoch": 0.8142927794263106, + "grad_norm": 0.7877046692252317, + "learning_rate": 4.7876637499201815e-06, + "loss": 0.4486, + "step": 3293 + }, + { + "epoch": 0.814540059347181, + "grad_norm": 0.798840334202066, + "learning_rate": 4.787532645001033e-06, + "loss": 0.4032, + "step": 3294 + }, + { + "epoch": 0.8147873392680515, + "grad_norm": 0.8255728729719902, + "learning_rate": 4.787401501415871e-06, + "loss": 0.452, + "step": 3295 + }, + { + "epoch": 0.8150346191889218, + "grad_norm": 0.797243236400735, + "learning_rate": 4.787270319166913e-06, + "loss": 0.4275, + "step": 3296 + }, + { + "epoch": 0.8152818991097923, + "grad_norm": 0.8349074546118905, + "learning_rate": 4.787139098256377e-06, + "loss": 0.3907, + "step": 3297 + }, + { + "epoch": 0.8155291790306627, + "grad_norm": 0.8110622701045463, + "learning_rate": 4.7870078386864795e-06, + "loss": 0.4357, + "step": 3298 + }, + { + "epoch": 0.8157764589515332, + "grad_norm": 0.7921550904844632, + "learning_rate": 4.78687654045944e-06, + "loss": 0.4277, + "step": 3299 + }, + { + "epoch": 0.8160237388724035, + "grad_norm": 0.7754244489726932, + "learning_rate": 4.7867452035774774e-06, + "loss": 0.412, + "step": 3300 + }, + { + "epoch": 0.816271018793274, + "grad_norm": 0.791617530083558, + "learning_rate": 4.786613828042813e-06, + "loss": 0.4174, + "step": 3301 + }, + { + "epoch": 0.8165182987141444, + "grad_norm": 0.7944566773450151, + "learning_rate": 4.786482413857666e-06, + "loss": 0.4104, + "step": 3302 + }, + { + "epoch": 0.8167655786350149, + "grad_norm": 0.8172864231650154, + "learning_rate": 4.786350961024257e-06, + "loss": 0.4232, + "step": 3303 + }, + { + "epoch": 0.8170128585558852, + "grad_norm": 0.844980815358175, + "learning_rate": 4.78621946954481e-06, + "loss": 0.3849, + "step": 3304 + }, + { + "epoch": 0.8172601384767557, + "grad_norm": 0.7757666993402713, + "learning_rate": 4.786087939421547e-06, + "loss": 0.4008, + "step": 3305 + }, + { + "epoch": 0.8175074183976261, + "grad_norm": 0.794111476583098, + "learning_rate": 4.7859563706566914e-06, + "loss": 0.3986, + "step": 3306 + }, + { + "epoch": 0.8177546983184966, + "grad_norm": 0.7773637704984339, + "learning_rate": 4.785824763252466e-06, + "loss": 0.3883, + "step": 3307 + }, + { + "epoch": 0.8180019782393669, + "grad_norm": 0.798335262514005, + "learning_rate": 4.785693117211095e-06, + "loss": 0.4097, + "step": 3308 + }, + { + "epoch": 0.8182492581602374, + "grad_norm": 0.8023022414258852, + "learning_rate": 4.785561432534806e-06, + "loss": 0.3993, + "step": 3309 + }, + { + "epoch": 0.8184965380811078, + "grad_norm": 0.8301219030490221, + "learning_rate": 4.7854297092258216e-06, + "loss": 0.3828, + "step": 3310 + }, + { + "epoch": 0.8187438180019783, + "grad_norm": 0.8153453055334471, + "learning_rate": 4.785297947286372e-06, + "loss": 0.4305, + "step": 3311 + }, + { + "epoch": 0.8189910979228486, + "grad_norm": 0.8111562590093313, + "learning_rate": 4.785166146718681e-06, + "loss": 0.4078, + "step": 3312 + }, + { + "epoch": 0.8192383778437191, + "grad_norm": 0.7826670595098245, + "learning_rate": 4.785034307524979e-06, + "loss": 0.4281, + "step": 3313 + }, + { + "epoch": 0.8194856577645895, + "grad_norm": 0.7793094315853807, + "learning_rate": 4.784902429707493e-06, + "loss": 0.4373, + "step": 3314 + }, + { + "epoch": 0.81973293768546, + "grad_norm": 0.7994195562682976, + "learning_rate": 4.784770513268452e-06, + "loss": 0.4028, + "step": 3315 + }, + { + "epoch": 0.8199802176063303, + "grad_norm": 0.777986973844296, + "learning_rate": 4.784638558210086e-06, + "loss": 0.4135, + "step": 3316 + }, + { + "epoch": 0.8202274975272008, + "grad_norm": 0.8011858291824131, + "learning_rate": 4.784506564534627e-06, + "loss": 0.4556, + "step": 3317 + }, + { + "epoch": 0.8204747774480712, + "grad_norm": 0.7695970936651433, + "learning_rate": 4.784374532244304e-06, + "loss": 0.4264, + "step": 3318 + }, + { + "epoch": 0.8207220573689417, + "grad_norm": 0.7854179279542661, + "learning_rate": 4.78424246134135e-06, + "loss": 0.3941, + "step": 3319 + }, + { + "epoch": 0.820969337289812, + "grad_norm": 0.8145456782664094, + "learning_rate": 4.784110351827996e-06, + "loss": 0.4303, + "step": 3320 + }, + { + "epoch": 0.8212166172106825, + "grad_norm": 0.7778866148610745, + "learning_rate": 4.783978203706476e-06, + "loss": 0.3849, + "step": 3321 + }, + { + "epoch": 0.8214638971315529, + "grad_norm": 0.8002029621999994, + "learning_rate": 4.783846016979024e-06, + "loss": 0.3992, + "step": 3322 + }, + { + "epoch": 0.8217111770524234, + "grad_norm": 0.8349311151236508, + "learning_rate": 4.7837137916478745e-06, + "loss": 0.3903, + "step": 3323 + }, + { + "epoch": 0.8219584569732937, + "grad_norm": 0.7790180389747006, + "learning_rate": 4.783581527715261e-06, + "loss": 0.3822, + "step": 3324 + }, + { + "epoch": 0.8222057368941642, + "grad_norm": 0.7629026266310897, + "learning_rate": 4.783449225183421e-06, + "loss": 0.4195, + "step": 3325 + }, + { + "epoch": 0.8224530168150346, + "grad_norm": 0.7649237556248503, + "learning_rate": 4.783316884054589e-06, + "loss": 0.4185, + "step": 3326 + }, + { + "epoch": 0.8227002967359051, + "grad_norm": 0.7809836127526398, + "learning_rate": 4.7831845043310034e-06, + "loss": 0.4325, + "step": 3327 + }, + { + "epoch": 0.8229475766567754, + "grad_norm": 0.771282433024842, + "learning_rate": 4.783052086014901e-06, + "loss": 0.4005, + "step": 3328 + }, + { + "epoch": 0.8231948565776459, + "grad_norm": 0.7837387184470643, + "learning_rate": 4.7829196291085205e-06, + "loss": 0.4038, + "step": 3329 + }, + { + "epoch": 0.8234421364985163, + "grad_norm": 0.7763923077993041, + "learning_rate": 4.7827871336141006e-06, + "loss": 0.4172, + "step": 3330 + }, + { + "epoch": 0.8236894164193868, + "grad_norm": 0.7888187786186903, + "learning_rate": 4.782654599533881e-06, + "loss": 0.4086, + "step": 3331 + }, + { + "epoch": 0.8239366963402571, + "grad_norm": 0.7964113323354067, + "learning_rate": 4.7825220268701015e-06, + "loss": 0.3942, + "step": 3332 + }, + { + "epoch": 0.8241839762611276, + "grad_norm": 0.7949089727526033, + "learning_rate": 4.782389415625003e-06, + "loss": 0.427, + "step": 3333 + }, + { + "epoch": 0.824431256181998, + "grad_norm": 0.7898222003899379, + "learning_rate": 4.782256765800828e-06, + "loss": 0.4198, + "step": 3334 + }, + { + "epoch": 0.8246785361028685, + "grad_norm": 0.7713468303220107, + "learning_rate": 4.782124077399818e-06, + "loss": 0.4082, + "step": 3335 + }, + { + "epoch": 0.8249258160237388, + "grad_norm": 0.760638496140234, + "learning_rate": 4.7819913504242156e-06, + "loss": 0.4271, + "step": 3336 + }, + { + "epoch": 0.8251730959446093, + "grad_norm": 0.7949387100448536, + "learning_rate": 4.7818585848762645e-06, + "loss": 0.4114, + "step": 3337 + }, + { + "epoch": 0.8254203758654797, + "grad_norm": 0.7580505449504422, + "learning_rate": 4.781725780758208e-06, + "loss": 0.4131, + "step": 3338 + }, + { + "epoch": 0.8256676557863502, + "grad_norm": 0.8045167857797156, + "learning_rate": 4.781592938072292e-06, + "loss": 0.4223, + "step": 3339 + }, + { + "epoch": 0.8259149357072205, + "grad_norm": 0.8159002530056874, + "learning_rate": 4.781460056820763e-06, + "loss": 0.4332, + "step": 3340 + }, + { + "epoch": 0.826162215628091, + "grad_norm": 0.7873685673757427, + "learning_rate": 4.781327137005865e-06, + "loss": 0.4053, + "step": 3341 + }, + { + "epoch": 0.8264094955489614, + "grad_norm": 0.8123786085064364, + "learning_rate": 4.781194178629844e-06, + "loss": 0.417, + "step": 3342 + }, + { + "epoch": 0.8266567754698319, + "grad_norm": 0.819766015202432, + "learning_rate": 4.781061181694949e-06, + "loss": 0.4162, + "step": 3343 + }, + { + "epoch": 0.8269040553907022, + "grad_norm": 0.7979483243492991, + "learning_rate": 4.78092814620343e-06, + "loss": 0.4317, + "step": 3344 + }, + { + "epoch": 0.8271513353115727, + "grad_norm": 0.7688666283808752, + "learning_rate": 4.780795072157532e-06, + "loss": 0.43, + "step": 3345 + }, + { + "epoch": 0.8273986152324432, + "grad_norm": 0.7990950578586927, + "learning_rate": 4.780661959559506e-06, + "loss": 0.372, + "step": 3346 + }, + { + "epoch": 0.8276458951533135, + "grad_norm": 0.7721501590793813, + "learning_rate": 4.780528808411602e-06, + "loss": 0.4185, + "step": 3347 + }, + { + "epoch": 0.827893175074184, + "grad_norm": 0.7854619913809469, + "learning_rate": 4.780395618716071e-06, + "loss": 0.3988, + "step": 3348 + }, + { + "epoch": 0.8281404549950544, + "grad_norm": 0.8290833887661643, + "learning_rate": 4.7802623904751626e-06, + "loss": 0.4358, + "step": 3349 + }, + { + "epoch": 0.8283877349159249, + "grad_norm": 0.8158026726230498, + "learning_rate": 4.780129123691131e-06, + "loss": 0.3959, + "step": 3350 + }, + { + "epoch": 0.8286350148367952, + "grad_norm": 0.8003195625474808, + "learning_rate": 4.779995818366227e-06, + "loss": 0.4162, + "step": 3351 + }, + { + "epoch": 0.8288822947576657, + "grad_norm": 0.7900036436217165, + "learning_rate": 4.779862474502705e-06, + "loss": 0.4293, + "step": 3352 + }, + { + "epoch": 0.8291295746785361, + "grad_norm": 0.8004447888167544, + "learning_rate": 4.779729092102818e-06, + "loss": 0.4007, + "step": 3353 + }, + { + "epoch": 0.8293768545994066, + "grad_norm": 0.7964423636908463, + "learning_rate": 4.779595671168822e-06, + "loss": 0.3976, + "step": 3354 + }, + { + "epoch": 0.829624134520277, + "grad_norm": 0.8067022591755529, + "learning_rate": 4.779462211702971e-06, + "loss": 0.4154, + "step": 3355 + }, + { + "epoch": 0.8298714144411474, + "grad_norm": 0.8050964956297638, + "learning_rate": 4.77932871370752e-06, + "loss": 0.417, + "step": 3356 + }, + { + "epoch": 0.8301186943620178, + "grad_norm": 0.7859285185827618, + "learning_rate": 4.779195177184728e-06, + "loss": 0.4388, + "step": 3357 + }, + { + "epoch": 0.8303659742828883, + "grad_norm": 0.8209308533484606, + "learning_rate": 4.779061602136851e-06, + "loss": 0.4343, + "step": 3358 + }, + { + "epoch": 0.8306132542037586, + "grad_norm": 0.7901126945376796, + "learning_rate": 4.778927988566146e-06, + "loss": 0.4181, + "step": 3359 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 0.8069670725815871, + "learning_rate": 4.778794336474873e-06, + "loss": 0.3819, + "step": 3360 + }, + { + "epoch": 0.8311078140454995, + "grad_norm": 0.7769685258141426, + "learning_rate": 4.778660645865288e-06, + "loss": 0.4084, + "step": 3361 + }, + { + "epoch": 0.83135509396637, + "grad_norm": 0.8246160503291277, + "learning_rate": 4.7785269167396545e-06, + "loss": 0.3947, + "step": 3362 + }, + { + "epoch": 0.8316023738872403, + "grad_norm": 0.8136590431488328, + "learning_rate": 4.778393149100231e-06, + "loss": 0.4101, + "step": 3363 + }, + { + "epoch": 0.8318496538081108, + "grad_norm": 0.7911097863165703, + "learning_rate": 4.778259342949279e-06, + "loss": 0.4493, + "step": 3364 + }, + { + "epoch": 0.8320969337289812, + "grad_norm": 0.812883523661831, + "learning_rate": 4.77812549828906e-06, + "loss": 0.3993, + "step": 3365 + }, + { + "epoch": 0.8323442136498517, + "grad_norm": 0.8022234638234113, + "learning_rate": 4.777991615121837e-06, + "loss": 0.4131, + "step": 3366 + }, + { + "epoch": 0.832591493570722, + "grad_norm": 0.7795760679299366, + "learning_rate": 4.777857693449871e-06, + "loss": 0.4185, + "step": 3367 + }, + { + "epoch": 0.8328387734915925, + "grad_norm": 0.7837567396640641, + "learning_rate": 4.777723733275429e-06, + "loss": 0.4314, + "step": 3368 + }, + { + "epoch": 0.8330860534124629, + "grad_norm": 0.7892606037338461, + "learning_rate": 4.7775897346007726e-06, + "loss": 0.4224, + "step": 3369 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.7831937189039634, + "learning_rate": 4.7774556974281685e-06, + "loss": 0.4399, + "step": 3370 + }, + { + "epoch": 0.8335806132542037, + "grad_norm": 0.8124136973914873, + "learning_rate": 4.77732162175988e-06, + "loss": 0.4537, + "step": 3371 + }, + { + "epoch": 0.8338278931750742, + "grad_norm": 0.7723322289863942, + "learning_rate": 4.777187507598177e-06, + "loss": 0.388, + "step": 3372 + }, + { + "epoch": 0.8340751730959446, + "grad_norm": 0.7743046452175267, + "learning_rate": 4.777053354945322e-06, + "loss": 0.416, + "step": 3373 + }, + { + "epoch": 0.8343224530168151, + "grad_norm": 0.8135790787271157, + "learning_rate": 4.776919163803587e-06, + "loss": 0.4058, + "step": 3374 + }, + { + "epoch": 0.8345697329376854, + "grad_norm": 0.7661956419487184, + "learning_rate": 4.776784934175237e-06, + "loss": 0.4246, + "step": 3375 + }, + { + "epoch": 0.8348170128585559, + "grad_norm": 0.80368496419414, + "learning_rate": 4.7766506660625414e-06, + "loss": 0.4271, + "step": 3376 + }, + { + "epoch": 0.8350642927794263, + "grad_norm": 0.7714493561021236, + "learning_rate": 4.776516359467771e-06, + "loss": 0.4371, + "step": 3377 + }, + { + "epoch": 0.8353115727002968, + "grad_norm": 0.8100023760100212, + "learning_rate": 4.776382014393195e-06, + "loss": 0.4019, + "step": 3378 + }, + { + "epoch": 0.8355588526211671, + "grad_norm": 0.7884436871457682, + "learning_rate": 4.776247630841085e-06, + "loss": 0.3934, + "step": 3379 + }, + { + "epoch": 0.8358061325420376, + "grad_norm": 0.7671160651595207, + "learning_rate": 4.776113208813712e-06, + "loss": 0.4291, + "step": 3380 + }, + { + "epoch": 0.836053412462908, + "grad_norm": 0.7938160957249958, + "learning_rate": 4.775978748313348e-06, + "loss": 0.4187, + "step": 3381 + }, + { + "epoch": 0.8363006923837785, + "grad_norm": 0.8224792228789037, + "learning_rate": 4.775844249342265e-06, + "loss": 0.4066, + "step": 3382 + }, + { + "epoch": 0.8365479723046488, + "grad_norm": 0.7905005579082747, + "learning_rate": 4.775709711902738e-06, + "loss": 0.4017, + "step": 3383 + }, + { + "epoch": 0.8367952522255193, + "grad_norm": 0.8057947131158326, + "learning_rate": 4.7755751359970405e-06, + "loss": 0.4054, + "step": 3384 + }, + { + "epoch": 0.8370425321463897, + "grad_norm": 0.8175423715679635, + "learning_rate": 4.775440521627447e-06, + "loss": 0.409, + "step": 3385 + }, + { + "epoch": 0.8372898120672602, + "grad_norm": 0.8008681519700376, + "learning_rate": 4.7753058687962325e-06, + "loss": 0.4221, + "step": 3386 + }, + { + "epoch": 0.8375370919881305, + "grad_norm": 0.7850063473346038, + "learning_rate": 4.775171177505674e-06, + "loss": 0.377, + "step": 3387 + }, + { + "epoch": 0.837784371909001, + "grad_norm": 0.7947371747318924, + "learning_rate": 4.775036447758048e-06, + "loss": 0.4143, + "step": 3388 + }, + { + "epoch": 0.8380316518298714, + "grad_norm": 0.7655744610178552, + "learning_rate": 4.774901679555631e-06, + "loss": 0.3918, + "step": 3389 + }, + { + "epoch": 0.8382789317507419, + "grad_norm": 0.7762446116793785, + "learning_rate": 4.774766872900702e-06, + "loss": 0.4381, + "step": 3390 + }, + { + "epoch": 0.8385262116716122, + "grad_norm": 0.8303410835002127, + "learning_rate": 4.7746320277955395e-06, + "loss": 0.3758, + "step": 3391 + }, + { + "epoch": 0.8387734915924827, + "grad_norm": 0.7985702070353878, + "learning_rate": 4.774497144242421e-06, + "loss": 0.3818, + "step": 3392 + }, + { + "epoch": 0.8390207715133531, + "grad_norm": 0.7920132547874397, + "learning_rate": 4.774362222243629e-06, + "loss": 0.4347, + "step": 3393 + }, + { + "epoch": 0.8392680514342236, + "grad_norm": 0.8030043793173209, + "learning_rate": 4.774227261801442e-06, + "loss": 0.3946, + "step": 3394 + }, + { + "epoch": 0.8395153313550939, + "grad_norm": 0.747732048647203, + "learning_rate": 4.774092262918143e-06, + "loss": 0.4537, + "step": 3395 + }, + { + "epoch": 0.8397626112759644, + "grad_norm": 0.840523303115213, + "learning_rate": 4.773957225596013e-06, + "loss": 0.4157, + "step": 3396 + }, + { + "epoch": 0.8400098911968348, + "grad_norm": 0.8110229327341179, + "learning_rate": 4.773822149837334e-06, + "loss": 0.4228, + "step": 3397 + }, + { + "epoch": 0.8402571711177053, + "grad_norm": 0.7683317822796379, + "learning_rate": 4.77368703564439e-06, + "loss": 0.4239, + "step": 3398 + }, + { + "epoch": 0.8405044510385756, + "grad_norm": 0.782689216594726, + "learning_rate": 4.7735518830194635e-06, + "loss": 0.4216, + "step": 3399 + }, + { + "epoch": 0.8407517309594461, + "grad_norm": 0.7907944083692497, + "learning_rate": 4.773416691964842e-06, + "loss": 0.3986, + "step": 3400 + }, + { + "epoch": 0.8409990108803165, + "grad_norm": 0.7655934114807923, + "learning_rate": 4.7732814624828075e-06, + "loss": 0.4165, + "step": 3401 + }, + { + "epoch": 0.841246290801187, + "grad_norm": 0.8110535411816441, + "learning_rate": 4.773146194575647e-06, + "loss": 0.4225, + "step": 3402 + }, + { + "epoch": 0.8414935707220573, + "grad_norm": 0.8100327921657837, + "learning_rate": 4.773010888245647e-06, + "loss": 0.42, + "step": 3403 + }, + { + "epoch": 0.8417408506429278, + "grad_norm": 0.7530456461140547, + "learning_rate": 4.772875543495094e-06, + "loss": 0.4248, + "step": 3404 + }, + { + "epoch": 0.8419881305637982, + "grad_norm": 0.7879760448848543, + "learning_rate": 4.772740160326276e-06, + "loss": 0.4314, + "step": 3405 + }, + { + "epoch": 0.8422354104846687, + "grad_norm": 0.8606818943078248, + "learning_rate": 4.772604738741482e-06, + "loss": 0.3978, + "step": 3406 + }, + { + "epoch": 0.842482690405539, + "grad_norm": 0.7549716602552334, + "learning_rate": 4.7724692787430006e-06, + "loss": 0.4289, + "step": 3407 + }, + { + "epoch": 0.8427299703264095, + "grad_norm": 0.7633266980017795, + "learning_rate": 4.772333780333121e-06, + "loss": 0.4274, + "step": 3408 + }, + { + "epoch": 0.8429772502472799, + "grad_norm": 0.765061521639252, + "learning_rate": 4.772198243514135e-06, + "loss": 0.41, + "step": 3409 + }, + { + "epoch": 0.8432245301681504, + "grad_norm": 0.7783131637239651, + "learning_rate": 4.772062668288332e-06, + "loss": 0.4222, + "step": 3410 + }, + { + "epoch": 0.8434718100890207, + "grad_norm": 0.7613563968237722, + "learning_rate": 4.771927054658003e-06, + "loss": 0.4069, + "step": 3411 + }, + { + "epoch": 0.8437190900098912, + "grad_norm": 0.8018125862329029, + "learning_rate": 4.771791402625442e-06, + "loss": 0.411, + "step": 3412 + }, + { + "epoch": 0.8439663699307616, + "grad_norm": 0.8350909510173182, + "learning_rate": 4.771655712192942e-06, + "loss": 0.4001, + "step": 3413 + }, + { + "epoch": 0.844213649851632, + "grad_norm": 0.7891030802950871, + "learning_rate": 4.771519983362795e-06, + "loss": 0.4467, + "step": 3414 + }, + { + "epoch": 0.8444609297725024, + "grad_norm": 0.8525872477343289, + "learning_rate": 4.771384216137297e-06, + "loss": 0.3848, + "step": 3415 + }, + { + "epoch": 0.8447082096933729, + "grad_norm": 0.8166630385029069, + "learning_rate": 4.771248410518742e-06, + "loss": 0.4117, + "step": 3416 + }, + { + "epoch": 0.8449554896142433, + "grad_norm": 0.8127217435899223, + "learning_rate": 4.771112566509424e-06, + "loss": 0.4449, + "step": 3417 + }, + { + "epoch": 0.8452027695351138, + "grad_norm": 0.7989077116900508, + "learning_rate": 4.770976684111643e-06, + "loss": 0.4309, + "step": 3418 + }, + { + "epoch": 0.8454500494559841, + "grad_norm": 0.7742207773656186, + "learning_rate": 4.770840763327691e-06, + "loss": 0.4169, + "step": 3419 + }, + { + "epoch": 0.8456973293768546, + "grad_norm": 0.8009671807975369, + "learning_rate": 4.770704804159869e-06, + "loss": 0.4272, + "step": 3420 + }, + { + "epoch": 0.845944609297725, + "grad_norm": 0.7762396091246215, + "learning_rate": 4.770568806610474e-06, + "loss": 0.4071, + "step": 3421 + }, + { + "epoch": 0.8461918892185954, + "grad_norm": 0.79048982739237, + "learning_rate": 4.770432770681804e-06, + "loss": 0.4142, + "step": 3422 + }, + { + "epoch": 0.8464391691394659, + "grad_norm": 0.7925764780479834, + "learning_rate": 4.7702966963761595e-06, + "loss": 0.4406, + "step": 3423 + }, + { + "epoch": 0.8466864490603363, + "grad_norm": 0.8229734761276529, + "learning_rate": 4.770160583695841e-06, + "loss": 0.4051, + "step": 3424 + }, + { + "epoch": 0.8469337289812068, + "grad_norm": 0.7812549831681974, + "learning_rate": 4.7700244326431485e-06, + "loss": 0.426, + "step": 3425 + }, + { + "epoch": 0.8471810089020771, + "grad_norm": 0.8153456675383441, + "learning_rate": 4.769888243220382e-06, + "loss": 0.4306, + "step": 3426 + }, + { + "epoch": 0.8474282888229476, + "grad_norm": 0.8228319181988596, + "learning_rate": 4.769752015429846e-06, + "loss": 0.4103, + "step": 3427 + }, + { + "epoch": 0.847675568743818, + "grad_norm": 0.8350865864963404, + "learning_rate": 4.769615749273842e-06, + "loss": 0.4163, + "step": 3428 + }, + { + "epoch": 0.8479228486646885, + "grad_norm": 0.8015176047552672, + "learning_rate": 4.769479444754672e-06, + "loss": 0.4199, + "step": 3429 + }, + { + "epoch": 0.8481701285855588, + "grad_norm": 0.8301892125593503, + "learning_rate": 4.769343101874643e-06, + "loss": 0.4164, + "step": 3430 + }, + { + "epoch": 0.8484174085064293, + "grad_norm": 0.8073898224130004, + "learning_rate": 4.769206720636056e-06, + "loss": 0.4482, + "step": 3431 + }, + { + "epoch": 0.8486646884272997, + "grad_norm": 0.8309199793371376, + "learning_rate": 4.769070301041219e-06, + "loss": 0.4566, + "step": 3432 + }, + { + "epoch": 0.8489119683481702, + "grad_norm": 0.7678874274807607, + "learning_rate": 4.768933843092436e-06, + "loss": 0.3946, + "step": 3433 + }, + { + "epoch": 0.8491592482690405, + "grad_norm": 0.7549429356875449, + "learning_rate": 4.768797346792015e-06, + "loss": 0.4287, + "step": 3434 + }, + { + "epoch": 0.849406528189911, + "grad_norm": 0.7876208123929652, + "learning_rate": 4.768660812142263e-06, + "loss": 0.405, + "step": 3435 + }, + { + "epoch": 0.8496538081107814, + "grad_norm": 0.8291311437122808, + "learning_rate": 4.768524239145487e-06, + "loss": 0.4087, + "step": 3436 + }, + { + "epoch": 0.8499010880316519, + "grad_norm": 0.8268178012129144, + "learning_rate": 4.768387627803996e-06, + "loss": 0.4097, + "step": 3437 + }, + { + "epoch": 0.8501483679525222, + "grad_norm": 0.7851962802571755, + "learning_rate": 4.7682509781200995e-06, + "loss": 0.435, + "step": 3438 + }, + { + "epoch": 0.8503956478733927, + "grad_norm": 0.8224586447080902, + "learning_rate": 4.768114290096106e-06, + "loss": 0.407, + "step": 3439 + }, + { + "epoch": 0.8506429277942631, + "grad_norm": 0.8156576914563988, + "learning_rate": 4.7679775637343275e-06, + "loss": 0.4067, + "step": 3440 + }, + { + "epoch": 0.8508902077151336, + "grad_norm": 0.8416785505706572, + "learning_rate": 4.767840799037074e-06, + "loss": 0.4297, + "step": 3441 + }, + { + "epoch": 0.8511374876360039, + "grad_norm": 0.7581910072086763, + "learning_rate": 4.767703996006658e-06, + "loss": 0.439, + "step": 3442 + }, + { + "epoch": 0.8513847675568744, + "grad_norm": 0.8036041084178673, + "learning_rate": 4.767567154645392e-06, + "loss": 0.3876, + "step": 3443 + }, + { + "epoch": 0.8516320474777448, + "grad_norm": 0.8077582774150418, + "learning_rate": 4.767430274955587e-06, + "loss": 0.4529, + "step": 3444 + }, + { + "epoch": 0.8518793273986153, + "grad_norm": 0.7940995416916322, + "learning_rate": 4.767293356939559e-06, + "loss": 0.4045, + "step": 3445 + }, + { + "epoch": 0.8521266073194856, + "grad_norm": 0.8380267975480605, + "learning_rate": 4.7671564005996215e-06, + "loss": 0.4236, + "step": 3446 + }, + { + "epoch": 0.8523738872403561, + "grad_norm": 0.7782724418448519, + "learning_rate": 4.767019405938089e-06, + "loss": 0.4095, + "step": 3447 + }, + { + "epoch": 0.8526211671612265, + "grad_norm": 0.8634191317576759, + "learning_rate": 4.766882372957278e-06, + "loss": 0.4273, + "step": 3448 + }, + { + "epoch": 0.852868447082097, + "grad_norm": 0.7886128298243266, + "learning_rate": 4.7667453016595044e-06, + "loss": 0.4336, + "step": 3449 + }, + { + "epoch": 0.8531157270029673, + "grad_norm": 0.7839418502660845, + "learning_rate": 4.766608192047084e-06, + "loss": 0.4042, + "step": 3450 + }, + { + "epoch": 0.8533630069238378, + "grad_norm": 0.779315107663836, + "learning_rate": 4.766471044122337e-06, + "loss": 0.4116, + "step": 3451 + }, + { + "epoch": 0.8536102868447082, + "grad_norm": 0.8130332426670765, + "learning_rate": 4.766333857887579e-06, + "loss": 0.3986, + "step": 3452 + }, + { + "epoch": 0.8538575667655787, + "grad_norm": 0.8316154784681559, + "learning_rate": 4.7661966333451305e-06, + "loss": 0.3984, + "step": 3453 + }, + { + "epoch": 0.854104846686449, + "grad_norm": 0.7697149493976957, + "learning_rate": 4.766059370497309e-06, + "loss": 0.4037, + "step": 3454 + }, + { + "epoch": 0.8543521266073195, + "grad_norm": 0.8222479951816859, + "learning_rate": 4.765922069346437e-06, + "loss": 0.3778, + "step": 3455 + }, + { + "epoch": 0.8545994065281899, + "grad_norm": 0.8444888274019681, + "learning_rate": 4.765784729894834e-06, + "loss": 0.3792, + "step": 3456 + }, + { + "epoch": 0.8548466864490604, + "grad_norm": 0.8094354195563492, + "learning_rate": 4.765647352144822e-06, + "loss": 0.4236, + "step": 3457 + }, + { + "epoch": 0.8550939663699307, + "grad_norm": 0.7945699779942299, + "learning_rate": 4.7655099360987225e-06, + "loss": 0.419, + "step": 3458 + }, + { + "epoch": 0.8553412462908012, + "grad_norm": 0.7623631949030496, + "learning_rate": 4.765372481758859e-06, + "loss": 0.4047, + "step": 3459 + }, + { + "epoch": 0.8555885262116716, + "grad_norm": 0.8006217233235627, + "learning_rate": 4.7652349891275525e-06, + "loss": 0.4199, + "step": 3460 + }, + { + "epoch": 0.8558358061325421, + "grad_norm": 0.8071519894620478, + "learning_rate": 4.765097458207131e-06, + "loss": 0.429, + "step": 3461 + }, + { + "epoch": 0.8560830860534124, + "grad_norm": 0.7998455842155523, + "learning_rate": 4.764959888999917e-06, + "loss": 0.3849, + "step": 3462 + }, + { + "epoch": 0.8563303659742829, + "grad_norm": 0.7739611268657531, + "learning_rate": 4.7648222815082345e-06, + "loss": 0.4204, + "step": 3463 + }, + { + "epoch": 0.8565776458951533, + "grad_norm": 0.7927935553548467, + "learning_rate": 4.764684635734412e-06, + "loss": 0.4075, + "step": 3464 + }, + { + "epoch": 0.8568249258160238, + "grad_norm": 0.7803766875608863, + "learning_rate": 4.764546951680775e-06, + "loss": 0.4338, + "step": 3465 + }, + { + "epoch": 0.8570722057368941, + "grad_norm": 0.8016805038029136, + "learning_rate": 4.76440922934965e-06, + "loss": 0.4044, + "step": 3466 + }, + { + "epoch": 0.8573194856577646, + "grad_norm": 0.8185021142595483, + "learning_rate": 4.764271468743367e-06, + "loss": 0.4072, + "step": 3467 + }, + { + "epoch": 0.857566765578635, + "grad_norm": 0.7925774571128128, + "learning_rate": 4.764133669864253e-06, + "loss": 0.4161, + "step": 3468 + }, + { + "epoch": 0.8578140454995055, + "grad_norm": 0.8062989314810408, + "learning_rate": 4.763995832714636e-06, + "loss": 0.3727, + "step": 3469 + }, + { + "epoch": 0.8580613254203758, + "grad_norm": 0.7957403676082958, + "learning_rate": 4.763857957296849e-06, + "loss": 0.4329, + "step": 3470 + }, + { + "epoch": 0.8583086053412463, + "grad_norm": 0.8384575802088189, + "learning_rate": 4.7637200436132194e-06, + "loss": 0.4043, + "step": 3471 + }, + { + "epoch": 0.8585558852621167, + "grad_norm": 0.807113219782392, + "learning_rate": 4.76358209166608e-06, + "loss": 0.4014, + "step": 3472 + }, + { + "epoch": 0.8588031651829872, + "grad_norm": 0.8043008375712767, + "learning_rate": 4.7634441014577635e-06, + "loss": 0.4005, + "step": 3473 + }, + { + "epoch": 0.8590504451038575, + "grad_norm": 0.7842916597723525, + "learning_rate": 4.763306072990601e-06, + "loss": 0.391, + "step": 3474 + }, + { + "epoch": 0.859297725024728, + "grad_norm": 0.7986848229547799, + "learning_rate": 4.763168006266925e-06, + "loss": 0.4219, + "step": 3475 + }, + { + "epoch": 0.8595450049455984, + "grad_norm": 0.8198764103686891, + "learning_rate": 4.76302990128907e-06, + "loss": 0.3762, + "step": 3476 + }, + { + "epoch": 0.8597922848664689, + "grad_norm": 0.7972917241817145, + "learning_rate": 4.76289175805937e-06, + "loss": 0.4477, + "step": 3477 + }, + { + "epoch": 0.8600395647873392, + "grad_norm": 0.8004509954487877, + "learning_rate": 4.762753576580161e-06, + "loss": 0.4219, + "step": 3478 + }, + { + "epoch": 0.8602868447082097, + "grad_norm": 0.7815966556815089, + "learning_rate": 4.762615356853779e-06, + "loss": 0.4269, + "step": 3479 + }, + { + "epoch": 0.8605341246290801, + "grad_norm": 0.8140688940745657, + "learning_rate": 4.762477098882558e-06, + "loss": 0.4224, + "step": 3480 + }, + { + "epoch": 0.8607814045499506, + "grad_norm": 0.8085670395416449, + "learning_rate": 4.762338802668838e-06, + "loss": 0.4033, + "step": 3481 + }, + { + "epoch": 0.8610286844708209, + "grad_norm": 0.8114398196934406, + "learning_rate": 4.762200468214953e-06, + "loss": 0.405, + "step": 3482 + }, + { + "epoch": 0.8612759643916914, + "grad_norm": 0.7957454762289632, + "learning_rate": 4.7620620955232435e-06, + "loss": 0.3963, + "step": 3483 + }, + { + "epoch": 0.8615232443125618, + "grad_norm": 0.7716297802639464, + "learning_rate": 4.7619236845960495e-06, + "loss": 0.3725, + "step": 3484 + }, + { + "epoch": 0.8617705242334323, + "grad_norm": 0.7498275856769105, + "learning_rate": 4.7617852354357085e-06, + "loss": 0.408, + "step": 3485 + }, + { + "epoch": 0.8620178041543026, + "grad_norm": 0.8209720763151387, + "learning_rate": 4.761646748044561e-06, + "loss": 0.4222, + "step": 3486 + }, + { + "epoch": 0.8622650840751731, + "grad_norm": 0.7811869145474315, + "learning_rate": 4.761508222424948e-06, + "loss": 0.4029, + "step": 3487 + }, + { + "epoch": 0.8625123639960435, + "grad_norm": 0.774668109393603, + "learning_rate": 4.761369658579213e-06, + "loss": 0.4089, + "step": 3488 + }, + { + "epoch": 0.862759643916914, + "grad_norm": 0.774508807733018, + "learning_rate": 4.761231056509694e-06, + "loss": 0.4087, + "step": 3489 + }, + { + "epoch": 0.8630069238377843, + "grad_norm": 0.8043812063644213, + "learning_rate": 4.761092416218737e-06, + "loss": 0.4133, + "step": 3490 + }, + { + "epoch": 0.8632542037586548, + "grad_norm": 0.8291349458889816, + "learning_rate": 4.760953737708685e-06, + "loss": 0.4119, + "step": 3491 + }, + { + "epoch": 0.8635014836795252, + "grad_norm": 0.7920356279520397, + "learning_rate": 4.7608150209818815e-06, + "loss": 0.3771, + "step": 3492 + }, + { + "epoch": 0.8637487636003957, + "grad_norm": 0.8072523281275508, + "learning_rate": 4.760676266040671e-06, + "loss": 0.4068, + "step": 3493 + }, + { + "epoch": 0.863996043521266, + "grad_norm": 0.7899951360565894, + "learning_rate": 4.7605374728874e-06, + "loss": 0.4221, + "step": 3494 + }, + { + "epoch": 0.8642433234421365, + "grad_norm": 0.7947865396993798, + "learning_rate": 4.760398641524413e-06, + "loss": 0.4459, + "step": 3495 + }, + { + "epoch": 0.8644906033630069, + "grad_norm": 0.7709649897446718, + "learning_rate": 4.760259771954058e-06, + "loss": 0.4485, + "step": 3496 + }, + { + "epoch": 0.8647378832838774, + "grad_norm": 0.8107187352574584, + "learning_rate": 4.7601208641786814e-06, + "loss": 0.4296, + "step": 3497 + }, + { + "epoch": 0.8649851632047477, + "grad_norm": 0.808173993566204, + "learning_rate": 4.759981918200632e-06, + "loss": 0.3832, + "step": 3498 + }, + { + "epoch": 0.8652324431256182, + "grad_norm": 0.7733226232262745, + "learning_rate": 4.7598429340222565e-06, + "loss": 0.4412, + "step": 3499 + }, + { + "epoch": 0.8654797230464887, + "grad_norm": 0.7802454421928897, + "learning_rate": 4.7597039116459065e-06, + "loss": 0.44, + "step": 3500 + }, + { + "epoch": 0.865727002967359, + "grad_norm": 0.7551425262142368, + "learning_rate": 4.75956485107393e-06, + "loss": 0.4164, + "step": 3501 + }, + { + "epoch": 0.8659742828882295, + "grad_norm": 0.7607363103302663, + "learning_rate": 4.75942575230868e-06, + "loss": 0.4278, + "step": 3502 + }, + { + "epoch": 0.8662215628090999, + "grad_norm": 0.7928219599691498, + "learning_rate": 4.759286615352504e-06, + "loss": 0.4044, + "step": 3503 + }, + { + "epoch": 0.8664688427299704, + "grad_norm": 0.7746463131564298, + "learning_rate": 4.759147440207758e-06, + "loss": 0.4119, + "step": 3504 + }, + { + "epoch": 0.8667161226508407, + "grad_norm": 0.8030529000110455, + "learning_rate": 4.7590082268767906e-06, + "loss": 0.3837, + "step": 3505 + }, + { + "epoch": 0.8669634025717112, + "grad_norm": 0.8373758297950461, + "learning_rate": 4.758868975361958e-06, + "loss": 0.4134, + "step": 3506 + }, + { + "epoch": 0.8672106824925816, + "grad_norm": 0.847300972466465, + "learning_rate": 4.758729685665612e-06, + "loss": 0.4137, + "step": 3507 + }, + { + "epoch": 0.8674579624134521, + "grad_norm": 0.822308602045918, + "learning_rate": 4.758590357790107e-06, + "loss": 0.4266, + "step": 3508 + }, + { + "epoch": 0.8677052423343224, + "grad_norm": 0.7751883119985707, + "learning_rate": 4.7584509917378e-06, + "loss": 0.4068, + "step": 3509 + }, + { + "epoch": 0.8679525222551929, + "grad_norm": 0.7472605460681306, + "learning_rate": 4.758311587511044e-06, + "loss": 0.4192, + "step": 3510 + }, + { + "epoch": 0.8681998021760633, + "grad_norm": 0.7846163021154071, + "learning_rate": 4.758172145112198e-06, + "loss": 0.4236, + "step": 3511 + }, + { + "epoch": 0.8684470820969338, + "grad_norm": 0.8040444889492215, + "learning_rate": 4.758032664543617e-06, + "loss": 0.4068, + "step": 3512 + }, + { + "epoch": 0.8686943620178041, + "grad_norm": 0.7854285165758984, + "learning_rate": 4.757893145807659e-06, + "loss": 0.4087, + "step": 3513 + }, + { + "epoch": 0.8689416419386746, + "grad_norm": 0.8123417665178312, + "learning_rate": 4.757753588906684e-06, + "loss": 0.3667, + "step": 3514 + }, + { + "epoch": 0.869188921859545, + "grad_norm": 0.783236901262894, + "learning_rate": 4.757613993843048e-06, + "loss": 0.4303, + "step": 3515 + }, + { + "epoch": 0.8694362017804155, + "grad_norm": 0.7909692214395833, + "learning_rate": 4.757474360619113e-06, + "loss": 0.4018, + "step": 3516 + }, + { + "epoch": 0.8696834817012858, + "grad_norm": 0.8481627332283962, + "learning_rate": 4.757334689237239e-06, + "loss": 0.4067, + "step": 3517 + }, + { + "epoch": 0.8699307616221563, + "grad_norm": 0.7615068125391466, + "learning_rate": 4.757194979699784e-06, + "loss": 0.4368, + "step": 3518 + }, + { + "epoch": 0.8701780415430267, + "grad_norm": 0.801705773275248, + "learning_rate": 4.757055232009113e-06, + "loss": 0.4375, + "step": 3519 + }, + { + "epoch": 0.8704253214638972, + "grad_norm": 0.8221944799768854, + "learning_rate": 4.756915446167587e-06, + "loss": 0.389, + "step": 3520 + }, + { + "epoch": 0.8706726013847675, + "grad_norm": 0.7862370813701264, + "learning_rate": 4.756775622177568e-06, + "loss": 0.42, + "step": 3521 + }, + { + "epoch": 0.870919881305638, + "grad_norm": 0.7597195820726511, + "learning_rate": 4.756635760041421e-06, + "loss": 0.3909, + "step": 3522 + }, + { + "epoch": 0.8711671612265084, + "grad_norm": 0.8076111401656502, + "learning_rate": 4.7564958597615085e-06, + "loss": 0.4135, + "step": 3523 + }, + { + "epoch": 0.8714144411473789, + "grad_norm": 0.8192089416161316, + "learning_rate": 4.756355921340197e-06, + "loss": 0.3929, + "step": 3524 + }, + { + "epoch": 0.8716617210682492, + "grad_norm": 0.8545678851827884, + "learning_rate": 4.7562159447798485e-06, + "loss": 0.3968, + "step": 3525 + }, + { + "epoch": 0.8719090009891197, + "grad_norm": 0.7846539307702833, + "learning_rate": 4.756075930082833e-06, + "loss": 0.4232, + "step": 3526 + }, + { + "epoch": 0.8721562809099901, + "grad_norm": 0.7645948649778886, + "learning_rate": 4.755935877251515e-06, + "loss": 0.3992, + "step": 3527 + }, + { + "epoch": 0.8724035608308606, + "grad_norm": 0.8118160536750945, + "learning_rate": 4.755795786288262e-06, + "loss": 0.3972, + "step": 3528 + }, + { + "epoch": 0.8726508407517309, + "grad_norm": 0.8161177141362831, + "learning_rate": 4.7556556571954414e-06, + "loss": 0.4113, + "step": 3529 + }, + { + "epoch": 0.8728981206726014, + "grad_norm": 0.7550227042035801, + "learning_rate": 4.755515489975424e-06, + "loss": 0.4113, + "step": 3530 + }, + { + "epoch": 0.8731454005934718, + "grad_norm": 0.7841549066488641, + "learning_rate": 4.755375284630577e-06, + "loss": 0.3964, + "step": 3531 + }, + { + "epoch": 0.8733926805143423, + "grad_norm": 0.7504063160687925, + "learning_rate": 4.75523504116327e-06, + "loss": 0.4069, + "step": 3532 + }, + { + "epoch": 0.8736399604352126, + "grad_norm": 0.8184342628628202, + "learning_rate": 4.755094759575875e-06, + "loss": 0.4015, + "step": 3533 + }, + { + "epoch": 0.8738872403560831, + "grad_norm": 0.7998457424375706, + "learning_rate": 4.754954439870763e-06, + "loss": 0.3947, + "step": 3534 + }, + { + "epoch": 0.8741345202769535, + "grad_norm": 0.7918492983576808, + "learning_rate": 4.754814082050305e-06, + "loss": 0.4025, + "step": 3535 + }, + { + "epoch": 0.874381800197824, + "grad_norm": 0.826262336173255, + "learning_rate": 4.7546736861168745e-06, + "loss": 0.4033, + "step": 3536 + }, + { + "epoch": 0.8746290801186943, + "grad_norm": 0.8139016489751498, + "learning_rate": 4.754533252072843e-06, + "loss": 0.4127, + "step": 3537 + }, + { + "epoch": 0.8748763600395648, + "grad_norm": 0.8212185358376062, + "learning_rate": 4.754392779920585e-06, + "loss": 0.4218, + "step": 3538 + }, + { + "epoch": 0.8751236399604352, + "grad_norm": 0.8123701589959741, + "learning_rate": 4.754252269662476e-06, + "loss": 0.4212, + "step": 3539 + }, + { + "epoch": 0.8753709198813057, + "grad_norm": 0.7360744756469361, + "learning_rate": 4.754111721300889e-06, + "loss": 0.4146, + "step": 3540 + }, + { + "epoch": 0.875618199802176, + "grad_norm": 0.7851546187922345, + "learning_rate": 4.753971134838202e-06, + "loss": 0.4179, + "step": 3541 + }, + { + "epoch": 0.8758654797230465, + "grad_norm": 0.7721531761045248, + "learning_rate": 4.753830510276789e-06, + "loss": 0.4435, + "step": 3542 + }, + { + "epoch": 0.8761127596439169, + "grad_norm": 0.7785287323560217, + "learning_rate": 4.7536898476190295e-06, + "loss": 0.4226, + "step": 3543 + }, + { + "epoch": 0.8763600395647874, + "grad_norm": 0.7967610882789702, + "learning_rate": 4.753549146867299e-06, + "loss": 0.4254, + "step": 3544 + }, + { + "epoch": 0.8766073194856577, + "grad_norm": 0.8096712743989584, + "learning_rate": 4.753408408023976e-06, + "loss": 0.3831, + "step": 3545 + }, + { + "epoch": 0.8768545994065282, + "grad_norm": 0.7636869739295397, + "learning_rate": 4.75326763109144e-06, + "loss": 0.427, + "step": 3546 + }, + { + "epoch": 0.8771018793273986, + "grad_norm": 0.771914882644073, + "learning_rate": 4.753126816072071e-06, + "loss": 0.3803, + "step": 3547 + }, + { + "epoch": 0.8773491592482691, + "grad_norm": 0.7653991534397886, + "learning_rate": 4.752985962968247e-06, + "loss": 0.4219, + "step": 3548 + }, + { + "epoch": 0.8775964391691394, + "grad_norm": 0.8189550150303511, + "learning_rate": 4.752845071782352e-06, + "loss": 0.4325, + "step": 3549 + }, + { + "epoch": 0.8778437190900099, + "grad_norm": 0.7692889761933356, + "learning_rate": 4.752704142516765e-06, + "loss": 0.3801, + "step": 3550 + }, + { + "epoch": 0.8780909990108803, + "grad_norm": 0.7996162297522486, + "learning_rate": 4.7525631751738696e-06, + "loss": 0.4072, + "step": 3551 + }, + { + "epoch": 0.8783382789317508, + "grad_norm": 0.7928390591889003, + "learning_rate": 4.752422169756048e-06, + "loss": 0.4187, + "step": 3552 + }, + { + "epoch": 0.8785855588526211, + "grad_norm": 0.8121066718850152, + "learning_rate": 4.7522811262656835e-06, + "loss": 0.4002, + "step": 3553 + }, + { + "epoch": 0.8788328387734916, + "grad_norm": 0.7825370933836572, + "learning_rate": 4.752140044705161e-06, + "loss": 0.4406, + "step": 3554 + }, + { + "epoch": 0.879080118694362, + "grad_norm": 0.7621264478238917, + "learning_rate": 4.751998925076863e-06, + "loss": 0.417, + "step": 3555 + }, + { + "epoch": 0.8793273986152325, + "grad_norm": 0.7962830855667626, + "learning_rate": 4.7518577673831765e-06, + "loss": 0.4318, + "step": 3556 + }, + { + "epoch": 0.8795746785361028, + "grad_norm": 0.7761619234912889, + "learning_rate": 4.7517165716264866e-06, + "loss": 0.4136, + "step": 3557 + }, + { + "epoch": 0.8798219584569733, + "grad_norm": 0.8059167848034989, + "learning_rate": 4.751575337809183e-06, + "loss": 0.4084, + "step": 3558 + }, + { + "epoch": 0.8800692383778437, + "grad_norm": 0.8124913029375966, + "learning_rate": 4.751434065933648e-06, + "loss": 0.4045, + "step": 3559 + }, + { + "epoch": 0.8803165182987142, + "grad_norm": 0.7814128265870901, + "learning_rate": 4.751292756002273e-06, + "loss": 0.4258, + "step": 3560 + }, + { + "epoch": 0.8805637982195845, + "grad_norm": 0.8039331824474487, + "learning_rate": 4.751151408017445e-06, + "loss": 0.4056, + "step": 3561 + }, + { + "epoch": 0.880811078140455, + "grad_norm": 0.8269124609448832, + "learning_rate": 4.751010021981555e-06, + "loss": 0.4022, + "step": 3562 + }, + { + "epoch": 0.8810583580613254, + "grad_norm": 0.8059296222218977, + "learning_rate": 4.75086859789699e-06, + "loss": 0.4137, + "step": 3563 + }, + { + "epoch": 0.8813056379821959, + "grad_norm": 0.8257234417066797, + "learning_rate": 4.750727135766143e-06, + "loss": 0.4017, + "step": 3564 + }, + { + "epoch": 0.8815529179030662, + "grad_norm": 0.7868621973148561, + "learning_rate": 4.750585635591404e-06, + "loss": 0.4026, + "step": 3565 + }, + { + "epoch": 0.8818001978239367, + "grad_norm": 0.8049536206026485, + "learning_rate": 4.750444097375165e-06, + "loss": 0.3992, + "step": 3566 + }, + { + "epoch": 0.8820474777448071, + "grad_norm": 0.8126870559722053, + "learning_rate": 4.750302521119819e-06, + "loss": 0.4046, + "step": 3567 + }, + { + "epoch": 0.8822947576656776, + "grad_norm": 0.8647264312706785, + "learning_rate": 4.750160906827758e-06, + "loss": 0.4248, + "step": 3568 + }, + { + "epoch": 0.8825420375865479, + "grad_norm": 0.742871129693659, + "learning_rate": 4.750019254501376e-06, + "loss": 0.4345, + "step": 3569 + }, + { + "epoch": 0.8827893175074184, + "grad_norm": 0.8191164799952924, + "learning_rate": 4.749877564143067e-06, + "loss": 0.424, + "step": 3570 + }, + { + "epoch": 0.8830365974282888, + "grad_norm": 0.781549768862716, + "learning_rate": 4.749735835755227e-06, + "loss": 0.4298, + "step": 3571 + }, + { + "epoch": 0.8832838773491593, + "grad_norm": 0.8053711023623178, + "learning_rate": 4.749594069340252e-06, + "loss": 0.413, + "step": 3572 + }, + { + "epoch": 0.8835311572700296, + "grad_norm": 0.8262873093017418, + "learning_rate": 4.749452264900536e-06, + "loss": 0.427, + "step": 3573 + }, + { + "epoch": 0.8837784371909001, + "grad_norm": 0.7651153848178928, + "learning_rate": 4.749310422438478e-06, + "loss": 0.4328, + "step": 3574 + }, + { + "epoch": 0.8840257171117705, + "grad_norm": 0.821565389817644, + "learning_rate": 4.749168541956475e-06, + "loss": 0.4025, + "step": 3575 + }, + { + "epoch": 0.884272997032641, + "grad_norm": 0.8771003427825715, + "learning_rate": 4.749026623456925e-06, + "loss": 0.3983, + "step": 3576 + }, + { + "epoch": 0.8845202769535113, + "grad_norm": 0.8193490682302352, + "learning_rate": 4.748884666942226e-06, + "loss": 0.4175, + "step": 3577 + }, + { + "epoch": 0.8847675568743818, + "grad_norm": 0.7881409124419267, + "learning_rate": 4.748742672414779e-06, + "loss": 0.4199, + "step": 3578 + }, + { + "epoch": 0.8850148367952523, + "grad_norm": 0.7939165086785582, + "learning_rate": 4.748600639876983e-06, + "loss": 0.4059, + "step": 3579 + }, + { + "epoch": 0.8852621167161226, + "grad_norm": 0.7968599992051688, + "learning_rate": 4.748458569331239e-06, + "loss": 0.4462, + "step": 3580 + }, + { + "epoch": 0.8855093966369931, + "grad_norm": 0.8478289232046818, + "learning_rate": 4.7483164607799495e-06, + "loss": 0.3866, + "step": 3581 + }, + { + "epoch": 0.8857566765578635, + "grad_norm": 0.7865185184922773, + "learning_rate": 4.748174314225515e-06, + "loss": 0.415, + "step": 3582 + }, + { + "epoch": 0.886003956478734, + "grad_norm": 0.7813174862734311, + "learning_rate": 4.748032129670339e-06, + "loss": 0.4436, + "step": 3583 + }, + { + "epoch": 0.8862512363996043, + "grad_norm": 0.8084377607109554, + "learning_rate": 4.747889907116826e-06, + "loss": 0.3953, + "step": 3584 + }, + { + "epoch": 0.8864985163204748, + "grad_norm": 0.8036795735133182, + "learning_rate": 4.747747646567378e-06, + "loss": 0.3895, + "step": 3585 + }, + { + "epoch": 0.8867457962413452, + "grad_norm": 0.8098549722269822, + "learning_rate": 4.747605348024399e-06, + "loss": 0.4254, + "step": 3586 + }, + { + "epoch": 0.8869930761622157, + "grad_norm": 0.8081490604273397, + "learning_rate": 4.747463011490297e-06, + "loss": 0.4039, + "step": 3587 + }, + { + "epoch": 0.887240356083086, + "grad_norm": 0.7912104338630691, + "learning_rate": 4.747320636967476e-06, + "loss": 0.4045, + "step": 3588 + }, + { + "epoch": 0.8874876360039565, + "grad_norm": 0.8185827574341613, + "learning_rate": 4.747178224458343e-06, + "loss": 0.3962, + "step": 3589 + }, + { + "epoch": 0.8877349159248269, + "grad_norm": 0.804633825051263, + "learning_rate": 4.7470357739653055e-06, + "loss": 0.4151, + "step": 3590 + }, + { + "epoch": 0.8879821958456974, + "grad_norm": 0.8270157295952916, + "learning_rate": 4.746893285490771e-06, + "loss": 0.3745, + "step": 3591 + }, + { + "epoch": 0.8882294757665677, + "grad_norm": 0.794707377298943, + "learning_rate": 4.746750759037148e-06, + "loss": 0.4317, + "step": 3592 + }, + { + "epoch": 0.8884767556874382, + "grad_norm": 0.815353758945955, + "learning_rate": 4.746608194606845e-06, + "loss": 0.4009, + "step": 3593 + }, + { + "epoch": 0.8887240356083086, + "grad_norm": 0.7851323106201907, + "learning_rate": 4.746465592202273e-06, + "loss": 0.3853, + "step": 3594 + }, + { + "epoch": 0.8889713155291791, + "grad_norm": 0.7726796238264945, + "learning_rate": 4.7463229518258424e-06, + "loss": 0.4068, + "step": 3595 + }, + { + "epoch": 0.8892185954500494, + "grad_norm": 0.842200959923491, + "learning_rate": 4.746180273479963e-06, + "loss": 0.4096, + "step": 3596 + }, + { + "epoch": 0.8894658753709199, + "grad_norm": 0.7998155688698845, + "learning_rate": 4.746037557167047e-06, + "loss": 0.4087, + "step": 3597 + }, + { + "epoch": 0.8897131552917903, + "grad_norm": 0.7937182610227635, + "learning_rate": 4.745894802889507e-06, + "loss": 0.422, + "step": 3598 + }, + { + "epoch": 0.8899604352126608, + "grad_norm": 0.8111751844441354, + "learning_rate": 4.745752010649755e-06, + "loss": 0.39, + "step": 3599 + }, + { + "epoch": 0.8902077151335311, + "grad_norm": 0.7714646667870872, + "learning_rate": 4.745609180450207e-06, + "loss": 0.4192, + "step": 3600 + }, + { + "epoch": 0.8904549950544016, + "grad_norm": 0.7554803232211826, + "learning_rate": 4.745466312293275e-06, + "loss": 0.4306, + "step": 3601 + }, + { + "epoch": 0.890702274975272, + "grad_norm": 0.7757947210978946, + "learning_rate": 4.745323406181375e-06, + "loss": 0.4142, + "step": 3602 + }, + { + "epoch": 0.8909495548961425, + "grad_norm": 0.760204249748166, + "learning_rate": 4.7451804621169214e-06, + "loss": 0.4446, + "step": 3603 + }, + { + "epoch": 0.8911968348170128, + "grad_norm": 0.7870209440781624, + "learning_rate": 4.745037480102332e-06, + "loss": 0.3776, + "step": 3604 + }, + { + "epoch": 0.8914441147378833, + "grad_norm": 0.7948033100309609, + "learning_rate": 4.744894460140021e-06, + "loss": 0.4145, + "step": 3605 + }, + { + "epoch": 0.8916913946587537, + "grad_norm": 0.8094052552150242, + "learning_rate": 4.7447514022324085e-06, + "loss": 0.3901, + "step": 3606 + }, + { + "epoch": 0.8919386745796242, + "grad_norm": 0.8210557157284226, + "learning_rate": 4.744608306381912e-06, + "loss": 0.444, + "step": 3607 + }, + { + "epoch": 0.8921859545004945, + "grad_norm": 0.8307590466318364, + "learning_rate": 4.744465172590949e-06, + "loss": 0.3834, + "step": 3608 + }, + { + "epoch": 0.892433234421365, + "grad_norm": 0.7843833688120964, + "learning_rate": 4.7443220008619405e-06, + "loss": 0.4132, + "step": 3609 + }, + { + "epoch": 0.8926805143422354, + "grad_norm": 0.7944958514405386, + "learning_rate": 4.744178791197305e-06, + "loss": 0.4175, + "step": 3610 + }, + { + "epoch": 0.8929277942631059, + "grad_norm": 0.7998369942157381, + "learning_rate": 4.744035543599464e-06, + "loss": 0.3737, + "step": 3611 + }, + { + "epoch": 0.8931750741839762, + "grad_norm": 0.8009239130803604, + "learning_rate": 4.74389225807084e-06, + "loss": 0.4203, + "step": 3612 + }, + { + "epoch": 0.8934223541048467, + "grad_norm": 0.800511982201152, + "learning_rate": 4.743748934613853e-06, + "loss": 0.4466, + "step": 3613 + }, + { + "epoch": 0.8936696340257171, + "grad_norm": 0.7686053384089964, + "learning_rate": 4.743605573230926e-06, + "loss": 0.4101, + "step": 3614 + }, + { + "epoch": 0.8939169139465876, + "grad_norm": 0.8060434016909004, + "learning_rate": 4.7434621739244826e-06, + "loss": 0.4232, + "step": 3615 + }, + { + "epoch": 0.8941641938674579, + "grad_norm": 0.8230882644339693, + "learning_rate": 4.7433187366969465e-06, + "loss": 0.4046, + "step": 3616 + }, + { + "epoch": 0.8944114737883284, + "grad_norm": 0.7989957008420914, + "learning_rate": 4.743175261550743e-06, + "loss": 0.4167, + "step": 3617 + }, + { + "epoch": 0.8946587537091988, + "grad_norm": 0.8394401197348738, + "learning_rate": 4.7430317484882956e-06, + "loss": 0.4009, + "step": 3618 + }, + { + "epoch": 0.8949060336300693, + "grad_norm": 0.8094052684678341, + "learning_rate": 4.7428881975120325e-06, + "loss": 0.3933, + "step": 3619 + }, + { + "epoch": 0.8951533135509396, + "grad_norm": 0.7851229819463934, + "learning_rate": 4.742744608624377e-06, + "loss": 0.4113, + "step": 3620 + }, + { + "epoch": 0.8954005934718101, + "grad_norm": 0.7662309230132146, + "learning_rate": 4.742600981827759e-06, + "loss": 0.4065, + "step": 3621 + }, + { + "epoch": 0.8956478733926805, + "grad_norm": 0.7710743555606467, + "learning_rate": 4.7424573171246045e-06, + "loss": 0.4206, + "step": 3622 + }, + { + "epoch": 0.895895153313551, + "grad_norm": 0.8099130000859085, + "learning_rate": 4.742313614517342e-06, + "loss": 0.3821, + "step": 3623 + }, + { + "epoch": 0.8961424332344213, + "grad_norm": 0.8223717726070423, + "learning_rate": 4.7421698740084024e-06, + "loss": 0.4351, + "step": 3624 + }, + { + "epoch": 0.8963897131552918, + "grad_norm": 0.785125594303034, + "learning_rate": 4.742026095600213e-06, + "loss": 0.3946, + "step": 3625 + }, + { + "epoch": 0.8966369930761622, + "grad_norm": 0.8069026278651512, + "learning_rate": 4.741882279295204e-06, + "loss": 0.3847, + "step": 3626 + }, + { + "epoch": 0.8968842729970327, + "grad_norm": 0.8105176440474819, + "learning_rate": 4.7417384250958085e-06, + "loss": 0.4036, + "step": 3627 + }, + { + "epoch": 0.897131552917903, + "grad_norm": 0.8156431875624227, + "learning_rate": 4.741594533004455e-06, + "loss": 0.3943, + "step": 3628 + }, + { + "epoch": 0.8973788328387735, + "grad_norm": 0.8325389032978646, + "learning_rate": 4.74145060302358e-06, + "loss": 0.3985, + "step": 3629 + }, + { + "epoch": 0.8976261127596439, + "grad_norm": 0.7812029183112011, + "learning_rate": 4.741306635155613e-06, + "loss": 0.4174, + "step": 3630 + }, + { + "epoch": 0.8978733926805144, + "grad_norm": 0.7443379316555324, + "learning_rate": 4.741162629402987e-06, + "loss": 0.4244, + "step": 3631 + }, + { + "epoch": 0.8981206726013847, + "grad_norm": 0.7918379844224285, + "learning_rate": 4.741018585768139e-06, + "loss": 0.4298, + "step": 3632 + }, + { + "epoch": 0.8983679525222552, + "grad_norm": 0.8155608902187095, + "learning_rate": 4.740874504253501e-06, + "loss": 0.4136, + "step": 3633 + }, + { + "epoch": 0.8986152324431256, + "grad_norm": 0.798431437148323, + "learning_rate": 4.740730384861511e-06, + "loss": 0.4152, + "step": 3634 + }, + { + "epoch": 0.8988625123639961, + "grad_norm": 0.7817921688685995, + "learning_rate": 4.740586227594602e-06, + "loss": 0.4322, + "step": 3635 + }, + { + "epoch": 0.8991097922848664, + "grad_norm": 0.801265782510891, + "learning_rate": 4.740442032455213e-06, + "loss": 0.3742, + "step": 3636 + }, + { + "epoch": 0.8993570722057369, + "grad_norm": 0.774678482565866, + "learning_rate": 4.740297799445781e-06, + "loss": 0.4431, + "step": 3637 + }, + { + "epoch": 0.8996043521266073, + "grad_norm": 0.7968222612747385, + "learning_rate": 4.740153528568743e-06, + "loss": 0.4176, + "step": 3638 + }, + { + "epoch": 0.8998516320474778, + "grad_norm": 0.7887370276926967, + "learning_rate": 4.740009219826538e-06, + "loss": 0.4488, + "step": 3639 + }, + { + "epoch": 0.9000989119683481, + "grad_norm": 0.8051668166188332, + "learning_rate": 4.739864873221607e-06, + "loss": 0.4105, + "step": 3640 + }, + { + "epoch": 0.9003461918892186, + "grad_norm": 0.7819691265369665, + "learning_rate": 4.739720488756387e-06, + "loss": 0.4241, + "step": 3641 + }, + { + "epoch": 0.900593471810089, + "grad_norm": 0.834960143043667, + "learning_rate": 4.73957606643332e-06, + "loss": 0.4472, + "step": 3642 + }, + { + "epoch": 0.9008407517309595, + "grad_norm": 0.8080501371876072, + "learning_rate": 4.739431606254847e-06, + "loss": 0.3909, + "step": 3643 + }, + { + "epoch": 0.9010880316518298, + "grad_norm": 0.7919301864894568, + "learning_rate": 4.73928710822341e-06, + "loss": 0.412, + "step": 3644 + }, + { + "epoch": 0.9013353115727003, + "grad_norm": 0.7911907598771045, + "learning_rate": 4.739142572341451e-06, + "loss": 0.437, + "step": 3645 + }, + { + "epoch": 0.9015825914935707, + "grad_norm": 0.7705392923487695, + "learning_rate": 4.738997998611413e-06, + "loss": 0.4042, + "step": 3646 + }, + { + "epoch": 0.9018298714144412, + "grad_norm": 0.8233674027397053, + "learning_rate": 4.7388533870357415e-06, + "loss": 0.414, + "step": 3647 + }, + { + "epoch": 0.9020771513353115, + "grad_norm": 0.7503969461053938, + "learning_rate": 4.738708737616879e-06, + "loss": 0.4066, + "step": 3648 + }, + { + "epoch": 0.902324431256182, + "grad_norm": 0.8049770365300944, + "learning_rate": 4.73856405035727e-06, + "loss": 0.41, + "step": 3649 + }, + { + "epoch": 0.9025717111770524, + "grad_norm": 0.8109425037455092, + "learning_rate": 4.7384193252593606e-06, + "loss": 0.4445, + "step": 3650 + }, + { + "epoch": 0.9028189910979229, + "grad_norm": 0.8369589101352245, + "learning_rate": 4.7382745623255985e-06, + "loss": 0.4313, + "step": 3651 + }, + { + "epoch": 0.9030662710187932, + "grad_norm": 0.8046568527922425, + "learning_rate": 4.73812976155843e-06, + "loss": 0.3856, + "step": 3652 + }, + { + "epoch": 0.9033135509396637, + "grad_norm": 0.8099061520890892, + "learning_rate": 4.737984922960301e-06, + "loss": 0.4034, + "step": 3653 + }, + { + "epoch": 0.9035608308605341, + "grad_norm": 0.7989778433300206, + "learning_rate": 4.737840046533662e-06, + "loss": 0.3918, + "step": 3654 + }, + { + "epoch": 0.9038081107814046, + "grad_norm": 0.7793440943753789, + "learning_rate": 4.737695132280961e-06, + "loss": 0.3995, + "step": 3655 + }, + { + "epoch": 0.904055390702275, + "grad_norm": 0.7676663113860338, + "learning_rate": 4.737550180204646e-06, + "loss": 0.3792, + "step": 3656 + }, + { + "epoch": 0.9043026706231454, + "grad_norm": 0.8045089888506259, + "learning_rate": 4.737405190307169e-06, + "loss": 0.4119, + "step": 3657 + }, + { + "epoch": 0.9045499505440159, + "grad_norm": 0.7690497853722972, + "learning_rate": 4.7372601625909805e-06, + "loss": 0.4096, + "step": 3658 + }, + { + "epoch": 0.9047972304648862, + "grad_norm": 0.8071716454958764, + "learning_rate": 4.737115097058532e-06, + "loss": 0.4048, + "step": 3659 + }, + { + "epoch": 0.9050445103857567, + "grad_norm": 0.7895776592417868, + "learning_rate": 4.736969993712275e-06, + "loss": 0.4167, + "step": 3660 + }, + { + "epoch": 0.9052917903066271, + "grad_norm": 0.8240625463753163, + "learning_rate": 4.736824852554661e-06, + "loss": 0.403, + "step": 3661 + }, + { + "epoch": 0.9055390702274976, + "grad_norm": 0.8195618957323566, + "learning_rate": 4.736679673588146e-06, + "loss": 0.3815, + "step": 3662 + }, + { + "epoch": 0.905786350148368, + "grad_norm": 0.7834351939345308, + "learning_rate": 4.736534456815182e-06, + "loss": 0.4219, + "step": 3663 + }, + { + "epoch": 0.9060336300692384, + "grad_norm": 0.8324618159473429, + "learning_rate": 4.736389202238224e-06, + "loss": 0.3964, + "step": 3664 + }, + { + "epoch": 0.9062809099901088, + "grad_norm": 0.8600961054014756, + "learning_rate": 4.736243909859727e-06, + "loss": 0.3904, + "step": 3665 + }, + { + "epoch": 0.9065281899109793, + "grad_norm": 0.7577065431947011, + "learning_rate": 4.736098579682148e-06, + "loss": 0.4157, + "step": 3666 + }, + { + "epoch": 0.9067754698318496, + "grad_norm": 0.7866848892631093, + "learning_rate": 4.735953211707942e-06, + "loss": 0.403, + "step": 3667 + }, + { + "epoch": 0.9070227497527201, + "grad_norm": 0.7505834429164107, + "learning_rate": 4.735807805939568e-06, + "loss": 0.4294, + "step": 3668 + }, + { + "epoch": 0.9072700296735905, + "grad_norm": 0.8346896722270837, + "learning_rate": 4.735662362379482e-06, + "loss": 0.4088, + "step": 3669 + }, + { + "epoch": 0.907517309594461, + "grad_norm": 0.8556942291684705, + "learning_rate": 4.735516881030143e-06, + "loss": 0.3931, + "step": 3670 + }, + { + "epoch": 0.9077645895153313, + "grad_norm": 0.8109171826553419, + "learning_rate": 4.7353713618940104e-06, + "loss": 0.3865, + "step": 3671 + }, + { + "epoch": 0.9080118694362018, + "grad_norm": 0.7873661354765638, + "learning_rate": 4.735225804973543e-06, + "loss": 0.3986, + "step": 3672 + }, + { + "epoch": 0.9082591493570722, + "grad_norm": 0.8094256714029084, + "learning_rate": 4.735080210271202e-06, + "loss": 0.4157, + "step": 3673 + }, + { + "epoch": 0.9085064292779427, + "grad_norm": 0.8159543106898893, + "learning_rate": 4.734934577789449e-06, + "loss": 0.3597, + "step": 3674 + }, + { + "epoch": 0.908753709198813, + "grad_norm": 0.826502911376005, + "learning_rate": 4.734788907530744e-06, + "loss": 0.3858, + "step": 3675 + }, + { + "epoch": 0.9090009891196835, + "grad_norm": 0.7757107363293632, + "learning_rate": 4.734643199497551e-06, + "loss": 0.4146, + "step": 3676 + }, + { + "epoch": 0.9092482690405539, + "grad_norm": 0.7897304584611788, + "learning_rate": 4.73449745369233e-06, + "loss": 0.3961, + "step": 3677 + }, + { + "epoch": 0.9094955489614244, + "grad_norm": 0.8082471594665909, + "learning_rate": 4.734351670117548e-06, + "loss": 0.3993, + "step": 3678 + }, + { + "epoch": 0.9097428288822947, + "grad_norm": 0.8032744698939259, + "learning_rate": 4.734205848775667e-06, + "loss": 0.4244, + "step": 3679 + }, + { + "epoch": 0.9099901088031652, + "grad_norm": 0.7602920123766033, + "learning_rate": 4.734059989669153e-06, + "loss": 0.3965, + "step": 3680 + }, + { + "epoch": 0.9102373887240356, + "grad_norm": 0.8009664738803186, + "learning_rate": 4.73391409280047e-06, + "loss": 0.4085, + "step": 3681 + }, + { + "epoch": 0.9104846686449061, + "grad_norm": 0.8213732647282298, + "learning_rate": 4.733768158172086e-06, + "loss": 0.3911, + "step": 3682 + }, + { + "epoch": 0.9107319485657764, + "grad_norm": 0.7848207476521984, + "learning_rate": 4.733622185786466e-06, + "loss": 0.4079, + "step": 3683 + }, + { + "epoch": 0.9109792284866469, + "grad_norm": 0.7988798539247747, + "learning_rate": 4.733476175646079e-06, + "loss": 0.4248, + "step": 3684 + }, + { + "epoch": 0.9112265084075173, + "grad_norm": 0.791433923608785, + "learning_rate": 4.733330127753391e-06, + "loss": 0.374, + "step": 3685 + }, + { + "epoch": 0.9114737883283878, + "grad_norm": 0.8508840280452902, + "learning_rate": 4.733184042110872e-06, + "loss": 0.3851, + "step": 3686 + }, + { + "epoch": 0.9117210682492581, + "grad_norm": 0.7563054793404609, + "learning_rate": 4.733037918720991e-06, + "loss": 0.4036, + "step": 3687 + }, + { + "epoch": 0.9119683481701286, + "grad_norm": 0.7806311523663997, + "learning_rate": 4.732891757586217e-06, + "loss": 0.4269, + "step": 3688 + }, + { + "epoch": 0.912215628090999, + "grad_norm": 0.8016731889784404, + "learning_rate": 4.732745558709022e-06, + "loss": 0.4159, + "step": 3689 + }, + { + "epoch": 0.9124629080118695, + "grad_norm": 0.841535918475194, + "learning_rate": 4.732599322091878e-06, + "loss": 0.3978, + "step": 3690 + }, + { + "epoch": 0.9127101879327398, + "grad_norm": 0.8294101182941553, + "learning_rate": 4.732453047737254e-06, + "loss": 0.3808, + "step": 3691 + }, + { + "epoch": 0.9129574678536103, + "grad_norm": 0.7925075807432463, + "learning_rate": 4.7323067356476236e-06, + "loss": 0.39, + "step": 3692 + }, + { + "epoch": 0.9132047477744807, + "grad_norm": 0.8122022329751005, + "learning_rate": 4.7321603858254615e-06, + "loss": 0.3956, + "step": 3693 + }, + { + "epoch": 0.9134520276953512, + "grad_norm": 0.8872703411191297, + "learning_rate": 4.732013998273239e-06, + "loss": 0.3825, + "step": 3694 + }, + { + "epoch": 0.9136993076162215, + "grad_norm": 0.8172203043153907, + "learning_rate": 4.7318675729934325e-06, + "loss": 0.4327, + "step": 3695 + }, + { + "epoch": 0.913946587537092, + "grad_norm": 0.7698700210872599, + "learning_rate": 4.731721109988516e-06, + "loss": 0.4249, + "step": 3696 + }, + { + "epoch": 0.9141938674579624, + "grad_norm": 0.7909208463040441, + "learning_rate": 4.731574609260965e-06, + "loss": 0.413, + "step": 3697 + }, + { + "epoch": 0.9144411473788329, + "grad_norm": 0.85111940345262, + "learning_rate": 4.7314280708132555e-06, + "loss": 0.3766, + "step": 3698 + }, + { + "epoch": 0.9146884272997032, + "grad_norm": 0.8267173832504716, + "learning_rate": 4.731281494647866e-06, + "loss": 0.4056, + "step": 3699 + }, + { + "epoch": 0.9149357072205737, + "grad_norm": 0.781799158173567, + "learning_rate": 4.731134880767273e-06, + "loss": 0.3793, + "step": 3700 + }, + { + "epoch": 0.9151829871414441, + "grad_norm": 0.7796258192015334, + "learning_rate": 4.730988229173955e-06, + "loss": 0.4309, + "step": 3701 + }, + { + "epoch": 0.9154302670623146, + "grad_norm": 0.8493662394700314, + "learning_rate": 4.7308415398703896e-06, + "loss": 0.4401, + "step": 3702 + }, + { + "epoch": 0.9156775469831849, + "grad_norm": 0.8252101322373908, + "learning_rate": 4.730694812859058e-06, + "loss": 0.425, + "step": 3703 + }, + { + "epoch": 0.9159248269040554, + "grad_norm": 0.8436892482893468, + "learning_rate": 4.73054804814244e-06, + "loss": 0.4001, + "step": 3704 + }, + { + "epoch": 0.9161721068249258, + "grad_norm": 0.7990937103631333, + "learning_rate": 4.730401245723015e-06, + "loss": 0.4528, + "step": 3705 + }, + { + "epoch": 0.9164193867457963, + "grad_norm": 0.7426409943836156, + "learning_rate": 4.730254405603266e-06, + "loss": 0.4139, + "step": 3706 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.8458020418541371, + "learning_rate": 4.730107527785675e-06, + "loss": 0.3864, + "step": 3707 + }, + { + "epoch": 0.9169139465875371, + "grad_norm": 0.7763715601421739, + "learning_rate": 4.729960612272724e-06, + "loss": 0.413, + "step": 3708 + }, + { + "epoch": 0.9171612265084075, + "grad_norm": 0.772693666164887, + "learning_rate": 4.729813659066895e-06, + "loss": 0.3964, + "step": 3709 + }, + { + "epoch": 0.917408506429278, + "grad_norm": 0.7765688668068649, + "learning_rate": 4.729666668170675e-06, + "loss": 0.3879, + "step": 3710 + }, + { + "epoch": 0.9176557863501483, + "grad_norm": 0.7526284361207995, + "learning_rate": 4.729519639586546e-06, + "loss": 0.4087, + "step": 3711 + }, + { + "epoch": 0.9179030662710188, + "grad_norm": 0.7880056084680264, + "learning_rate": 4.729372573316994e-06, + "loss": 0.3952, + "step": 3712 + }, + { + "epoch": 0.9181503461918892, + "grad_norm": 0.8119548569197702, + "learning_rate": 4.729225469364506e-06, + "loss": 0.4106, + "step": 3713 + }, + { + "epoch": 0.9183976261127597, + "grad_norm": 0.8080727821692407, + "learning_rate": 4.729078327731566e-06, + "loss": 0.4168, + "step": 3714 + }, + { + "epoch": 0.91864490603363, + "grad_norm": 0.7656106325715241, + "learning_rate": 4.728931148420663e-06, + "loss": 0.4209, + "step": 3715 + }, + { + "epoch": 0.9188921859545005, + "grad_norm": 0.7482334472892916, + "learning_rate": 4.728783931434285e-06, + "loss": 0.427, + "step": 3716 + }, + { + "epoch": 0.9191394658753709, + "grad_norm": 0.7750948068078986, + "learning_rate": 4.7286366767749195e-06, + "loss": 0.3864, + "step": 3717 + }, + { + "epoch": 0.9193867457962414, + "grad_norm": 0.802420863654844, + "learning_rate": 4.728489384445055e-06, + "loss": 0.4016, + "step": 3718 + }, + { + "epoch": 0.9196340257171117, + "grad_norm": 0.7601304461080065, + "learning_rate": 4.728342054447183e-06, + "loss": 0.4446, + "step": 3719 + }, + { + "epoch": 0.9198813056379822, + "grad_norm": 0.8089882839309193, + "learning_rate": 4.728194686783792e-06, + "loss": 0.3926, + "step": 3720 + }, + { + "epoch": 0.9201285855588526, + "grad_norm": 0.7875812610990902, + "learning_rate": 4.728047281457374e-06, + "loss": 0.3865, + "step": 3721 + }, + { + "epoch": 0.920375865479723, + "grad_norm": 0.8304273323171134, + "learning_rate": 4.7278998384704215e-06, + "loss": 0.3886, + "step": 3722 + }, + { + "epoch": 0.9206231454005934, + "grad_norm": 0.7793749974809633, + "learning_rate": 4.727752357825424e-06, + "loss": 0.3991, + "step": 3723 + }, + { + "epoch": 0.9208704253214639, + "grad_norm": 0.7655198670497095, + "learning_rate": 4.7276048395248755e-06, + "loss": 0.4016, + "step": 3724 + }, + { + "epoch": 0.9211177052423343, + "grad_norm": 0.7785444053497739, + "learning_rate": 4.7274572835712706e-06, + "loss": 0.3837, + "step": 3725 + }, + { + "epoch": 0.9213649851632048, + "grad_norm": 0.794178493200149, + "learning_rate": 4.727309689967103e-06, + "loss": 0.4141, + "step": 3726 + }, + { + "epoch": 0.9216122650840751, + "grad_norm": 0.7616756186660109, + "learning_rate": 4.727162058714867e-06, + "loss": 0.4016, + "step": 3727 + }, + { + "epoch": 0.9218595450049456, + "grad_norm": 0.827382828719098, + "learning_rate": 4.7270143898170575e-06, + "loss": 0.3829, + "step": 3728 + }, + { + "epoch": 0.922106824925816, + "grad_norm": 0.7964010549622547, + "learning_rate": 4.7268666832761725e-06, + "loss": 0.3913, + "step": 3729 + }, + { + "epoch": 0.9223541048466865, + "grad_norm": 0.8188786377099656, + "learning_rate": 4.726718939094706e-06, + "loss": 0.4117, + "step": 3730 + }, + { + "epoch": 0.9226013847675568, + "grad_norm": 0.7849413309879384, + "learning_rate": 4.726571157275157e-06, + "loss": 0.4084, + "step": 3731 + }, + { + "epoch": 0.9228486646884273, + "grad_norm": 0.8072618169447672, + "learning_rate": 4.726423337820023e-06, + "loss": 0.4058, + "step": 3732 + }, + { + "epoch": 0.9230959446092978, + "grad_norm": 0.807105005048506, + "learning_rate": 4.726275480731803e-06, + "loss": 0.3882, + "step": 3733 + }, + { + "epoch": 0.9233432245301681, + "grad_norm": 0.760185353289157, + "learning_rate": 4.726127586012996e-06, + "loss": 0.4132, + "step": 3734 + }, + { + "epoch": 0.9235905044510386, + "grad_norm": 0.7754454391486514, + "learning_rate": 4.7259796536661016e-06, + "loss": 0.426, + "step": 3735 + }, + { + "epoch": 0.923837784371909, + "grad_norm": 0.8142785355549925, + "learning_rate": 4.725831683693621e-06, + "loss": 0.398, + "step": 3736 + }, + { + "epoch": 0.9240850642927795, + "grad_norm": 0.7681071514435952, + "learning_rate": 4.725683676098054e-06, + "loss": 0.4131, + "step": 3737 + }, + { + "epoch": 0.9243323442136498, + "grad_norm": 0.7718039403892808, + "learning_rate": 4.725535630881904e-06, + "loss": 0.428, + "step": 3738 + }, + { + "epoch": 0.9245796241345203, + "grad_norm": 0.7862011212454346, + "learning_rate": 4.725387548047672e-06, + "loss": 0.3932, + "step": 3739 + }, + { + "epoch": 0.9248269040553907, + "grad_norm": 0.7927179649242823, + "learning_rate": 4.725239427597862e-06, + "loss": 0.4271, + "step": 3740 + }, + { + "epoch": 0.9250741839762612, + "grad_norm": 0.7860521643345544, + "learning_rate": 4.725091269534976e-06, + "loss": 0.3935, + "step": 3741 + }, + { + "epoch": 0.9253214638971315, + "grad_norm": 0.796238835215501, + "learning_rate": 4.724943073861521e-06, + "loss": 0.3948, + "step": 3742 + }, + { + "epoch": 0.925568743818002, + "grad_norm": 0.8406638282243356, + "learning_rate": 4.724794840580001e-06, + "loss": 0.4006, + "step": 3743 + }, + { + "epoch": 0.9258160237388724, + "grad_norm": 0.7855740670031731, + "learning_rate": 4.724646569692919e-06, + "loss": 0.3985, + "step": 3744 + }, + { + "epoch": 0.9260633036597429, + "grad_norm": 0.7799292341163916, + "learning_rate": 4.7244982612027845e-06, + "loss": 0.3917, + "step": 3745 + }, + { + "epoch": 0.9263105835806132, + "grad_norm": 0.7735208824748577, + "learning_rate": 4.724349915112103e-06, + "loss": 0.4268, + "step": 3746 + }, + { + "epoch": 0.9265578635014837, + "grad_norm": 0.7861442577149689, + "learning_rate": 4.724201531423383e-06, + "loss": 0.4117, + "step": 3747 + }, + { + "epoch": 0.9268051434223541, + "grad_norm": 0.7621689766137228, + "learning_rate": 4.72405311013913e-06, + "loss": 0.4052, + "step": 3748 + }, + { + "epoch": 0.9270524233432246, + "grad_norm": 0.7738986753047935, + "learning_rate": 4.723904651261855e-06, + "loss": 0.4308, + "step": 3749 + }, + { + "epoch": 0.9272997032640949, + "grad_norm": 0.7464914050650943, + "learning_rate": 4.723756154794068e-06, + "loss": 0.4125, + "step": 3750 + }, + { + "epoch": 0.9275469831849654, + "grad_norm": 0.7875071720493665, + "learning_rate": 4.7236076207382765e-06, + "loss": 0.4171, + "step": 3751 + }, + { + "epoch": 0.9277942631058358, + "grad_norm": 0.7695819617858256, + "learning_rate": 4.7234590490969935e-06, + "loss": 0.4251, + "step": 3752 + }, + { + "epoch": 0.9280415430267063, + "grad_norm": 0.7860254683196816, + "learning_rate": 4.723310439872729e-06, + "loss": 0.4174, + "step": 3753 + }, + { + "epoch": 0.9282888229475766, + "grad_norm": 0.799371539348264, + "learning_rate": 4.723161793067995e-06, + "loss": 0.4048, + "step": 3754 + }, + { + "epoch": 0.9285361028684471, + "grad_norm": 0.766005328064519, + "learning_rate": 4.723013108685306e-06, + "loss": 0.3992, + "step": 3755 + }, + { + "epoch": 0.9287833827893175, + "grad_norm": 0.8184769776996572, + "learning_rate": 4.722864386727171e-06, + "loss": 0.4098, + "step": 3756 + }, + { + "epoch": 0.929030662710188, + "grad_norm": 0.767670477728639, + "learning_rate": 4.722715627196109e-06, + "loss": 0.376, + "step": 3757 + }, + { + "epoch": 0.9292779426310583, + "grad_norm": 0.7691881581821964, + "learning_rate": 4.72256683009463e-06, + "loss": 0.4209, + "step": 3758 + }, + { + "epoch": 0.9295252225519288, + "grad_norm": 0.779597647151623, + "learning_rate": 4.722417995425252e-06, + "loss": 0.3974, + "step": 3759 + }, + { + "epoch": 0.9297725024727992, + "grad_norm": 0.7958084407546698, + "learning_rate": 4.72226912319049e-06, + "loss": 0.3913, + "step": 3760 + }, + { + "epoch": 0.9300197823936697, + "grad_norm": 0.7578575157603611, + "learning_rate": 4.722120213392859e-06, + "loss": 0.4177, + "step": 3761 + }, + { + "epoch": 0.93026706231454, + "grad_norm": 0.7738447839180078, + "learning_rate": 4.721971266034878e-06, + "loss": 0.4515, + "step": 3762 + }, + { + "epoch": 0.9305143422354105, + "grad_norm": 0.7907234951401526, + "learning_rate": 4.721822281119064e-06, + "loss": 0.4325, + "step": 3763 + }, + { + "epoch": 0.9307616221562809, + "grad_norm": 0.7756955998159536, + "learning_rate": 4.721673258647934e-06, + "loss": 0.4332, + "step": 3764 + }, + { + "epoch": 0.9310089020771514, + "grad_norm": 0.782257930528881, + "learning_rate": 4.721524198624009e-06, + "loss": 0.4034, + "step": 3765 + }, + { + "epoch": 0.9312561819980217, + "grad_norm": 0.7801610811208322, + "learning_rate": 4.721375101049807e-06, + "loss": 0.3815, + "step": 3766 + }, + { + "epoch": 0.9315034619188922, + "grad_norm": 0.8097011083585558, + "learning_rate": 4.721225965927848e-06, + "loss": 0.4462, + "step": 3767 + }, + { + "epoch": 0.9317507418397626, + "grad_norm": 0.7898701624455864, + "learning_rate": 4.721076793260655e-06, + "loss": 0.4028, + "step": 3768 + }, + { + "epoch": 0.9319980217606331, + "grad_norm": 0.7551579097684481, + "learning_rate": 4.720927583050747e-06, + "loss": 0.4243, + "step": 3769 + }, + { + "epoch": 0.9322453016815034, + "grad_norm": 0.7764822151923534, + "learning_rate": 4.720778335300647e-06, + "loss": 0.4082, + "step": 3770 + }, + { + "epoch": 0.9324925816023739, + "grad_norm": 0.7862920635864419, + "learning_rate": 4.720629050012879e-06, + "loss": 0.3879, + "step": 3771 + }, + { + "epoch": 0.9327398615232443, + "grad_norm": 0.8223007795021665, + "learning_rate": 4.720479727189964e-06, + "loss": 0.4085, + "step": 3772 + }, + { + "epoch": 0.9329871414441148, + "grad_norm": 0.7707108563497688, + "learning_rate": 4.720330366834427e-06, + "loss": 0.4027, + "step": 3773 + }, + { + "epoch": 0.9332344213649851, + "grad_norm": 0.8109320443845112, + "learning_rate": 4.7201809689487935e-06, + "loss": 0.3899, + "step": 3774 + }, + { + "epoch": 0.9334817012858556, + "grad_norm": 0.7964599002381397, + "learning_rate": 4.720031533535589e-06, + "loss": 0.4535, + "step": 3775 + }, + { + "epoch": 0.933728981206726, + "grad_norm": 0.7915463325437578, + "learning_rate": 4.719882060597336e-06, + "loss": 0.4162, + "step": 3776 + }, + { + "epoch": 0.9339762611275965, + "grad_norm": 0.7698383537920458, + "learning_rate": 4.719732550136565e-06, + "loss": 0.4373, + "step": 3777 + }, + { + "epoch": 0.9342235410484668, + "grad_norm": 0.7705957905234673, + "learning_rate": 4.719583002155801e-06, + "loss": 0.4066, + "step": 3778 + }, + { + "epoch": 0.9344708209693373, + "grad_norm": 0.7787562181312311, + "learning_rate": 4.719433416657573e-06, + "loss": 0.4164, + "step": 3779 + }, + { + "epoch": 0.9347181008902077, + "grad_norm": 0.7859748027749637, + "learning_rate": 4.719283793644409e-06, + "loss": 0.4083, + "step": 3780 + }, + { + "epoch": 0.9349653808110782, + "grad_norm": 0.8513132306964396, + "learning_rate": 4.719134133118838e-06, + "loss": 0.3746, + "step": 3781 + }, + { + "epoch": 0.9352126607319485, + "grad_norm": 0.8106609584332799, + "learning_rate": 4.718984435083389e-06, + "loss": 0.411, + "step": 3782 + }, + { + "epoch": 0.935459940652819, + "grad_norm": 0.7840411155887811, + "learning_rate": 4.718834699540593e-06, + "loss": 0.3892, + "step": 3783 + }, + { + "epoch": 0.9357072205736894, + "grad_norm": 0.7987354153840486, + "learning_rate": 4.718684926492982e-06, + "loss": 0.4125, + "step": 3784 + }, + { + "epoch": 0.9359545004945599, + "grad_norm": 0.7769870805002546, + "learning_rate": 4.718535115943085e-06, + "loss": 0.4186, + "step": 3785 + }, + { + "epoch": 0.9362017804154302, + "grad_norm": 0.7953525655481078, + "learning_rate": 4.718385267893437e-06, + "loss": 0.4571, + "step": 3786 + }, + { + "epoch": 0.9364490603363007, + "grad_norm": 0.8257447321469814, + "learning_rate": 4.718235382346569e-06, + "loss": 0.4258, + "step": 3787 + }, + { + "epoch": 0.9366963402571711, + "grad_norm": 0.7979742103452806, + "learning_rate": 4.718085459305015e-06, + "loss": 0.4325, + "step": 3788 + }, + { + "epoch": 0.9369436201780416, + "grad_norm": 0.7755001210635785, + "learning_rate": 4.717935498771311e-06, + "loss": 0.3987, + "step": 3789 + }, + { + "epoch": 0.9371909000989119, + "grad_norm": 0.7855397941016596, + "learning_rate": 4.717785500747988e-06, + "loss": 0.4179, + "step": 3790 + }, + { + "epoch": 0.9374381800197824, + "grad_norm": 0.7659770262453232, + "learning_rate": 4.717635465237584e-06, + "loss": 0.3917, + "step": 3791 + }, + { + "epoch": 0.9376854599406528, + "grad_norm": 0.7815948203349444, + "learning_rate": 4.717485392242636e-06, + "loss": 0.4193, + "step": 3792 + }, + { + "epoch": 0.9379327398615233, + "grad_norm": 0.7930513949333287, + "learning_rate": 4.717335281765677e-06, + "loss": 0.3904, + "step": 3793 + }, + { + "epoch": 0.9381800197823936, + "grad_norm": 0.8129070197480318, + "learning_rate": 4.717185133809248e-06, + "loss": 0.4164, + "step": 3794 + }, + { + "epoch": 0.9384272997032641, + "grad_norm": 0.7668091089072321, + "learning_rate": 4.7170349483758845e-06, + "loss": 0.4051, + "step": 3795 + }, + { + "epoch": 0.9386745796241345, + "grad_norm": 0.7910228910052575, + "learning_rate": 4.716884725468127e-06, + "loss": 0.4083, + "step": 3796 + }, + { + "epoch": 0.938921859545005, + "grad_norm": 0.7732700837398885, + "learning_rate": 4.716734465088513e-06, + "loss": 0.4034, + "step": 3797 + }, + { + "epoch": 0.9391691394658753, + "grad_norm": 0.774749652475244, + "learning_rate": 4.716584167239584e-06, + "loss": 0.4185, + "step": 3798 + }, + { + "epoch": 0.9394164193867458, + "grad_norm": 0.8554633807185527, + "learning_rate": 4.716433831923879e-06, + "loss": 0.391, + "step": 3799 + }, + { + "epoch": 0.9396636993076162, + "grad_norm": 0.8449891799779463, + "learning_rate": 4.716283459143939e-06, + "loss": 0.3926, + "step": 3800 + }, + { + "epoch": 0.9399109792284867, + "grad_norm": 0.8158798360289666, + "learning_rate": 4.716133048902307e-06, + "loss": 0.4356, + "step": 3801 + }, + { + "epoch": 0.940158259149357, + "grad_norm": 0.7615640183067676, + "learning_rate": 4.715982601201525e-06, + "loss": 0.4172, + "step": 3802 + }, + { + "epoch": 0.9404055390702275, + "grad_norm": 0.7871218583450545, + "learning_rate": 4.715832116044135e-06, + "loss": 0.3677, + "step": 3803 + }, + { + "epoch": 0.9406528189910979, + "grad_norm": 0.7582932974852993, + "learning_rate": 4.715681593432683e-06, + "loss": 0.4602, + "step": 3804 + }, + { + "epoch": 0.9409000989119684, + "grad_norm": 0.7746130525975381, + "learning_rate": 4.71553103336971e-06, + "loss": 0.3856, + "step": 3805 + }, + { + "epoch": 0.9411473788328387, + "grad_norm": 0.7636577412053083, + "learning_rate": 4.715380435857763e-06, + "loss": 0.4131, + "step": 3806 + }, + { + "epoch": 0.9413946587537092, + "grad_norm": 0.7811739159391169, + "learning_rate": 4.715229800899388e-06, + "loss": 0.3884, + "step": 3807 + }, + { + "epoch": 0.9416419386745796, + "grad_norm": 0.8188404252385233, + "learning_rate": 4.715079128497129e-06, + "loss": 0.3946, + "step": 3808 + }, + { + "epoch": 0.94188921859545, + "grad_norm": 0.7875551448948243, + "learning_rate": 4.714928418653535e-06, + "loss": 0.3932, + "step": 3809 + }, + { + "epoch": 0.9421364985163204, + "grad_norm": 0.8046139238238394, + "learning_rate": 4.714777671371152e-06, + "loss": 0.4221, + "step": 3810 + }, + { + "epoch": 0.9423837784371909, + "grad_norm": 0.8110204164182576, + "learning_rate": 4.71462688665253e-06, + "loss": 0.4177, + "step": 3811 + }, + { + "epoch": 0.9426310583580614, + "grad_norm": 0.7878491388246204, + "learning_rate": 4.714476064500215e-06, + "loss": 0.39, + "step": 3812 + }, + { + "epoch": 0.9428783382789317, + "grad_norm": 0.7495863308390222, + "learning_rate": 4.714325204916758e-06, + "loss": 0.421, + "step": 3813 + }, + { + "epoch": 0.9431256181998022, + "grad_norm": 0.77849672486737, + "learning_rate": 4.714174307904709e-06, + "loss": 0.4247, + "step": 3814 + }, + { + "epoch": 0.9433728981206726, + "grad_norm": 0.802560196408335, + "learning_rate": 4.714023373466618e-06, + "loss": 0.408, + "step": 3815 + }, + { + "epoch": 0.9436201780415431, + "grad_norm": 0.7786351448112381, + "learning_rate": 4.713872401605036e-06, + "loss": 0.3885, + "step": 3816 + }, + { + "epoch": 0.9438674579624134, + "grad_norm": 0.8164733266440941, + "learning_rate": 4.713721392322515e-06, + "loss": 0.4153, + "step": 3817 + }, + { + "epoch": 0.9441147378832839, + "grad_norm": 0.7955726912893689, + "learning_rate": 4.713570345621609e-06, + "loss": 0.4145, + "step": 3818 + }, + { + "epoch": 0.9443620178041543, + "grad_norm": 0.7689706793710702, + "learning_rate": 4.71341926150487e-06, + "loss": 0.3869, + "step": 3819 + }, + { + "epoch": 0.9446092977250248, + "grad_norm": 0.804572440311293, + "learning_rate": 4.713268139974851e-06, + "loss": 0.4042, + "step": 3820 + }, + { + "epoch": 0.9448565776458951, + "grad_norm": 0.8035662350608253, + "learning_rate": 4.713116981034107e-06, + "loss": 0.4172, + "step": 3821 + }, + { + "epoch": 0.9451038575667656, + "grad_norm": 0.7746997086586535, + "learning_rate": 4.712965784685194e-06, + "loss": 0.3954, + "step": 3822 + }, + { + "epoch": 0.945351137487636, + "grad_norm": 0.7831748839896366, + "learning_rate": 4.712814550930667e-06, + "loss": 0.4129, + "step": 3823 + }, + { + "epoch": 0.9455984174085065, + "grad_norm": 0.7809792520188084, + "learning_rate": 4.712663279773081e-06, + "loss": 0.4347, + "step": 3824 + }, + { + "epoch": 0.9458456973293768, + "grad_norm": 0.7673768265462751, + "learning_rate": 4.7125119712149944e-06, + "loss": 0.4052, + "step": 3825 + }, + { + "epoch": 0.9460929772502473, + "grad_norm": 0.7798452737328526, + "learning_rate": 4.712360625258965e-06, + "loss": 0.4024, + "step": 3826 + }, + { + "epoch": 0.9463402571711177, + "grad_norm": 0.7778031688861111, + "learning_rate": 4.7122092419075496e-06, + "loss": 0.3775, + "step": 3827 + }, + { + "epoch": 0.9465875370919882, + "grad_norm": 0.7806188818311592, + "learning_rate": 4.712057821163308e-06, + "loss": 0.4155, + "step": 3828 + }, + { + "epoch": 0.9468348170128585, + "grad_norm": 0.7931029006594754, + "learning_rate": 4.7119063630288e-06, + "loss": 0.4178, + "step": 3829 + }, + { + "epoch": 0.947082096933729, + "grad_norm": 0.7694503362483404, + "learning_rate": 4.711754867506585e-06, + "loss": 0.4008, + "step": 3830 + }, + { + "epoch": 0.9473293768545994, + "grad_norm": 0.8129310646415457, + "learning_rate": 4.711603334599224e-06, + "loss": 0.3733, + "step": 3831 + }, + { + "epoch": 0.9475766567754699, + "grad_norm": 0.7761005503274951, + "learning_rate": 4.711451764309278e-06, + "loss": 0.4384, + "step": 3832 + }, + { + "epoch": 0.9478239366963402, + "grad_norm": 0.7912406175514075, + "learning_rate": 4.711300156639309e-06, + "loss": 0.403, + "step": 3833 + }, + { + "epoch": 0.9480712166172107, + "grad_norm": 0.7841375432382586, + "learning_rate": 4.7111485115918795e-06, + "loss": 0.4205, + "step": 3834 + }, + { + "epoch": 0.9483184965380811, + "grad_norm": 0.7876792036708451, + "learning_rate": 4.710996829169554e-06, + "loss": 0.3802, + "step": 3835 + }, + { + "epoch": 0.9485657764589516, + "grad_norm": 0.7883706920212202, + "learning_rate": 4.710845109374895e-06, + "loss": 0.3764, + "step": 3836 + }, + { + "epoch": 0.9488130563798219, + "grad_norm": 0.7713104412410063, + "learning_rate": 4.710693352210468e-06, + "loss": 0.4153, + "step": 3837 + }, + { + "epoch": 0.9490603363006924, + "grad_norm": 0.774838074480863, + "learning_rate": 4.7105415576788375e-06, + "loss": 0.4099, + "step": 3838 + }, + { + "epoch": 0.9493076162215628, + "grad_norm": 0.7482537027491292, + "learning_rate": 4.710389725782568e-06, + "loss": 0.4218, + "step": 3839 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 0.7904541367962616, + "learning_rate": 4.710237856524229e-06, + "loss": 0.4037, + "step": 3840 + }, + { + "epoch": 0.9498021760633036, + "grad_norm": 0.7702306085739898, + "learning_rate": 4.710085949906385e-06, + "loss": 0.4299, + "step": 3841 + }, + { + "epoch": 0.9500494559841741, + "grad_norm": 0.7976548908474161, + "learning_rate": 4.709934005931605e-06, + "loss": 0.3735, + "step": 3842 + }, + { + "epoch": 0.9502967359050445, + "grad_norm": 0.8362496119917581, + "learning_rate": 4.709782024602456e-06, + "loss": 0.4022, + "step": 3843 + }, + { + "epoch": 0.950544015825915, + "grad_norm": 0.7829890013758223, + "learning_rate": 4.709630005921508e-06, + "loss": 0.4207, + "step": 3844 + }, + { + "epoch": 0.9507912957467853, + "grad_norm": 0.7769273804916441, + "learning_rate": 4.709477949891331e-06, + "loss": 0.4209, + "step": 3845 + }, + { + "epoch": 0.9510385756676558, + "grad_norm": 0.7589464323885547, + "learning_rate": 4.709325856514494e-06, + "loss": 0.4101, + "step": 3846 + }, + { + "epoch": 0.9512858555885262, + "grad_norm": 0.8020651971148175, + "learning_rate": 4.709173725793567e-06, + "loss": 0.4404, + "step": 3847 + }, + { + "epoch": 0.9515331355093967, + "grad_norm": 0.7785190052402172, + "learning_rate": 4.709021557731125e-06, + "loss": 0.3814, + "step": 3848 + }, + { + "epoch": 0.951780415430267, + "grad_norm": 0.7755037590030682, + "learning_rate": 4.708869352329736e-06, + "loss": 0.3985, + "step": 3849 + }, + { + "epoch": 0.9520276953511375, + "grad_norm": 0.7900519608011191, + "learning_rate": 4.708717109591976e-06, + "loss": 0.4362, + "step": 3850 + }, + { + "epoch": 0.9522749752720079, + "grad_norm": 0.8073740775342424, + "learning_rate": 4.708564829520416e-06, + "loss": 0.4269, + "step": 3851 + }, + { + "epoch": 0.9525222551928784, + "grad_norm": 0.8233059438001666, + "learning_rate": 4.708412512117631e-06, + "loss": 0.4044, + "step": 3852 + }, + { + "epoch": 0.9527695351137487, + "grad_norm": 0.8118457866395735, + "learning_rate": 4.708260157386196e-06, + "loss": 0.4115, + "step": 3853 + }, + { + "epoch": 0.9530168150346192, + "grad_norm": 0.7583537392743612, + "learning_rate": 4.708107765328685e-06, + "loss": 0.416, + "step": 3854 + }, + { + "epoch": 0.9532640949554896, + "grad_norm": 0.8297248524334135, + "learning_rate": 4.707955335947675e-06, + "loss": 0.3898, + "step": 3855 + }, + { + "epoch": 0.9535113748763601, + "grad_norm": 0.7870425199589566, + "learning_rate": 4.707802869245742e-06, + "loss": 0.4179, + "step": 3856 + }, + { + "epoch": 0.9537586547972304, + "grad_norm": 0.8316754519208652, + "learning_rate": 4.707650365225463e-06, + "loss": 0.4201, + "step": 3857 + }, + { + "epoch": 0.9540059347181009, + "grad_norm": 0.7792225618981867, + "learning_rate": 4.7074978238894164e-06, + "loss": 0.41, + "step": 3858 + }, + { + "epoch": 0.9542532146389713, + "grad_norm": 0.7799396685116958, + "learning_rate": 4.70734524524018e-06, + "loss": 0.4066, + "step": 3859 + }, + { + "epoch": 0.9545004945598418, + "grad_norm": 0.8053702110699514, + "learning_rate": 4.707192629280334e-06, + "loss": 0.3941, + "step": 3860 + }, + { + "epoch": 0.9547477744807121, + "grad_norm": 0.7618664972873861, + "learning_rate": 4.707039976012457e-06, + "loss": 0.4125, + "step": 3861 + }, + { + "epoch": 0.9549950544015826, + "grad_norm": 0.7931884558229111, + "learning_rate": 4.706887285439128e-06, + "loss": 0.3761, + "step": 3862 + }, + { + "epoch": 0.955242334322453, + "grad_norm": 0.7728504886433268, + "learning_rate": 4.70673455756293e-06, + "loss": 0.3919, + "step": 3863 + }, + { + "epoch": 0.9554896142433235, + "grad_norm": 0.7955438947171283, + "learning_rate": 4.7065817923864435e-06, + "loss": 0.3957, + "step": 3864 + }, + { + "epoch": 0.9557368941641938, + "grad_norm": 0.7809682843371737, + "learning_rate": 4.7064289899122515e-06, + "loss": 0.3659, + "step": 3865 + }, + { + "epoch": 0.9559841740850643, + "grad_norm": 0.801135077053223, + "learning_rate": 4.706276150142936e-06, + "loss": 0.4134, + "step": 3866 + }, + { + "epoch": 0.9562314540059347, + "grad_norm": 0.769903315991128, + "learning_rate": 4.706123273081081e-06, + "loss": 0.4145, + "step": 3867 + }, + { + "epoch": 0.9564787339268052, + "grad_norm": 0.7714718747779309, + "learning_rate": 4.7059703587292706e-06, + "loss": 0.4187, + "step": 3868 + }, + { + "epoch": 0.9567260138476755, + "grad_norm": 0.7771131713870824, + "learning_rate": 4.705817407090089e-06, + "loss": 0.4276, + "step": 3869 + }, + { + "epoch": 0.956973293768546, + "grad_norm": 0.7812494248399661, + "learning_rate": 4.705664418166122e-06, + "loss": 0.4195, + "step": 3870 + }, + { + "epoch": 0.9572205736894164, + "grad_norm": 0.7870184851971324, + "learning_rate": 4.705511391959955e-06, + "loss": 0.3779, + "step": 3871 + }, + { + "epoch": 0.9574678536102869, + "grad_norm": 0.7744463770512644, + "learning_rate": 4.7053583284741745e-06, + "loss": 0.4138, + "step": 3872 + }, + { + "epoch": 0.9577151335311572, + "grad_norm": 0.7958653995952897, + "learning_rate": 4.7052052277113695e-06, + "loss": 0.3956, + "step": 3873 + }, + { + "epoch": 0.9579624134520277, + "grad_norm": 0.7816603192193171, + "learning_rate": 4.705052089674125e-06, + "loss": 0.4006, + "step": 3874 + }, + { + "epoch": 0.9582096933728981, + "grad_norm": 0.7918038182911267, + "learning_rate": 4.704898914365032e-06, + "loss": 0.4128, + "step": 3875 + }, + { + "epoch": 0.9584569732937686, + "grad_norm": 0.8150385320639588, + "learning_rate": 4.704745701786678e-06, + "loss": 0.4069, + "step": 3876 + }, + { + "epoch": 0.9587042532146389, + "grad_norm": 0.7920833803794489, + "learning_rate": 4.704592451941654e-06, + "loss": 0.4187, + "step": 3877 + }, + { + "epoch": 0.9589515331355094, + "grad_norm": 0.7923085557647959, + "learning_rate": 4.704439164832549e-06, + "loss": 0.4362, + "step": 3878 + }, + { + "epoch": 0.9591988130563798, + "grad_norm": 0.7660991000196078, + "learning_rate": 4.704285840461955e-06, + "loss": 0.4174, + "step": 3879 + }, + { + "epoch": 0.9594460929772503, + "grad_norm": 0.8128783497936076, + "learning_rate": 4.704132478832464e-06, + "loss": 0.3888, + "step": 3880 + }, + { + "epoch": 0.9596933728981206, + "grad_norm": 0.7698482384519877, + "learning_rate": 4.703979079946667e-06, + "loss": 0.4169, + "step": 3881 + }, + { + "epoch": 0.9599406528189911, + "grad_norm": 0.7892348762474982, + "learning_rate": 4.703825643807157e-06, + "loss": 0.411, + "step": 3882 + }, + { + "epoch": 0.9601879327398615, + "grad_norm": 0.7861064933964294, + "learning_rate": 4.703672170416529e-06, + "loss": 0.4015, + "step": 3883 + }, + { + "epoch": 0.960435212660732, + "grad_norm": 0.8174752026252636, + "learning_rate": 4.703518659777376e-06, + "loss": 0.3917, + "step": 3884 + }, + { + "epoch": 0.9606824925816023, + "grad_norm": 0.7732844681830707, + "learning_rate": 4.703365111892293e-06, + "loss": 0.3775, + "step": 3885 + }, + { + "epoch": 0.9609297725024728, + "grad_norm": 0.805848268421217, + "learning_rate": 4.703211526763875e-06, + "loss": 0.3776, + "step": 3886 + }, + { + "epoch": 0.9611770524233432, + "grad_norm": 0.7655914125589361, + "learning_rate": 4.703057904394719e-06, + "loss": 0.3853, + "step": 3887 + }, + { + "epoch": 0.9614243323442137, + "grad_norm": 0.7744602016571512, + "learning_rate": 4.7029042447874205e-06, + "loss": 0.4206, + "step": 3888 + }, + { + "epoch": 0.9616716122650841, + "grad_norm": 0.7876431280098052, + "learning_rate": 4.702750547944577e-06, + "loss": 0.434, + "step": 3889 + }, + { + "epoch": 0.9619188921859545, + "grad_norm": 0.7701420784704784, + "learning_rate": 4.702596813868787e-06, + "loss": 0.4142, + "step": 3890 + }, + { + "epoch": 0.962166172106825, + "grad_norm": 0.7876056794108254, + "learning_rate": 4.70244304256265e-06, + "loss": 0.392, + "step": 3891 + }, + { + "epoch": 0.9624134520276953, + "grad_norm": 0.7944473649588513, + "learning_rate": 4.702289234028763e-06, + "loss": 0.411, + "step": 3892 + }, + { + "epoch": 0.9626607319485658, + "grad_norm": 0.7746154516350491, + "learning_rate": 4.702135388269727e-06, + "loss": 0.3754, + "step": 3893 + }, + { + "epoch": 0.9629080118694362, + "grad_norm": 0.7747445417607648, + "learning_rate": 4.701981505288142e-06, + "loss": 0.4035, + "step": 3894 + }, + { + "epoch": 0.9631552917903067, + "grad_norm": 0.7628514933888112, + "learning_rate": 4.70182758508661e-06, + "loss": 0.4276, + "step": 3895 + }, + { + "epoch": 0.963402571711177, + "grad_norm": 0.7697009449996881, + "learning_rate": 4.701673627667732e-06, + "loss": 0.4199, + "step": 3896 + }, + { + "epoch": 0.9636498516320475, + "grad_norm": 0.8210442405395625, + "learning_rate": 4.70151963303411e-06, + "loss": 0.429, + "step": 3897 + }, + { + "epoch": 0.9638971315529179, + "grad_norm": 0.7867985724002428, + "learning_rate": 4.7013656011883476e-06, + "loss": 0.433, + "step": 3898 + }, + { + "epoch": 0.9641444114737884, + "grad_norm": 0.7943173211816381, + "learning_rate": 4.7012115321330484e-06, + "loss": 0.4057, + "step": 3899 + }, + { + "epoch": 0.9643916913946587, + "grad_norm": 0.8182944927299638, + "learning_rate": 4.701057425870816e-06, + "loss": 0.3819, + "step": 3900 + }, + { + "epoch": 0.9646389713155292, + "grad_norm": 0.7479471012595492, + "learning_rate": 4.700903282404256e-06, + "loss": 0.4035, + "step": 3901 + }, + { + "epoch": 0.9648862512363996, + "grad_norm": 0.7812074899937719, + "learning_rate": 4.700749101735973e-06, + "loss": 0.4126, + "step": 3902 + }, + { + "epoch": 0.9651335311572701, + "grad_norm": 0.8278036061767997, + "learning_rate": 4.7005948838685735e-06, + "loss": 0.3914, + "step": 3903 + }, + { + "epoch": 0.9653808110781404, + "grad_norm": 0.8387896149443221, + "learning_rate": 4.700440628804665e-06, + "loss": 0.3896, + "step": 3904 + }, + { + "epoch": 0.9656280909990109, + "grad_norm": 0.7947700311503184, + "learning_rate": 4.700286336546854e-06, + "loss": 0.4004, + "step": 3905 + }, + { + "epoch": 0.9658753709198813, + "grad_norm": 0.7794144464386743, + "learning_rate": 4.700132007097748e-06, + "loss": 0.3994, + "step": 3906 + }, + { + "epoch": 0.9661226508407518, + "grad_norm": 0.8099820963076695, + "learning_rate": 4.699977640459958e-06, + "loss": 0.4252, + "step": 3907 + }, + { + "epoch": 0.9663699307616221, + "grad_norm": 0.7791668503349546, + "learning_rate": 4.699823236636091e-06, + "loss": 0.4112, + "step": 3908 + }, + { + "epoch": 0.9666172106824926, + "grad_norm": 0.7799461943658222, + "learning_rate": 4.6996687956287564e-06, + "loss": 0.3938, + "step": 3909 + }, + { + "epoch": 0.966864490603363, + "grad_norm": 0.7954451636878649, + "learning_rate": 4.699514317440567e-06, + "loss": 0.4116, + "step": 3910 + }, + { + "epoch": 0.9671117705242335, + "grad_norm": 0.7551128256490328, + "learning_rate": 4.699359802074131e-06, + "loss": 0.4095, + "step": 3911 + }, + { + "epoch": 0.9673590504451038, + "grad_norm": 0.7816752155120708, + "learning_rate": 4.6992052495320635e-06, + "loss": 0.3919, + "step": 3912 + }, + { + "epoch": 0.9676063303659743, + "grad_norm": 0.7748996073968227, + "learning_rate": 4.699050659816975e-06, + "loss": 0.4034, + "step": 3913 + }, + { + "epoch": 0.9678536102868447, + "grad_norm": 0.7713815417049966, + "learning_rate": 4.698896032931478e-06, + "loss": 0.3945, + "step": 3914 + }, + { + "epoch": 0.9681008902077152, + "grad_norm": 0.7706764167647603, + "learning_rate": 4.698741368878187e-06, + "loss": 0.4321, + "step": 3915 + }, + { + "epoch": 0.9683481701285855, + "grad_norm": 0.8023547706545407, + "learning_rate": 4.698586667659717e-06, + "loss": 0.4382, + "step": 3916 + }, + { + "epoch": 0.968595450049456, + "grad_norm": 0.8233471376751998, + "learning_rate": 4.698431929278681e-06, + "loss": 0.397, + "step": 3917 + }, + { + "epoch": 0.9688427299703264, + "grad_norm": 0.7778633351955079, + "learning_rate": 4.698277153737697e-06, + "loss": 0.4199, + "step": 3918 + }, + { + "epoch": 0.9690900098911969, + "grad_norm": 0.7703112244491745, + "learning_rate": 4.698122341039379e-06, + "loss": 0.4158, + "step": 3919 + }, + { + "epoch": 0.9693372898120672, + "grad_norm": 0.7727244260981386, + "learning_rate": 4.697967491186345e-06, + "loss": 0.4101, + "step": 3920 + }, + { + "epoch": 0.9695845697329377, + "grad_norm": 0.8320384841071571, + "learning_rate": 4.697812604181211e-06, + "loss": 0.395, + "step": 3921 + }, + { + "epoch": 0.9698318496538081, + "grad_norm": 0.7941589609812207, + "learning_rate": 4.697657680026597e-06, + "loss": 0.3636, + "step": 3922 + }, + { + "epoch": 0.9700791295746786, + "grad_norm": 0.8215449942248417, + "learning_rate": 4.69750271872512e-06, + "loss": 0.383, + "step": 3923 + }, + { + "epoch": 0.9703264094955489, + "grad_norm": 0.7842158778990416, + "learning_rate": 4.697347720279401e-06, + "loss": 0.3928, + "step": 3924 + }, + { + "epoch": 0.9705736894164194, + "grad_norm": 0.7975302553095485, + "learning_rate": 4.697192684692058e-06, + "loss": 0.4068, + "step": 3925 + }, + { + "epoch": 0.9708209693372898, + "grad_norm": 0.7778323064622887, + "learning_rate": 4.697037611965713e-06, + "loss": 0.4463, + "step": 3926 + }, + { + "epoch": 0.9710682492581603, + "grad_norm": 0.7775846948755031, + "learning_rate": 4.696882502102987e-06, + "loss": 0.4284, + "step": 3927 + }, + { + "epoch": 0.9713155291790306, + "grad_norm": 0.7746054597343937, + "learning_rate": 4.6967273551065005e-06, + "loss": 0.395, + "step": 3928 + }, + { + "epoch": 0.9715628090999011, + "grad_norm": 0.8127997502657887, + "learning_rate": 4.696572170978877e-06, + "loss": 0.3914, + "step": 3929 + }, + { + "epoch": 0.9718100890207715, + "grad_norm": 0.777611998632522, + "learning_rate": 4.69641694972274e-06, + "loss": 0.4205, + "step": 3930 + }, + { + "epoch": 0.972057368941642, + "grad_norm": 0.7780004765147993, + "learning_rate": 4.6962616913407125e-06, + "loss": 0.433, + "step": 3931 + }, + { + "epoch": 0.9723046488625123, + "grad_norm": 0.8241222014341494, + "learning_rate": 4.6961063958354195e-06, + "loss": 0.3979, + "step": 3932 + }, + { + "epoch": 0.9725519287833828, + "grad_norm": 0.8089490443001208, + "learning_rate": 4.695951063209485e-06, + "loss": 0.3662, + "step": 3933 + }, + { + "epoch": 0.9727992087042532, + "grad_norm": 0.7923141861732071, + "learning_rate": 4.695795693465536e-06, + "loss": 0.3904, + "step": 3934 + }, + { + "epoch": 0.9730464886251237, + "grad_norm": 0.829573517979138, + "learning_rate": 4.695640286606196e-06, + "loss": 0.413, + "step": 3935 + }, + { + "epoch": 0.973293768545994, + "grad_norm": 0.839797270153677, + "learning_rate": 4.695484842634094e-06, + "loss": 0.3953, + "step": 3936 + }, + { + "epoch": 0.9735410484668645, + "grad_norm": 0.7857025307651592, + "learning_rate": 4.695329361551858e-06, + "loss": 0.3986, + "step": 3937 + }, + { + "epoch": 0.9737883283877349, + "grad_norm": 0.7638491096993291, + "learning_rate": 4.695173843362115e-06, + "loss": 0.3932, + "step": 3938 + }, + { + "epoch": 0.9740356083086054, + "grad_norm": 0.7812683265166083, + "learning_rate": 4.6950182880674935e-06, + "loss": 0.4217, + "step": 3939 + }, + { + "epoch": 0.9742828882294757, + "grad_norm": 0.7603154077113041, + "learning_rate": 4.694862695670623e-06, + "loss": 0.4107, + "step": 3940 + }, + { + "epoch": 0.9745301681503462, + "grad_norm": 0.7539347980917882, + "learning_rate": 4.694707066174133e-06, + "loss": 0.4051, + "step": 3941 + }, + { + "epoch": 0.9747774480712166, + "grad_norm": 0.7863706446058237, + "learning_rate": 4.694551399580656e-06, + "loss": 0.4218, + "step": 3942 + }, + { + "epoch": 0.9750247279920871, + "grad_norm": 0.7665569429715551, + "learning_rate": 4.6943956958928215e-06, + "loss": 0.4019, + "step": 3943 + }, + { + "epoch": 0.9752720079129574, + "grad_norm": 0.799832166832461, + "learning_rate": 4.694239955113262e-06, + "loss": 0.3906, + "step": 3944 + }, + { + "epoch": 0.9755192878338279, + "grad_norm": 0.8274704820268617, + "learning_rate": 4.69408417724461e-06, + "loss": 0.3743, + "step": 3945 + }, + { + "epoch": 0.9757665677546983, + "grad_norm": 0.7752699483338866, + "learning_rate": 4.6939283622894975e-06, + "loss": 0.4137, + "step": 3946 + }, + { + "epoch": 0.9760138476755688, + "grad_norm": 0.7891688093341738, + "learning_rate": 4.693772510250559e-06, + "loss": 0.4211, + "step": 3947 + }, + { + "epoch": 0.9762611275964391, + "grad_norm": 0.7871416681868997, + "learning_rate": 4.69361662113043e-06, + "loss": 0.4214, + "step": 3948 + }, + { + "epoch": 0.9765084075173096, + "grad_norm": 0.7961084106239589, + "learning_rate": 4.693460694931744e-06, + "loss": 0.3997, + "step": 3949 + }, + { + "epoch": 0.97675568743818, + "grad_norm": 0.7892962345788587, + "learning_rate": 4.693304731657138e-06, + "loss": 0.405, + "step": 3950 + }, + { + "epoch": 0.9770029673590505, + "grad_norm": 0.7894503245971667, + "learning_rate": 4.6931487313092465e-06, + "loss": 0.4026, + "step": 3951 + }, + { + "epoch": 0.9772502472799208, + "grad_norm": 0.7703856475017746, + "learning_rate": 4.692992693890706e-06, + "loss": 0.4249, + "step": 3952 + }, + { + "epoch": 0.9774975272007913, + "grad_norm": 0.8133799239732469, + "learning_rate": 4.692836619404156e-06, + "loss": 0.4114, + "step": 3953 + }, + { + "epoch": 0.9777448071216617, + "grad_norm": 0.8152538089456904, + "learning_rate": 4.692680507852235e-06, + "loss": 0.3853, + "step": 3954 + }, + { + "epoch": 0.9779920870425322, + "grad_norm": 0.8184149190064318, + "learning_rate": 4.692524359237579e-06, + "loss": 0.3976, + "step": 3955 + }, + { + "epoch": 0.9782393669634025, + "grad_norm": 0.8254887145013703, + "learning_rate": 4.69236817356283e-06, + "loss": 0.3947, + "step": 3956 + }, + { + "epoch": 0.978486646884273, + "grad_norm": 0.8164624279733338, + "learning_rate": 4.692211950830626e-06, + "loss": 0.3985, + "step": 3957 + }, + { + "epoch": 0.9787339268051434, + "grad_norm": 0.7978082335627064, + "learning_rate": 4.6920556910436085e-06, + "loss": 0.4057, + "step": 3958 + }, + { + "epoch": 0.9789812067260139, + "grad_norm": 0.7763542013199648, + "learning_rate": 4.69189939420442e-06, + "loss": 0.4137, + "step": 3959 + }, + { + "epoch": 0.9792284866468842, + "grad_norm": 0.7984236860463884, + "learning_rate": 4.6917430603157e-06, + "loss": 0.41, + "step": 3960 + }, + { + "epoch": 0.9794757665677547, + "grad_norm": 0.7842647035839468, + "learning_rate": 4.691586689380092e-06, + "loss": 0.3884, + "step": 3961 + }, + { + "epoch": 0.9797230464886251, + "grad_norm": 0.7907516321710724, + "learning_rate": 4.69143028140024e-06, + "loss": 0.3949, + "step": 3962 + }, + { + "epoch": 0.9799703264094956, + "grad_norm": 0.7992643147336413, + "learning_rate": 4.691273836378787e-06, + "loss": 0.4191, + "step": 3963 + }, + { + "epoch": 0.9802176063303659, + "grad_norm": 0.796090473738687, + "learning_rate": 4.691117354318377e-06, + "loss": 0.3897, + "step": 3964 + }, + { + "epoch": 0.9804648862512364, + "grad_norm": 0.7815628862657795, + "learning_rate": 4.690960835221655e-06, + "loss": 0.3789, + "step": 3965 + }, + { + "epoch": 0.9807121661721068, + "grad_norm": 0.7507676442911566, + "learning_rate": 4.690804279091268e-06, + "loss": 0.3793, + "step": 3966 + }, + { + "epoch": 0.9809594460929772, + "grad_norm": 0.7732006075013503, + "learning_rate": 4.690647685929861e-06, + "loss": 0.4298, + "step": 3967 + }, + { + "epoch": 0.9812067260138477, + "grad_norm": 0.7922497219388176, + "learning_rate": 4.69049105574008e-06, + "loss": 0.4311, + "step": 3968 + }, + { + "epoch": 0.9814540059347181, + "grad_norm": 0.762528197693143, + "learning_rate": 4.690334388524576e-06, + "loss": 0.4116, + "step": 3969 + }, + { + "epoch": 0.9817012858555886, + "grad_norm": 0.7784231738630654, + "learning_rate": 4.6901776842859926e-06, + "loss": 0.4128, + "step": 3970 + }, + { + "epoch": 0.981948565776459, + "grad_norm": 0.8111640731341906, + "learning_rate": 4.690020943026982e-06, + "loss": 0.3863, + "step": 3971 + }, + { + "epoch": 0.9821958456973294, + "grad_norm": 0.783142213384446, + "learning_rate": 4.689864164750192e-06, + "loss": 0.3814, + "step": 3972 + }, + { + "epoch": 0.9824431256181998, + "grad_norm": 0.7916818596859191, + "learning_rate": 4.689707349458273e-06, + "loss": 0.4082, + "step": 3973 + }, + { + "epoch": 0.9826904055390703, + "grad_norm": 0.7979637978189713, + "learning_rate": 4.689550497153876e-06, + "loss": 0.4196, + "step": 3974 + }, + { + "epoch": 0.9829376854599406, + "grad_norm": 0.7940260223007418, + "learning_rate": 4.689393607839652e-06, + "loss": 0.4083, + "step": 3975 + }, + { + "epoch": 0.9831849653808111, + "grad_norm": 0.7639072225409764, + "learning_rate": 4.6892366815182515e-06, + "loss": 0.3994, + "step": 3976 + }, + { + "epoch": 0.9834322453016815, + "grad_norm": 0.7817053106289357, + "learning_rate": 4.689079718192329e-06, + "loss": 0.3889, + "step": 3977 + }, + { + "epoch": 0.983679525222552, + "grad_norm": 0.7598430403880667, + "learning_rate": 4.688922717864537e-06, + "loss": 0.4018, + "step": 3978 + }, + { + "epoch": 0.9839268051434223, + "grad_norm": 0.8086789487803981, + "learning_rate": 4.6887656805375296e-06, + "loss": 0.3999, + "step": 3979 + }, + { + "epoch": 0.9841740850642928, + "grad_norm": 0.7725826354053251, + "learning_rate": 4.68860860621396e-06, + "loss": 0.3951, + "step": 3980 + }, + { + "epoch": 0.9844213649851632, + "grad_norm": 0.7875872055007198, + "learning_rate": 4.688451494896485e-06, + "loss": 0.4003, + "step": 3981 + }, + { + "epoch": 0.9846686449060337, + "grad_norm": 0.7822097745681463, + "learning_rate": 4.688294346587759e-06, + "loss": 0.417, + "step": 3982 + }, + { + "epoch": 0.984915924826904, + "grad_norm": 0.7809298685569199, + "learning_rate": 4.688137161290438e-06, + "loss": 0.3968, + "step": 3983 + }, + { + "epoch": 0.9851632047477745, + "grad_norm": 0.7923832949438695, + "learning_rate": 4.687979939007179e-06, + "loss": 0.4056, + "step": 3984 + }, + { + "epoch": 0.9854104846686449, + "grad_norm": 0.7708418159185525, + "learning_rate": 4.687822679740641e-06, + "loss": 0.4172, + "step": 3985 + }, + { + "epoch": 0.9856577645895154, + "grad_norm": 0.7589092584911677, + "learning_rate": 4.68766538349348e-06, + "loss": 0.398, + "step": 3986 + }, + { + "epoch": 0.9859050445103857, + "grad_norm": 0.7732339144056797, + "learning_rate": 4.687508050268357e-06, + "loss": 0.4171, + "step": 3987 + }, + { + "epoch": 0.9861523244312562, + "grad_norm": 0.7643439359678004, + "learning_rate": 4.6873506800679295e-06, + "loss": 0.4489, + "step": 3988 + }, + { + "epoch": 0.9863996043521266, + "grad_norm": 0.755783549108914, + "learning_rate": 4.687193272894859e-06, + "loss": 0.4125, + "step": 3989 + }, + { + "epoch": 0.9866468842729971, + "grad_norm": 0.7987812317499954, + "learning_rate": 4.6870358287518046e-06, + "loss": 0.3959, + "step": 3990 + }, + { + "epoch": 0.9868941641938674, + "grad_norm": 0.7920689363171113, + "learning_rate": 4.686878347641428e-06, + "loss": 0.4421, + "step": 3991 + }, + { + "epoch": 0.9871414441147379, + "grad_norm": 0.7521273189464834, + "learning_rate": 4.686720829566393e-06, + "loss": 0.419, + "step": 3992 + }, + { + "epoch": 0.9873887240356083, + "grad_norm": 0.7824768932828919, + "learning_rate": 4.686563274529359e-06, + "loss": 0.4081, + "step": 3993 + }, + { + "epoch": 0.9876360039564788, + "grad_norm": 0.8262243329490202, + "learning_rate": 4.686405682532992e-06, + "loss": 0.3794, + "step": 3994 + }, + { + "epoch": 0.9878832838773491, + "grad_norm": 0.7889887455036464, + "learning_rate": 4.686248053579953e-06, + "loss": 0.3703, + "step": 3995 + }, + { + "epoch": 0.9881305637982196, + "grad_norm": 0.7825185955719633, + "learning_rate": 4.686090387672909e-06, + "loss": 0.4105, + "step": 3996 + }, + { + "epoch": 0.98837784371909, + "grad_norm": 0.7783848883203941, + "learning_rate": 4.685932684814524e-06, + "loss": 0.4152, + "step": 3997 + }, + { + "epoch": 0.9886251236399605, + "grad_norm": 0.791414411907457, + "learning_rate": 4.6857749450074625e-06, + "loss": 0.409, + "step": 3998 + }, + { + "epoch": 0.9888724035608308, + "grad_norm": 0.8043289643137662, + "learning_rate": 4.685617168254393e-06, + "loss": 0.4222, + "step": 3999 + }, + { + "epoch": 0.9891196834817013, + "grad_norm": 0.7714257806897765, + "learning_rate": 4.68545935455798e-06, + "loss": 0.4047, + "step": 4000 + }, + { + "epoch": 0.9893669634025717, + "grad_norm": 0.7612816309042126, + "learning_rate": 4.6853015039208924e-06, + "loss": 0.3969, + "step": 4001 + }, + { + "epoch": 0.9896142433234422, + "grad_norm": 0.8149288705742704, + "learning_rate": 4.685143616345799e-06, + "loss": 0.4114, + "step": 4002 + }, + { + "epoch": 0.9898615232443125, + "grad_norm": 0.8201740720242746, + "learning_rate": 4.684985691835367e-06, + "loss": 0.3888, + "step": 4003 + }, + { + "epoch": 0.990108803165183, + "grad_norm": 0.801323731002106, + "learning_rate": 4.684827730392267e-06, + "loss": 0.3877, + "step": 4004 + }, + { + "epoch": 0.9903560830860534, + "grad_norm": 0.8193208923254229, + "learning_rate": 4.6846697320191685e-06, + "loss": 0.3644, + "step": 4005 + }, + { + "epoch": 0.9906033630069239, + "grad_norm": 0.795362255642567, + "learning_rate": 4.684511696718741e-06, + "loss": 0.3919, + "step": 4006 + }, + { + "epoch": 0.9908506429277942, + "grad_norm": 0.7751441931444772, + "learning_rate": 4.684353624493658e-06, + "loss": 0.4054, + "step": 4007 + }, + { + "epoch": 0.9910979228486647, + "grad_norm": 0.8464714446984203, + "learning_rate": 4.68419551534659e-06, + "loss": 0.409, + "step": 4008 + }, + { + "epoch": 0.9913452027695351, + "grad_norm": 0.7602446475018649, + "learning_rate": 4.68403736928021e-06, + "loss": 0.4328, + "step": 4009 + }, + { + "epoch": 0.9915924826904056, + "grad_norm": 0.7966158870763892, + "learning_rate": 4.683879186297191e-06, + "loss": 0.3946, + "step": 4010 + }, + { + "epoch": 0.9918397626112759, + "grad_norm": 0.7517907281222944, + "learning_rate": 4.683720966400206e-06, + "loss": 0.3914, + "step": 4011 + }, + { + "epoch": 0.9920870425321464, + "grad_norm": 0.7889222304421734, + "learning_rate": 4.683562709591931e-06, + "loss": 0.3978, + "step": 4012 + }, + { + "epoch": 0.9923343224530168, + "grad_norm": 0.7811980760364818, + "learning_rate": 4.683404415875039e-06, + "loss": 0.4237, + "step": 4013 + }, + { + "epoch": 0.9925816023738873, + "grad_norm": 0.8277250098361327, + "learning_rate": 4.683246085252207e-06, + "loss": 0.3695, + "step": 4014 + }, + { + "epoch": 0.9928288822947576, + "grad_norm": 0.7931595016445199, + "learning_rate": 4.683087717726112e-06, + "loss": 0.4025, + "step": 4015 + }, + { + "epoch": 0.9930761622156281, + "grad_norm": 0.7703771521508883, + "learning_rate": 4.682929313299428e-06, + "loss": 0.3843, + "step": 4016 + }, + { + "epoch": 0.9933234421364985, + "grad_norm": 0.7864352072212776, + "learning_rate": 4.682770871974835e-06, + "loss": 0.3914, + "step": 4017 + }, + { + "epoch": 0.993570722057369, + "grad_norm": 0.8004356756819567, + "learning_rate": 4.6826123937550115e-06, + "loss": 0.4228, + "step": 4018 + }, + { + "epoch": 0.9938180019782393, + "grad_norm": 0.767390308350493, + "learning_rate": 4.682453878642634e-06, + "loss": 0.4035, + "step": 4019 + }, + { + "epoch": 0.9940652818991098, + "grad_norm": 0.7933226964116108, + "learning_rate": 4.682295326640383e-06, + "loss": 0.4025, + "step": 4020 + }, + { + "epoch": 0.9943125618199802, + "grad_norm": 0.7925071712238786, + "learning_rate": 4.68213673775094e-06, + "loss": 0.3889, + "step": 4021 + }, + { + "epoch": 0.9945598417408507, + "grad_norm": 0.7849972882849776, + "learning_rate": 4.681978111976983e-06, + "loss": 0.4121, + "step": 4022 + }, + { + "epoch": 0.994807121661721, + "grad_norm": 0.7912155124220194, + "learning_rate": 4.681819449321194e-06, + "loss": 0.3771, + "step": 4023 + }, + { + "epoch": 0.9950544015825915, + "grad_norm": 0.7829532894668891, + "learning_rate": 4.681660749786257e-06, + "loss": 0.3937, + "step": 4024 + }, + { + "epoch": 0.9953016815034619, + "grad_norm": 0.8047332493651175, + "learning_rate": 4.6815020133748514e-06, + "loss": 0.392, + "step": 4025 + }, + { + "epoch": 0.9955489614243324, + "grad_norm": 0.796903690277019, + "learning_rate": 4.6813432400896615e-06, + "loss": 0.4058, + "step": 4026 + }, + { + "epoch": 0.9957962413452027, + "grad_norm": 0.7651108232160881, + "learning_rate": 4.681184429933372e-06, + "loss": 0.4037, + "step": 4027 + }, + { + "epoch": 0.9960435212660732, + "grad_norm": 0.8032283012798399, + "learning_rate": 4.681025582908666e-06, + "loss": 0.3986, + "step": 4028 + }, + { + "epoch": 0.9962908011869436, + "grad_norm": 0.7857611793933696, + "learning_rate": 4.68086669901823e-06, + "loss": 0.4026, + "step": 4029 + }, + { + "epoch": 0.996538081107814, + "grad_norm": 0.7659698488027594, + "learning_rate": 4.680707778264747e-06, + "loss": 0.4015, + "step": 4030 + }, + { + "epoch": 0.9967853610286844, + "grad_norm": 0.8012692642045438, + "learning_rate": 4.680548820650905e-06, + "loss": 0.3898, + "step": 4031 + }, + { + "epoch": 0.9970326409495549, + "grad_norm": 0.7579225666375018, + "learning_rate": 4.680389826179391e-06, + "loss": 0.4337, + "step": 4032 + }, + { + "epoch": 0.9972799208704253, + "grad_norm": 0.8076161741050162, + "learning_rate": 4.680230794852892e-06, + "loss": 0.406, + "step": 4033 + }, + { + "epoch": 0.9975272007912958, + "grad_norm": 0.7625082478334948, + "learning_rate": 4.680071726674097e-06, + "loss": 0.4151, + "step": 4034 + }, + { + "epoch": 0.9977744807121661, + "grad_norm": 0.7806980832371719, + "learning_rate": 4.679912621645693e-06, + "loss": 0.4038, + "step": 4035 + }, + { + "epoch": 0.9980217606330366, + "grad_norm": 0.8086404913136939, + "learning_rate": 4.6797534797703705e-06, + "loss": 0.3958, + "step": 4036 + }, + { + "epoch": 0.998269040553907, + "grad_norm": 0.759896651238656, + "learning_rate": 4.679594301050819e-06, + "loss": 0.4097, + "step": 4037 + }, + { + "epoch": 0.9985163204747775, + "grad_norm": 0.7791843543913921, + "learning_rate": 4.67943508548973e-06, + "loss": 0.37, + "step": 4038 + }, + { + "epoch": 0.9987636003956478, + "grad_norm": 0.7930431997735501, + "learning_rate": 4.679275833089793e-06, + "loss": 0.4134, + "step": 4039 + }, + { + "epoch": 0.9990108803165183, + "grad_norm": 0.7597904031408639, + "learning_rate": 4.679116543853702e-06, + "loss": 0.4118, + "step": 4040 + }, + { + "epoch": 0.9992581602373887, + "grad_norm": 0.7896276040674204, + "learning_rate": 4.678957217784147e-06, + "loss": 0.3747, + "step": 4041 + }, + { + "epoch": 0.9995054401582592, + "grad_norm": 0.7923093015887974, + "learning_rate": 4.678797854883823e-06, + "loss": 0.4067, + "step": 4042 + }, + { + "epoch": 0.9997527200791295, + "grad_norm": 0.769422575838774, + "learning_rate": 4.678638455155424e-06, + "loss": 0.4326, + "step": 4043 + }, + { + "epoch": 1.0, + "grad_norm": 0.7984512022584634, + "learning_rate": 4.6784790186016425e-06, + "loss": 0.3915, + "step": 4044 + } + ], + "logging_steps": 1, + "max_steps": 24264, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 4044, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 416699048263680.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}