diff --git "a/checkpoint-580/trainer_state.json" "b/checkpoint-580/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-580/trainer_state.json" @@ -0,0 +1,4093 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 580, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008620689655172414, + "grad_norm": 30.156124114990234, + "learning_rate": 5.0000000000000004e-08, + "loss": 3.946, + "step": 1 + }, + { + "epoch": 0.017241379310344827, + "grad_norm": 29.95058250427246, + "learning_rate": 1.0000000000000001e-07, + "loss": 4.0398, + "step": 2 + }, + { + "epoch": 0.02586206896551724, + "grad_norm": 29.866586685180664, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.9303, + "step": 3 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 30.012300491333008, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.9637, + "step": 4 + }, + { + "epoch": 0.04310344827586207, + "grad_norm": 30.934751510620117, + "learning_rate": 2.5000000000000004e-07, + "loss": 4.0617, + "step": 5 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 30.031415939331055, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.9659, + "step": 6 + }, + { + "epoch": 0.0603448275862069, + "grad_norm": 29.62813949584961, + "learning_rate": 3.5000000000000004e-07, + "loss": 3.9978, + "step": 7 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 31.0523681640625, + "learning_rate": 4.0000000000000003e-07, + "loss": 4.0237, + "step": 8 + }, + { + "epoch": 0.07758620689655173, + "grad_norm": 28.83756446838379, + "learning_rate": 4.5000000000000003e-07, + "loss": 3.8214, + "step": 9 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 28.1810359954834, + "learning_rate": 5.000000000000001e-07, + "loss": 3.848, + "step": 10 + }, + { + "epoch": 0.09482758620689655, + "grad_norm": 28.367570877075195, + "learning_rate": 5.5e-07, + "loss": 3.7026, + "step": 11 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 27.66380500793457, + "learning_rate": 6.000000000000001e-07, + "loss": 3.6833, + "step": 12 + }, + { + "epoch": 0.11206896551724138, + "grad_norm": 27.496694564819336, + "learning_rate": 6.5e-07, + "loss": 3.762, + "step": 13 + }, + { + "epoch": 0.1206896551724138, + "grad_norm": 28.319055557250977, + "learning_rate": 7.000000000000001e-07, + "loss": 3.8867, + "step": 14 + }, + { + "epoch": 0.12931034482758622, + "grad_norm": 26.112581253051758, + "learning_rate": 7.5e-07, + "loss": 3.6278, + "step": 15 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 25.8245792388916, + "learning_rate": 8.000000000000001e-07, + "loss": 3.7957, + "step": 16 + }, + { + "epoch": 0.14655172413793102, + "grad_norm": 23.66245460510254, + "learning_rate": 8.500000000000001e-07, + "loss": 3.4384, + "step": 17 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 21.69405746459961, + "learning_rate": 9.000000000000001e-07, + "loss": 3.2602, + "step": 18 + }, + { + "epoch": 0.16379310344827586, + "grad_norm": 20.691402435302734, + "learning_rate": 9.500000000000001e-07, + "loss": 3.2486, + "step": 19 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 20.67167854309082, + "learning_rate": 1.0000000000000002e-06, + "loss": 3.2401, + "step": 20 + }, + { + "epoch": 0.1810344827586207, + "grad_norm": 19.054428100585938, + "learning_rate": 1.0500000000000001e-06, + "loss": 3.0375, + "step": 21 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 18.890884399414062, + "learning_rate": 1.1e-06, + "loss": 2.9677, + "step": 22 + }, + { + "epoch": 0.19827586206896552, + "grad_norm": 18.61600112915039, + "learning_rate": 1.1500000000000002e-06, + "loss": 2.8387, + "step": 23 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 16.910585403442383, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.4914, + "step": 24 + }, + { + "epoch": 0.21551724137931033, + "grad_norm": 17.708385467529297, + "learning_rate": 1.25e-06, + "loss": 2.5361, + "step": 25 + }, + { + "epoch": 0.22413793103448276, + "grad_norm": 17.07745933532715, + "learning_rate": 1.3e-06, + "loss": 2.4098, + "step": 26 + }, + { + "epoch": 0.23275862068965517, + "grad_norm": 16.244144439697266, + "learning_rate": 1.3500000000000002e-06, + "loss": 2.2155, + "step": 27 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 16.11887550354004, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.0036, + "step": 28 + }, + { + "epoch": 0.25, + "grad_norm": 16.034631729125977, + "learning_rate": 1.45e-06, + "loss": 1.8617, + "step": 29 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 15.437153816223145, + "learning_rate": 1.5e-06, + "loss": 1.7413, + "step": 30 + }, + { + "epoch": 0.2672413793103448, + "grad_norm": 14.188116073608398, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.5002, + "step": 31 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 14.200998306274414, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.4469, + "step": 32 + }, + { + "epoch": 0.28448275862068967, + "grad_norm": 13.209551811218262, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.3016, + "step": 33 + }, + { + "epoch": 0.29310344827586204, + "grad_norm": 12.631085395812988, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.1522, + "step": 34 + }, + { + "epoch": 0.3017241379310345, + "grad_norm": 12.504134178161621, + "learning_rate": 1.75e-06, + "loss": 1.0058, + "step": 35 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 12.599784851074219, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.921, + "step": 36 + }, + { + "epoch": 0.31896551724137934, + "grad_norm": 11.577665328979492, + "learning_rate": 1.85e-06, + "loss": 0.8099, + "step": 37 + }, + { + "epoch": 0.3275862068965517, + "grad_norm": 10.465872764587402, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.6565, + "step": 38 + }, + { + "epoch": 0.33620689655172414, + "grad_norm": 10.161813735961914, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.5712, + "step": 39 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 8.384145736694336, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.4265, + "step": 40 + }, + { + "epoch": 0.35344827586206895, + "grad_norm": 7.072062015533447, + "learning_rate": 2.05e-06, + "loss": 0.409, + "step": 41 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 6.072140693664551, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.3263, + "step": 42 + }, + { + "epoch": 0.3706896551724138, + "grad_norm": 6.31119441986084, + "learning_rate": 2.15e-06, + "loss": 0.258, + "step": 43 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 6.619389057159424, + "learning_rate": 2.2e-06, + "loss": 0.2454, + "step": 44 + }, + { + "epoch": 0.3879310344827586, + "grad_norm": 6.546375751495361, + "learning_rate": 2.25e-06, + "loss": 0.2314, + "step": 45 + }, + { + "epoch": 0.39655172413793105, + "grad_norm": 5.219631671905518, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.2006, + "step": 46 + }, + { + "epoch": 0.4051724137931034, + "grad_norm": 3.1164474487304688, + "learning_rate": 2.35e-06, + "loss": 0.1677, + "step": 47 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 1.872147798538208, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.1339, + "step": 48 + }, + { + "epoch": 0.4224137931034483, + "grad_norm": 1.4775545597076416, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.1218, + "step": 49 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 1.2931559085845947, + "learning_rate": 2.5e-06, + "loss": 0.1151, + "step": 50 + }, + { + "epoch": 0.4396551724137931, + "grad_norm": 1.3222297430038452, + "learning_rate": 2.55e-06, + "loss": 0.1164, + "step": 51 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 1.393062710762024, + "learning_rate": 2.6e-06, + "loss": 0.1053, + "step": 52 + }, + { + "epoch": 0.45689655172413796, + "grad_norm": 1.0838805437088013, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.105, + "step": 53 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 0.954925537109375, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0922, + "step": 54 + }, + { + "epoch": 0.47413793103448276, + "grad_norm": 0.7521713972091675, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0887, + "step": 55 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 0.7261010408401489, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0997, + "step": 56 + }, + { + "epoch": 0.49137931034482757, + "grad_norm": 0.6530802845954895, + "learning_rate": 2.85e-06, + "loss": 0.0882, + "step": 57 + }, + { + "epoch": 0.5, + "grad_norm": 0.6899245381355286, + "learning_rate": 2.9e-06, + "loss": 0.0844, + "step": 58 + }, + { + "epoch": 0.5086206896551724, + "grad_norm": 0.6771528124809265, + "learning_rate": 2.95e-06, + "loss": 0.0884, + "step": 59 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.6307985782623291, + "learning_rate": 3e-06, + "loss": 0.0912, + "step": 60 + }, + { + "epoch": 0.5258620689655172, + "grad_norm": 0.5869951844215393, + "learning_rate": 3.05e-06, + "loss": 0.0875, + "step": 61 + }, + { + "epoch": 0.5344827586206896, + "grad_norm": 0.6039404273033142, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0768, + "step": 62 + }, + { + "epoch": 0.5431034482758621, + "grad_norm": 0.49209344387054443, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0815, + "step": 63 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.7383344769477844, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0874, + "step": 64 + }, + { + "epoch": 0.5603448275862069, + "grad_norm": 0.5552617311477661, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0762, + "step": 65 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 0.4992441236972809, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0793, + "step": 66 + }, + { + "epoch": 0.5775862068965517, + "grad_norm": 0.4979636073112488, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0792, + "step": 67 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 0.4675934612751007, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0766, + "step": 68 + }, + { + "epoch": 0.5948275862068966, + "grad_norm": 0.49495571851730347, + "learning_rate": 3.45e-06, + "loss": 0.0824, + "step": 69 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 0.6044315695762634, + "learning_rate": 3.5e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.6120689655172413, + "grad_norm": 0.4898519217967987, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0755, + "step": 71 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 0.4218939244747162, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0762, + "step": 72 + }, + { + "epoch": 0.6293103448275862, + "grad_norm": 0.5785802006721497, + "learning_rate": 3.65e-06, + "loss": 0.0837, + "step": 73 + }, + { + "epoch": 0.6379310344827587, + "grad_norm": 0.5505399703979492, + "learning_rate": 3.7e-06, + "loss": 0.0814, + "step": 74 + }, + { + "epoch": 0.646551724137931, + "grad_norm": 0.6062561869621277, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0725, + "step": 75 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 0.707350492477417, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0913, + "step": 76 + }, + { + "epoch": 0.6637931034482759, + "grad_norm": 0.5091889500617981, + "learning_rate": 3.85e-06, + "loss": 0.0697, + "step": 77 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 0.4801473319530487, + "learning_rate": 3.900000000000001e-06, + "loss": 0.073, + "step": 78 + }, + { + "epoch": 0.6810344827586207, + "grad_norm": 0.462162047624588, + "learning_rate": 3.95e-06, + "loss": 0.0805, + "step": 79 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.463969349861145, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0738, + "step": 80 + }, + { + "epoch": 0.6982758620689655, + "grad_norm": 0.3980114161968231, + "learning_rate": 4.05e-06, + "loss": 0.0742, + "step": 81 + }, + { + "epoch": 0.7068965517241379, + "grad_norm": 0.3627180755138397, + "learning_rate": 4.1e-06, + "loss": 0.0683, + "step": 82 + }, + { + "epoch": 0.7155172413793104, + "grad_norm": 0.39726322889328003, + "learning_rate": 4.15e-06, + "loss": 0.0717, + "step": 83 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 0.48898085951805115, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0676, + "step": 84 + }, + { + "epoch": 0.7327586206896551, + "grad_norm": 0.6143100261688232, + "learning_rate": 4.25e-06, + "loss": 0.0708, + "step": 85 + }, + { + "epoch": 0.7413793103448276, + "grad_norm": 0.5028995275497437, + "learning_rate": 4.3e-06, + "loss": 0.0713, + "step": 86 + }, + { + "epoch": 0.75, + "grad_norm": 0.5576066970825195, + "learning_rate": 4.350000000000001e-06, + "loss": 0.078, + "step": 87 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 0.37101301550865173, + "learning_rate": 4.4e-06, + "loss": 0.071, + "step": 88 + }, + { + "epoch": 0.7672413793103449, + "grad_norm": 0.442694753408432, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0791, + "step": 89 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 0.48991039395332336, + "learning_rate": 4.5e-06, + "loss": 0.0681, + "step": 90 + }, + { + "epoch": 0.7844827586206896, + "grad_norm": 0.46367791295051575, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0701, + "step": 91 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 0.3454825282096863, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0774, + "step": 92 + }, + { + "epoch": 0.8017241379310345, + "grad_norm": 0.35102447867393494, + "learning_rate": 4.65e-06, + "loss": 0.0659, + "step": 93 + }, + { + "epoch": 0.8103448275862069, + "grad_norm": 0.3791246712207794, + "learning_rate": 4.7e-06, + "loss": 0.0727, + "step": 94 + }, + { + "epoch": 0.8189655172413793, + "grad_norm": 0.3911365270614624, + "learning_rate": 4.75e-06, + "loss": 0.0641, + "step": 95 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.35395047068595886, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0735, + "step": 96 + }, + { + "epoch": 0.8362068965517241, + "grad_norm": 0.3499661087989807, + "learning_rate": 4.85e-06, + "loss": 0.0657, + "step": 97 + }, + { + "epoch": 0.8448275862068966, + "grad_norm": 0.34877678751945496, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0701, + "step": 98 + }, + { + "epoch": 0.853448275862069, + "grad_norm": 0.36436647176742554, + "learning_rate": 4.95e-06, + "loss": 0.0758, + "step": 99 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 0.4726298749446869, + "learning_rate": 5e-06, + "loss": 0.0731, + "step": 100 + }, + { + "epoch": 0.8706896551724138, + "grad_norm": 0.31892526149749756, + "learning_rate": 4.999965269084342e-06, + "loss": 0.0635, + "step": 101 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 0.3707881569862366, + "learning_rate": 4.999861077302358e-06, + "loss": 0.0689, + "step": 102 + }, + { + "epoch": 0.8879310344827587, + "grad_norm": 0.3244622051715851, + "learning_rate": 4.999687427548989e-06, + "loss": 0.0645, + "step": 103 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 0.386982262134552, + "learning_rate": 4.999444324649045e-06, + "loss": 0.0708, + "step": 104 + }, + { + "epoch": 0.9051724137931034, + "grad_norm": 0.3552643656730652, + "learning_rate": 4.999131775357078e-06, + "loss": 0.0766, + "step": 105 + }, + { + "epoch": 0.9137931034482759, + "grad_norm": 0.3503671884536743, + "learning_rate": 4.998749788357184e-06, + "loss": 0.0733, + "step": 106 + }, + { + "epoch": 0.9224137931034483, + "grad_norm": 0.4907379746437073, + "learning_rate": 4.998298374262771e-06, + "loss": 0.0721, + "step": 107 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 0.305571585893631, + "learning_rate": 4.997777545616258e-06, + "loss": 0.0603, + "step": 108 + }, + { + "epoch": 0.9396551724137931, + "grad_norm": 0.3188783824443817, + "learning_rate": 4.99718731688873e-06, + "loss": 0.0574, + "step": 109 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 0.3277616500854492, + "learning_rate": 4.996527704479535e-06, + "loss": 0.0671, + "step": 110 + }, + { + "epoch": 0.9568965517241379, + "grad_norm": 0.38640740513801575, + "learning_rate": 4.995798726715826e-06, + "loss": 0.0625, + "step": 111 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.5463483929634094, + "learning_rate": 4.995000403852057e-06, + "loss": 0.0718, + "step": 112 + }, + { + "epoch": 0.9741379310344828, + "grad_norm": 0.3974014222621918, + "learning_rate": 4.994132758069413e-06, + "loss": 0.071, + "step": 113 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 0.3642785847187042, + "learning_rate": 4.993195813475202e-06, + "loss": 0.0658, + "step": 114 + }, + { + "epoch": 0.9913793103448276, + "grad_norm": 0.3118143677711487, + "learning_rate": 4.992189596102179e-06, + "loss": 0.0633, + "step": 115 + }, + { + "epoch": 1.0, + "grad_norm": 0.41573596000671387, + "learning_rate": 4.991114133907822e-06, + "loss": 0.0662, + "step": 116 + }, + { + "epoch": 1.0086206896551724, + "grad_norm": 0.29761484265327454, + "learning_rate": 4.989969456773562e-06, + "loss": 0.0649, + "step": 117 + }, + { + "epoch": 1.0172413793103448, + "grad_norm": 0.399275004863739, + "learning_rate": 4.988755596503948e-06, + "loss": 0.0521, + "step": 118 + }, + { + "epoch": 1.0258620689655173, + "grad_norm": 0.37820911407470703, + "learning_rate": 4.987472586825762e-06, + "loss": 0.0596, + "step": 119 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 0.4371725916862488, + "learning_rate": 4.986120463387084e-06, + "loss": 0.07, + "step": 120 + }, + { + "epoch": 1.043103448275862, + "grad_norm": 0.3165535032749176, + "learning_rate": 4.984699263756303e-06, + "loss": 0.0613, + "step": 121 + }, + { + "epoch": 1.0517241379310345, + "grad_norm": 0.40526920557022095, + "learning_rate": 4.983209027421072e-06, + "loss": 0.0611, + "step": 122 + }, + { + "epoch": 1.0603448275862069, + "grad_norm": 0.39523962140083313, + "learning_rate": 4.9816497957872055e-06, + "loss": 0.0634, + "step": 123 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 0.33234691619873047, + "learning_rate": 4.9800216121775404e-06, + "loss": 0.0602, + "step": 124 + }, + { + "epoch": 1.0775862068965518, + "grad_norm": 0.3455008268356323, + "learning_rate": 4.978324521830721e-06, + "loss": 0.0624, + "step": 125 + }, + { + "epoch": 1.0862068965517242, + "grad_norm": 0.3968574106693268, + "learning_rate": 4.97655857189995e-06, + "loss": 0.0525, + "step": 126 + }, + { + "epoch": 1.0948275862068966, + "grad_norm": 0.47838935256004333, + "learning_rate": 4.974723811451673e-06, + "loss": 0.0617, + "step": 127 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.43055081367492676, + "learning_rate": 4.972820291464219e-06, + "loss": 0.0484, + "step": 128 + }, + { + "epoch": 1.1120689655172413, + "grad_norm": 0.4012243449687958, + "learning_rate": 4.97084806482638e-06, + "loss": 0.0562, + "step": 129 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 0.586875319480896, + "learning_rate": 4.968807186335948e-06, + "loss": 0.0633, + "step": 130 + }, + { + "epoch": 1.1293103448275863, + "grad_norm": 0.48629340529441833, + "learning_rate": 4.966697712698185e-06, + "loss": 0.0615, + "step": 131 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 0.41810542345046997, + "learning_rate": 4.964519702524251e-06, + "loss": 0.0564, + "step": 132 + }, + { + "epoch": 1.146551724137931, + "grad_norm": 0.4095732271671295, + "learning_rate": 4.962273216329577e-06, + "loss": 0.0634, + "step": 133 + }, + { + "epoch": 1.1551724137931034, + "grad_norm": 0.3784744441509247, + "learning_rate": 4.959958316532181e-06, + "loss": 0.0521, + "step": 134 + }, + { + "epoch": 1.1637931034482758, + "grad_norm": 0.4087596833705902, + "learning_rate": 4.957575067450935e-06, + "loss": 0.0569, + "step": 135 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 0.4369741380214691, + "learning_rate": 4.955123535303775e-06, + "loss": 0.057, + "step": 136 + }, + { + "epoch": 1.1810344827586208, + "grad_norm": 0.3851775527000427, + "learning_rate": 4.95260378820587e-06, + "loss": 0.0521, + "step": 137 + }, + { + "epoch": 1.1896551724137931, + "grad_norm": 0.3391599655151367, + "learning_rate": 4.950015896167716e-06, + "loss": 0.0525, + "step": 138 + }, + { + "epoch": 1.1982758620689655, + "grad_norm": 0.4176456332206726, + "learning_rate": 4.947359931093202e-06, + "loss": 0.0548, + "step": 139 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 0.45864036679267883, + "learning_rate": 4.944635966777607e-06, + "loss": 0.0578, + "step": 140 + }, + { + "epoch": 1.2155172413793103, + "grad_norm": 0.4484093487262726, + "learning_rate": 4.941844078905551e-06, + "loss": 0.0528, + "step": 141 + }, + { + "epoch": 1.2241379310344827, + "grad_norm": 0.42641302943229675, + "learning_rate": 4.938984345048892e-06, + "loss": 0.0519, + "step": 142 + }, + { + "epoch": 1.2327586206896552, + "grad_norm": 0.5111292004585266, + "learning_rate": 4.936056844664571e-06, + "loss": 0.0639, + "step": 143 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.3739636540412903, + "learning_rate": 4.933061659092401e-06, + "loss": 0.055, + "step": 144 + }, + { + "epoch": 1.25, + "grad_norm": 0.3896700143814087, + "learning_rate": 4.929998871552814e-06, + "loss": 0.0548, + "step": 145 + }, + { + "epoch": 1.2586206896551724, + "grad_norm": 0.4158782660961151, + "learning_rate": 4.926868567144543e-06, + "loss": 0.0572, + "step": 146 + }, + { + "epoch": 1.2672413793103448, + "grad_norm": 0.4275132715702057, + "learning_rate": 4.923670832842256e-06, + "loss": 0.0497, + "step": 147 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 0.3890449106693268, + "learning_rate": 4.920405757494147e-06, + "loss": 0.0584, + "step": 148 + }, + { + "epoch": 1.2844827586206897, + "grad_norm": 0.4569760262966156, + "learning_rate": 4.917073431819462e-06, + "loss": 0.0618, + "step": 149 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 0.4232596457004547, + "learning_rate": 4.913673948405977e-06, + "loss": 0.0516, + "step": 150 + }, + { + "epoch": 1.3017241379310345, + "grad_norm": 0.4578849673271179, + "learning_rate": 4.910207401707431e-06, + "loss": 0.0538, + "step": 151 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 0.39952200651168823, + "learning_rate": 4.906673888040895e-06, + "loss": 0.0471, + "step": 152 + }, + { + "epoch": 1.3189655172413794, + "grad_norm": 0.38846731185913086, + "learning_rate": 4.903073505584102e-06, + "loss": 0.0437, + "step": 153 + }, + { + "epoch": 1.3275862068965516, + "grad_norm": 0.4084296226501465, + "learning_rate": 4.899406354372716e-06, + "loss": 0.0532, + "step": 154 + }, + { + "epoch": 1.3362068965517242, + "grad_norm": 0.4114966094493866, + "learning_rate": 4.895672536297551e-06, + "loss": 0.0443, + "step": 155 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 0.6465100049972534, + "learning_rate": 4.891872155101746e-06, + "loss": 0.0605, + "step": 156 + }, + { + "epoch": 1.353448275862069, + "grad_norm": 0.43234798312187195, + "learning_rate": 4.888005316377873e-06, + "loss": 0.0529, + "step": 157 + }, + { + "epoch": 1.3620689655172413, + "grad_norm": 0.408527135848999, + "learning_rate": 4.884072127565015e-06, + "loss": 0.0485, + "step": 158 + }, + { + "epoch": 1.3706896551724137, + "grad_norm": 0.45152318477630615, + "learning_rate": 4.880072697945768e-06, + "loss": 0.0466, + "step": 159 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.4669966995716095, + "learning_rate": 4.876007138643216e-06, + "loss": 0.0438, + "step": 160 + }, + { + "epoch": 1.3879310344827587, + "grad_norm": 0.47117388248443604, + "learning_rate": 4.871875562617837e-06, + "loss": 0.0464, + "step": 161 + }, + { + "epoch": 1.396551724137931, + "grad_norm": 0.6085790991783142, + "learning_rate": 4.867678084664365e-06, + "loss": 0.0574, + "step": 162 + }, + { + "epoch": 1.4051724137931034, + "grad_norm": 0.5736953616142273, + "learning_rate": 4.863414821408602e-06, + "loss": 0.0507, + "step": 163 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 0.5003281831741333, + "learning_rate": 4.8590858913041775e-06, + "loss": 0.051, + "step": 164 + }, + { + "epoch": 1.4224137931034484, + "grad_norm": 0.4361717104911804, + "learning_rate": 4.854691414629258e-06, + "loss": 0.0476, + "step": 165 + }, + { + "epoch": 1.4310344827586206, + "grad_norm": 0.47907084226608276, + "learning_rate": 4.8502315134832e-06, + "loss": 0.0495, + "step": 166 + }, + { + "epoch": 1.4396551724137931, + "grad_norm": 0.46805888414382935, + "learning_rate": 4.8457063117831656e-06, + "loss": 0.0474, + "step": 167 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 0.5058819651603699, + "learning_rate": 4.8411159352606735e-06, + "loss": 0.0541, + "step": 168 + }, + { + "epoch": 1.456896551724138, + "grad_norm": 0.4632813334465027, + "learning_rate": 4.836460511458107e-06, + "loss": 0.0409, + "step": 169 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 0.3654536008834839, + "learning_rate": 4.831740169725172e-06, + "loss": 0.0466, + "step": 170 + }, + { + "epoch": 1.4741379310344827, + "grad_norm": 0.4301295578479767, + "learning_rate": 4.8269550412153e-06, + "loss": 0.0472, + "step": 171 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 0.37643712759017944, + "learning_rate": 4.822105258882007e-06, + "loss": 0.0445, + "step": 172 + }, + { + "epoch": 1.4913793103448276, + "grad_norm": 0.3928871154785156, + "learning_rate": 4.817190957475199e-06, + "loss": 0.0427, + "step": 173 + }, + { + "epoch": 1.5, + "grad_norm": 0.39636102318763733, + "learning_rate": 4.812212273537426e-06, + "loss": 0.0431, + "step": 174 + }, + { + "epoch": 1.5086206896551724, + "grad_norm": 0.41640329360961914, + "learning_rate": 4.807169345400088e-06, + "loss": 0.0442, + "step": 175 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.4888840317726135, + "learning_rate": 4.802062313179595e-06, + "loss": 0.0529, + "step": 176 + }, + { + "epoch": 1.5258620689655173, + "grad_norm": 0.4154113829135895, + "learning_rate": 4.796891318773472e-06, + "loss": 0.0419, + "step": 177 + }, + { + "epoch": 1.5344827586206895, + "grad_norm": 0.41872596740722656, + "learning_rate": 4.791656505856416e-06, + "loss": 0.0411, + "step": 178 + }, + { + "epoch": 1.543103448275862, + "grad_norm": 0.4037671983242035, + "learning_rate": 4.786358019876301e-06, + "loss": 0.0458, + "step": 179 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.4356297552585602, + "learning_rate": 4.7809960080501464e-06, + "loss": 0.0495, + "step": 180 + }, + { + "epoch": 1.5603448275862069, + "grad_norm": 0.4112146496772766, + "learning_rate": 4.7755706193600135e-06, + "loss": 0.0367, + "step": 181 + }, + { + "epoch": 1.5689655172413794, + "grad_norm": 0.36699458956718445, + "learning_rate": 4.770082004548878e-06, + "loss": 0.0427, + "step": 182 + }, + { + "epoch": 1.5775862068965516, + "grad_norm": 0.49518612027168274, + "learning_rate": 4.764530316116433e-06, + "loss": 0.0334, + "step": 183 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 0.3718539774417877, + "learning_rate": 4.758915708314858e-06, + "loss": 0.0367, + "step": 184 + }, + { + "epoch": 1.5948275862068966, + "grad_norm": 0.46727338433265686, + "learning_rate": 4.753238337144528e-06, + "loss": 0.044, + "step": 185 + }, + { + "epoch": 1.603448275862069, + "grad_norm": 0.5345709323883057, + "learning_rate": 4.747498360349681e-06, + "loss": 0.0401, + "step": 186 + }, + { + "epoch": 1.6120689655172413, + "grad_norm": 0.561375617980957, + "learning_rate": 4.7416959374140405e-06, + "loss": 0.0457, + "step": 187 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 0.4418468773365021, + "learning_rate": 4.735831229556374e-06, + "loss": 0.0348, + "step": 188 + }, + { + "epoch": 1.6293103448275863, + "grad_norm": 0.5142704844474792, + "learning_rate": 4.72990439972602e-06, + "loss": 0.0391, + "step": 189 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 0.48653194308280945, + "learning_rate": 4.72391561259836e-06, + "loss": 0.0358, + "step": 190 + }, + { + "epoch": 1.646551724137931, + "grad_norm": 0.48498624563217163, + "learning_rate": 4.717865034570243e-06, + "loss": 0.038, + "step": 191 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.4586655795574188, + "learning_rate": 4.711752833755362e-06, + "loss": 0.0351, + "step": 192 + }, + { + "epoch": 1.6637931034482758, + "grad_norm": 0.44023749232292175, + "learning_rate": 4.70557917997958e-06, + "loss": 0.0277, + "step": 193 + }, + { + "epoch": 1.6724137931034484, + "grad_norm": 0.44018658995628357, + "learning_rate": 4.6993442447762185e-06, + "loss": 0.0382, + "step": 194 + }, + { + "epoch": 1.6810344827586206, + "grad_norm": 0.49303099513053894, + "learning_rate": 4.693048201381281e-06, + "loss": 0.0413, + "step": 195 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 0.572812557220459, + "learning_rate": 4.686691224728652e-06, + "loss": 0.0389, + "step": 196 + }, + { + "epoch": 1.6982758620689655, + "grad_norm": 0.4995497167110443, + "learning_rate": 4.680273491445227e-06, + "loss": 0.0314, + "step": 197 + }, + { + "epoch": 1.706896551724138, + "grad_norm": 0.465404212474823, + "learning_rate": 4.673795179846008e-06, + "loss": 0.0323, + "step": 198 + }, + { + "epoch": 1.7155172413793105, + "grad_norm": 0.45183637738227844, + "learning_rate": 4.667256469929149e-06, + "loss": 0.0308, + "step": 199 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 0.4891216456890106, + "learning_rate": 4.660657543370958e-06, + "loss": 0.034, + "step": 200 + }, + { + "epoch": 1.7327586206896552, + "grad_norm": 0.5728661417961121, + "learning_rate": 4.653998583520844e-06, + "loss": 0.0359, + "step": 201 + }, + { + "epoch": 1.7413793103448276, + "grad_norm": 0.5465419888496399, + "learning_rate": 4.6472797753962255e-06, + "loss": 0.0366, + "step": 202 + }, + { + "epoch": 1.75, + "grad_norm": 0.4671058654785156, + "learning_rate": 4.640501305677387e-06, + "loss": 0.0345, + "step": 203 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 0.4623485803604126, + "learning_rate": 4.6336633627023e-06, + "loss": 0.0261, + "step": 204 + }, + { + "epoch": 1.7672413793103448, + "grad_norm": 0.4471631944179535, + "learning_rate": 4.626766136461378e-06, + "loss": 0.0244, + "step": 205 + }, + { + "epoch": 1.7758620689655173, + "grad_norm": 0.46128782629966736, + "learning_rate": 4.61980981859221e-06, + "loss": 0.0301, + "step": 206 + }, + { + "epoch": 1.7844827586206895, + "grad_norm": 0.45246005058288574, + "learning_rate": 4.612794602374226e-06, + "loss": 0.0274, + "step": 207 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.5971981287002563, + "learning_rate": 4.605720682723331e-06, + "loss": 0.0387, + "step": 208 + }, + { + "epoch": 1.8017241379310345, + "grad_norm": 0.4932047128677368, + "learning_rate": 4.598588256186491e-06, + "loss": 0.0228, + "step": 209 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 0.6208699345588684, + "learning_rate": 4.591397520936271e-06, + "loss": 0.0307, + "step": 210 + }, + { + "epoch": 1.8189655172413794, + "grad_norm": 0.5628708004951477, + "learning_rate": 4.584148676765327e-06, + "loss": 0.0262, + "step": 211 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 0.6893038749694824, + "learning_rate": 4.576841925080853e-06, + "loss": 0.0312, + "step": 212 + }, + { + "epoch": 1.8362068965517242, + "grad_norm": 0.5523396134376526, + "learning_rate": 4.569477468898992e-06, + "loss": 0.0218, + "step": 213 + }, + { + "epoch": 1.8448275862068966, + "grad_norm": 0.6797971129417419, + "learning_rate": 4.562055512839189e-06, + "loss": 0.0232, + "step": 214 + }, + { + "epoch": 1.853448275862069, + "grad_norm": 0.7214551568031311, + "learning_rate": 4.554576263118506e-06, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 0.5208979249000549, + "learning_rate": 4.547039927545899e-06, + "loss": 0.0261, + "step": 216 + }, + { + "epoch": 1.8706896551724137, + "grad_norm": 0.5396261215209961, + "learning_rate": 4.539446715516434e-06, + "loss": 0.0269, + "step": 217 + }, + { + "epoch": 1.8793103448275863, + "grad_norm": 0.5789651274681091, + "learning_rate": 4.531796838005477e-06, + "loss": 0.035, + "step": 218 + }, + { + "epoch": 1.8879310344827587, + "grad_norm": 0.5388063192367554, + "learning_rate": 4.524090507562828e-06, + "loss": 0.0277, + "step": 219 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.4727953374385834, + "learning_rate": 4.516327938306818e-06, + "loss": 0.0225, + "step": 220 + }, + { + "epoch": 1.9051724137931034, + "grad_norm": 0.4665185809135437, + "learning_rate": 4.508509345918357e-06, + "loss": 0.0268, + "step": 221 + }, + { + "epoch": 1.9137931034482758, + "grad_norm": 0.5534846186637878, + "learning_rate": 4.500634947634943e-06, + "loss": 0.0277, + "step": 222 + }, + { + "epoch": 1.9224137931034484, + "grad_norm": 0.46098095178604126, + "learning_rate": 4.492704962244626e-06, + "loss": 0.0274, + "step": 223 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.49379560351371765, + "learning_rate": 4.4847196100799305e-06, + "loss": 0.0252, + "step": 224 + }, + { + "epoch": 1.9396551724137931, + "grad_norm": 0.4714183807373047, + "learning_rate": 4.476679113011729e-06, + "loss": 0.0253, + "step": 225 + }, + { + "epoch": 1.9482758620689655, + "grad_norm": 0.6230431199073792, + "learning_rate": 4.4685836944430815e-06, + "loss": 0.0242, + "step": 226 + }, + { + "epoch": 1.956896551724138, + "grad_norm": 0.42832595109939575, + "learning_rate": 4.46043357930303e-06, + "loss": 0.0165, + "step": 227 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 0.38421332836151123, + "learning_rate": 4.452228994040341e-06, + "loss": 0.0168, + "step": 228 + }, + { + "epoch": 1.9741379310344827, + "grad_norm": 0.5493133664131165, + "learning_rate": 4.443970166617223e-06, + "loss": 0.0193, + "step": 229 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 0.45186540484428406, + "learning_rate": 4.435657326502986e-06, + "loss": 0.0165, + "step": 230 + }, + { + "epoch": 1.9913793103448276, + "grad_norm": 0.8328711986541748, + "learning_rate": 4.4272907046676704e-06, + "loss": 0.025, + "step": 231 + }, + { + "epoch": 2.0, + "grad_norm": 0.5378574728965759, + "learning_rate": 4.418870533575626e-06, + "loss": 0.0166, + "step": 232 + }, + { + "epoch": 2.0086206896551726, + "grad_norm": 0.5182176828384399, + "learning_rate": 4.410397047179053e-06, + "loss": 0.011, + "step": 233 + }, + { + "epoch": 2.0172413793103448, + "grad_norm": 0.5283203125, + "learning_rate": 4.401870480911505e-06, + "loss": 0.0178, + "step": 234 + }, + { + "epoch": 2.0258620689655173, + "grad_norm": 0.3926998972892761, + "learning_rate": 4.393291071681345e-06, + "loss": 0.0121, + "step": 235 + }, + { + "epoch": 2.0344827586206895, + "grad_norm": 0.4371504783630371, + "learning_rate": 4.384659057865165e-06, + "loss": 0.0164, + "step": 236 + }, + { + "epoch": 2.043103448275862, + "grad_norm": 0.6982957720756531, + "learning_rate": 4.375974679301158e-06, + "loss": 0.0192, + "step": 237 + }, + { + "epoch": 2.0517241379310347, + "grad_norm": 0.3988925814628601, + "learning_rate": 4.367238177282462e-06, + "loss": 0.0139, + "step": 238 + }, + { + "epoch": 2.060344827586207, + "grad_norm": 0.5101247429847717, + "learning_rate": 4.3584497945504465e-06, + "loss": 0.0118, + "step": 239 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.609987735748291, + "learning_rate": 4.349609775287977e-06, + "loss": 0.0168, + "step": 240 + }, + { + "epoch": 2.0775862068965516, + "grad_norm": 0.37872084975242615, + "learning_rate": 4.340718365112623e-06, + "loss": 0.009, + "step": 241 + }, + { + "epoch": 2.086206896551724, + "grad_norm": 0.5639368891716003, + "learning_rate": 4.331775811069837e-06, + "loss": 0.0154, + "step": 242 + }, + { + "epoch": 2.0948275862068964, + "grad_norm": 0.6034789681434631, + "learning_rate": 4.322782361626094e-06, + "loss": 0.0174, + "step": 243 + }, + { + "epoch": 2.103448275862069, + "grad_norm": 0.46892449259757996, + "learning_rate": 4.313738266661979e-06, + "loss": 0.0128, + "step": 244 + }, + { + "epoch": 2.1120689655172415, + "grad_norm": 0.5814667344093323, + "learning_rate": 4.3046437774652525e-06, + "loss": 0.0129, + "step": 245 + }, + { + "epoch": 2.1206896551724137, + "grad_norm": 0.39165839552879333, + "learning_rate": 4.295499146723864e-06, + "loss": 0.0133, + "step": 246 + }, + { + "epoch": 2.1293103448275863, + "grad_norm": 0.45224517583847046, + "learning_rate": 4.286304628518932e-06, + "loss": 0.0093, + "step": 247 + }, + { + "epoch": 2.1379310344827585, + "grad_norm": 0.7525523900985718, + "learning_rate": 4.277060478317687e-06, + "loss": 0.0159, + "step": 248 + }, + { + "epoch": 2.146551724137931, + "grad_norm": 0.6550068259239197, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0126, + "step": 249 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 0.5949872136116028, + "learning_rate": 4.258424310683094e-06, + "loss": 0.01, + "step": 250 + }, + { + "epoch": 2.163793103448276, + "grad_norm": 0.44155633449554443, + "learning_rate": 4.24903281105068e-06, + "loss": 0.0113, + "step": 251 + }, + { + "epoch": 2.1724137931034484, + "grad_norm": 0.7045619487762451, + "learning_rate": 4.23959271500943e-06, + "loss": 0.0108, + "step": 252 + }, + { + "epoch": 2.1810344827586206, + "grad_norm": 0.4648723006248474, + "learning_rate": 4.230104284849889e-06, + "loss": 0.0104, + "step": 253 + }, + { + "epoch": 2.189655172413793, + "grad_norm": 0.4972316026687622, + "learning_rate": 4.220567784205551e-06, + "loss": 0.0082, + "step": 254 + }, + { + "epoch": 2.1982758620689653, + "grad_norm": 0.4233075678348541, + "learning_rate": 4.210983478045537e-06, + "loss": 0.0088, + "step": 255 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5844057202339172, + "learning_rate": 4.201351632667227e-06, + "loss": 0.0101, + "step": 256 + }, + { + "epoch": 2.2155172413793105, + "grad_norm": 0.48759040236473083, + "learning_rate": 4.191672515688872e-06, + "loss": 0.0097, + "step": 257 + }, + { + "epoch": 2.2241379310344827, + "grad_norm": 0.48416000604629517, + "learning_rate": 4.181946396042146e-06, + "loss": 0.0085, + "step": 258 + }, + { + "epoch": 2.2327586206896552, + "grad_norm": 0.5503181219100952, + "learning_rate": 4.172173543964682e-06, + "loss": 0.0122, + "step": 259 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 0.6163179278373718, + "learning_rate": 4.162354230992562e-06, + "loss": 0.0119, + "step": 260 + }, + { + "epoch": 2.25, + "grad_norm": 0.47484147548675537, + "learning_rate": 4.1524887299527695e-06, + "loss": 0.0142, + "step": 261 + }, + { + "epoch": 2.2586206896551726, + "grad_norm": 0.5557327270507812, + "learning_rate": 4.142577314955614e-06, + "loss": 0.0086, + "step": 262 + }, + { + "epoch": 2.2672413793103448, + "grad_norm": 0.3577985465526581, + "learning_rate": 4.1326202613871065e-06, + "loss": 0.0089, + "step": 263 + }, + { + "epoch": 2.2758620689655173, + "grad_norm": 0.33690375089645386, + "learning_rate": 4.122617845901322e-06, + "loss": 0.0095, + "step": 264 + }, + { + "epoch": 2.2844827586206895, + "grad_norm": 0.5704361796379089, + "learning_rate": 4.112570346412696e-06, + "loss": 0.0078, + "step": 265 + }, + { + "epoch": 2.293103448275862, + "grad_norm": 0.4501226544380188, + "learning_rate": 4.102478042088315e-06, + "loss": 0.0107, + "step": 266 + }, + { + "epoch": 2.3017241379310347, + "grad_norm": 0.3546983301639557, + "learning_rate": 4.092341213340155e-06, + "loss": 0.0087, + "step": 267 + }, + { + "epoch": 2.310344827586207, + "grad_norm": 0.6514727473258972, + "learning_rate": 4.0821601418172926e-06, + "loss": 0.0121, + "step": 268 + }, + { + "epoch": 2.3189655172413794, + "grad_norm": 0.5242908596992493, + "learning_rate": 4.0719351103980754e-06, + "loss": 0.0104, + "step": 269 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 0.40950050950050354, + "learning_rate": 4.0616664031822686e-06, + "loss": 0.0091, + "step": 270 + }, + { + "epoch": 2.336206896551724, + "grad_norm": 0.24205856025218964, + "learning_rate": 4.051354305483153e-06, + "loss": 0.0043, + "step": 271 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 0.35039305686950684, + "learning_rate": 4.040999103819606e-06, + "loss": 0.0069, + "step": 272 + }, + { + "epoch": 2.353448275862069, + "grad_norm": 0.30672788619995117, + "learning_rate": 4.030601085908137e-06, + "loss": 0.0064, + "step": 273 + }, + { + "epoch": 2.3620689655172415, + "grad_norm": 0.26436808705329895, + "learning_rate": 4.020160540654892e-06, + "loss": 0.0037, + "step": 274 + }, + { + "epoch": 2.3706896551724137, + "grad_norm": 0.4178946614265442, + "learning_rate": 4.009677758147627e-06, + "loss": 0.0041, + "step": 275 + }, + { + "epoch": 2.3793103448275863, + "grad_norm": 0.7681214809417725, + "learning_rate": 3.999153029647651e-06, + "loss": 0.015, + "step": 276 + }, + { + "epoch": 2.3879310344827585, + "grad_norm": 0.49876728653907776, + "learning_rate": 3.988586647581732e-06, + "loss": 0.0082, + "step": 277 + }, + { + "epoch": 2.396551724137931, + "grad_norm": 0.5959510207176208, + "learning_rate": 3.977978905533966e-06, + "loss": 0.0084, + "step": 278 + }, + { + "epoch": 2.405172413793103, + "grad_norm": 0.538182258605957, + "learning_rate": 3.9673300982376315e-06, + "loss": 0.0087, + "step": 279 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 0.6760286688804626, + "learning_rate": 3.956640521566989e-06, + "loss": 0.0123, + "step": 280 + }, + { + "epoch": 2.4224137931034484, + "grad_norm": 0.4143922030925751, + "learning_rate": 3.945910472529068e-06, + "loss": 0.0056, + "step": 281 + }, + { + "epoch": 2.4310344827586206, + "grad_norm": 0.4239058792591095, + "learning_rate": 3.935140249255412e-06, + "loss": 0.009, + "step": 282 + }, + { + "epoch": 2.439655172413793, + "grad_norm": 0.495762437582016, + "learning_rate": 3.924330150993793e-06, + "loss": 0.0091, + "step": 283 + }, + { + "epoch": 2.4482758620689653, + "grad_norm": 0.46725037693977356, + "learning_rate": 3.913480478099898e-06, + "loss": 0.0066, + "step": 284 + }, + { + "epoch": 2.456896551724138, + "grad_norm": 0.501814603805542, + "learning_rate": 3.902591532028989e-06, + "loss": 0.0087, + "step": 285 + }, + { + "epoch": 2.4655172413793105, + "grad_norm": 0.38845211267471313, + "learning_rate": 3.891663615327518e-06, + "loss": 0.0059, + "step": 286 + }, + { + "epoch": 2.4741379310344827, + "grad_norm": 0.3252674639225006, + "learning_rate": 3.880697031624729e-06, + "loss": 0.0049, + "step": 287 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.3237666189670563, + "learning_rate": 3.869692085624218e-06, + "loss": 0.0062, + "step": 288 + }, + { + "epoch": 2.4913793103448274, + "grad_norm": 0.4213542342185974, + "learning_rate": 3.858649083095463e-06, + "loss": 0.0093, + "step": 289 + }, + { + "epoch": 2.5, + "grad_norm": 0.6313378810882568, + "learning_rate": 3.8475683308653385e-06, + "loss": 0.0082, + "step": 290 + }, + { + "epoch": 2.5086206896551726, + "grad_norm": 1.0402452945709229, + "learning_rate": 3.836450136809579e-06, + "loss": 0.0078, + "step": 291 + }, + { + "epoch": 2.5172413793103448, + "grad_norm": 0.6053091287612915, + "learning_rate": 3.825294809844234e-06, + "loss": 0.0044, + "step": 292 + }, + { + "epoch": 2.5258620689655173, + "grad_norm": 0.6727966666221619, + "learning_rate": 3.8141026599170792e-06, + "loss": 0.0135, + "step": 293 + }, + { + "epoch": 2.5344827586206895, + "grad_norm": 0.3446575105190277, + "learning_rate": 3.8028739979990072e-06, + "loss": 0.0063, + "step": 294 + }, + { + "epoch": 2.543103448275862, + "grad_norm": 0.5400390028953552, + "learning_rate": 3.791609136075384e-06, + "loss": 0.0094, + "step": 295 + }, + { + "epoch": 2.5517241379310347, + "grad_norm": 0.2682323157787323, + "learning_rate": 3.7803083871373876e-06, + "loss": 0.0028, + "step": 296 + }, + { + "epoch": 2.560344827586207, + "grad_norm": 0.49711498618125916, + "learning_rate": 3.7689720651733034e-06, + "loss": 0.0085, + "step": 297 + }, + { + "epoch": 2.5689655172413794, + "grad_norm": 0.3905172646045685, + "learning_rate": 3.7576004851598052e-06, + "loss": 0.0065, + "step": 298 + }, + { + "epoch": 2.5775862068965516, + "grad_norm": 0.33470281958580017, + "learning_rate": 3.7461939630532018e-06, + "loss": 0.0061, + "step": 299 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 0.43622592091560364, + "learning_rate": 3.734752815780659e-06, + "loss": 0.008, + "step": 300 + }, + { + "epoch": 2.594827586206897, + "grad_norm": 0.40031933784484863, + "learning_rate": 3.7232773612313945e-06, + "loss": 0.0107, + "step": 301 + }, + { + "epoch": 2.603448275862069, + "grad_norm": 0.30814412236213684, + "learning_rate": 3.7117679182478415e-06, + "loss": 0.0042, + "step": 302 + }, + { + "epoch": 2.612068965517241, + "grad_norm": 0.16828757524490356, + "learning_rate": 3.7002248066167977e-06, + "loss": 0.0022, + "step": 303 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.3726310729980469, + "learning_rate": 3.6886483470605293e-06, + "loss": 0.0097, + "step": 304 + }, + { + "epoch": 2.6293103448275863, + "grad_norm": 0.47232842445373535, + "learning_rate": 3.6770388612278713e-06, + "loss": 0.0083, + "step": 305 + }, + { + "epoch": 2.637931034482759, + "grad_norm": 0.2700427770614624, + "learning_rate": 3.66539667168528e-06, + "loss": 0.0017, + "step": 306 + }, + { + "epoch": 2.646551724137931, + "grad_norm": 0.3362017273902893, + "learning_rate": 3.65372210190788e-06, + "loss": 0.0054, + "step": 307 + }, + { + "epoch": 2.655172413793103, + "grad_norm": 0.3315584063529968, + "learning_rate": 3.6420154762704685e-06, + "loss": 0.0032, + "step": 308 + }, + { + "epoch": 2.663793103448276, + "grad_norm": 0.656471312046051, + "learning_rate": 3.630277120038508e-06, + "loss": 0.0093, + "step": 309 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 1.8886873722076416, + "learning_rate": 3.6185073593590868e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 2.6810344827586206, + "grad_norm": 0.3265067934989929, + "learning_rate": 3.606706521251857e-06, + "loss": 0.0037, + "step": 311 + }, + { + "epoch": 2.689655172413793, + "grad_norm": 0.39972051978111267, + "learning_rate": 3.5948749335999493e-06, + "loss": 0.0066, + "step": 312 + }, + { + "epoch": 2.6982758620689653, + "grad_norm": 0.48707249760627747, + "learning_rate": 3.583012925140862e-06, + "loss": 0.0066, + "step": 313 + }, + { + "epoch": 2.706896551724138, + "grad_norm": 0.6093474626541138, + "learning_rate": 3.571120825457327e-06, + "loss": 0.007, + "step": 314 + }, + { + "epoch": 2.7155172413793105, + "grad_norm": 0.3256229758262634, + "learning_rate": 3.5591989649681534e-06, + "loss": 0.0054, + "step": 315 + }, + { + "epoch": 2.7241379310344827, + "grad_norm": 0.47791576385498047, + "learning_rate": 3.5472476749190465e-06, + "loss": 0.0048, + "step": 316 + }, + { + "epoch": 2.7327586206896552, + "grad_norm": 0.7635737061500549, + "learning_rate": 3.535267287373403e-06, + "loss": 0.006, + "step": 317 + }, + { + "epoch": 2.7413793103448274, + "grad_norm": 0.6153246164321899, + "learning_rate": 3.523258135203087e-06, + "loss": 0.0071, + "step": 318 + }, + { + "epoch": 2.75, + "grad_norm": 0.22357748448848724, + "learning_rate": 3.5112205520791785e-06, + "loss": 0.0036, + "step": 319 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.3073960840702057, + "learning_rate": 3.4991548724627054e-06, + "loss": 0.0028, + "step": 320 + }, + { + "epoch": 2.7672413793103448, + "grad_norm": 0.17926068603992462, + "learning_rate": 3.48706143159535e-06, + "loss": 0.0028, + "step": 321 + }, + { + "epoch": 2.7758620689655173, + "grad_norm": 0.43959733843803406, + "learning_rate": 3.4749405654901297e-06, + "loss": 0.0027, + "step": 322 + }, + { + "epoch": 2.7844827586206895, + "grad_norm": 0.396281898021698, + "learning_rate": 3.4627926109220684e-06, + "loss": 0.0048, + "step": 323 + }, + { + "epoch": 2.793103448275862, + "grad_norm": 0.33752575516700745, + "learning_rate": 3.450617905418834e-06, + "loss": 0.0055, + "step": 324 + }, + { + "epoch": 2.8017241379310347, + "grad_norm": 0.3482801914215088, + "learning_rate": 3.4384167872513642e-06, + "loss": 0.0054, + "step": 325 + }, + { + "epoch": 2.810344827586207, + "grad_norm": 0.38212546706199646, + "learning_rate": 3.4261895954244613e-06, + "loss": 0.0051, + "step": 326 + }, + { + "epoch": 2.8189655172413794, + "grad_norm": 0.3641749918460846, + "learning_rate": 3.4139366696673815e-06, + "loss": 0.0056, + "step": 327 + }, + { + "epoch": 2.8275862068965516, + "grad_norm": 0.1609128713607788, + "learning_rate": 3.4016583504243892e-06, + "loss": 0.0028, + "step": 328 + }, + { + "epoch": 2.836206896551724, + "grad_norm": 0.2003130167722702, + "learning_rate": 3.3893549788453e-06, + "loss": 0.0023, + "step": 329 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 0.4341651201248169, + "learning_rate": 3.3770268967760026e-06, + "loss": 0.006, + "step": 330 + }, + { + "epoch": 2.853448275862069, + "grad_norm": 0.28961309790611267, + "learning_rate": 3.36467444674896e-06, + "loss": 0.0022, + "step": 331 + }, + { + "epoch": 2.862068965517241, + "grad_norm": 0.35188788175582886, + "learning_rate": 3.3522979719736923e-06, + "loss": 0.0079, + "step": 332 + }, + { + "epoch": 2.8706896551724137, + "grad_norm": 0.3318072259426117, + "learning_rate": 3.339897816327241e-06, + "loss": 0.0038, + "step": 333 + }, + { + "epoch": 2.8793103448275863, + "grad_norm": 0.6669831871986389, + "learning_rate": 3.327474324344614e-06, + "loss": 0.0048, + "step": 334 + }, + { + "epoch": 2.887931034482759, + "grad_norm": 0.4758647084236145, + "learning_rate": 3.3150278412092126e-06, + "loss": 0.0031, + "step": 335 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.45927631855010986, + "learning_rate": 3.3025587127432414e-06, + "loss": 0.0052, + "step": 336 + }, + { + "epoch": 2.905172413793103, + "grad_norm": 0.15748094022274017, + "learning_rate": 3.2900672853981e-06, + "loss": 0.002, + "step": 337 + }, + { + "epoch": 2.913793103448276, + "grad_norm": 0.2490128129720688, + "learning_rate": 3.2775539062447566e-06, + "loss": 0.0024, + "step": 338 + }, + { + "epoch": 2.9224137931034484, + "grad_norm": 0.16914480924606323, + "learning_rate": 3.265018922964102e-06, + "loss": 0.0035, + "step": 339 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 0.15188859403133392, + "learning_rate": 3.2524626838372976e-06, + "loss": 0.0013, + "step": 340 + }, + { + "epoch": 2.939655172413793, + "grad_norm": 0.8957772254943848, + "learning_rate": 3.239885537736085e-06, + "loss": 0.0029, + "step": 341 + }, + { + "epoch": 2.9482758620689653, + "grad_norm": 0.11430688202381134, + "learning_rate": 3.2272878341131074e-06, + "loss": 0.0009, + "step": 342 + }, + { + "epoch": 2.956896551724138, + "grad_norm": 0.2879055440425873, + "learning_rate": 3.2146699229921884e-06, + "loss": 0.0034, + "step": 343 + }, + { + "epoch": 2.9655172413793105, + "grad_norm": 0.4262220859527588, + "learning_rate": 3.202032154958615e-06, + "loss": 0.0043, + "step": 344 + }, + { + "epoch": 2.9741379310344827, + "grad_norm": 0.26130998134613037, + "learning_rate": 3.1893748811493926e-06, + "loss": 0.0013, + "step": 345 + }, + { + "epoch": 2.9827586206896552, + "grad_norm": 0.29305410385131836, + "learning_rate": 3.1766984532434853e-06, + "loss": 0.0035, + "step": 346 + }, + { + "epoch": 2.9913793103448274, + "grad_norm": 0.3302770256996155, + "learning_rate": 3.164003223452055e-06, + "loss": 0.0028, + "step": 347 + }, + { + "epoch": 3.0, + "grad_norm": 0.41331973671913147, + "learning_rate": 3.151289544508664e-06, + "loss": 0.0042, + "step": 348 + }, + { + "epoch": 3.0086206896551726, + "grad_norm": 0.1458461433649063, + "learning_rate": 3.1385577696594816e-06, + "loss": 0.0012, + "step": 349 + }, + { + "epoch": 3.0172413793103448, + "grad_norm": 0.13347488641738892, + "learning_rate": 3.1258082526534665e-06, + "loss": 0.0012, + "step": 350 + }, + { + "epoch": 3.0258620689655173, + "grad_norm": 0.19563832879066467, + "learning_rate": 3.1130413477325398e-06, + "loss": 0.0013, + "step": 351 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 0.31846922636032104, + "learning_rate": 3.100257409621738e-06, + "loss": 0.0044, + "step": 352 + }, + { + "epoch": 3.043103448275862, + "grad_norm": 0.34810179471969604, + "learning_rate": 3.0874567935193624e-06, + "loss": 0.0008, + "step": 353 + }, + { + "epoch": 3.0517241379310347, + "grad_norm": 0.4476202130317688, + "learning_rate": 3.0746398550871093e-06, + "loss": 0.0028, + "step": 354 + }, + { + "epoch": 3.060344827586207, + "grad_norm": 0.33576491475105286, + "learning_rate": 3.061806950440183e-06, + "loss": 0.003, + "step": 355 + }, + { + "epoch": 3.0689655172413794, + "grad_norm": 0.3481740951538086, + "learning_rate": 3.0489584361374073e-06, + "loss": 0.0017, + "step": 356 + }, + { + "epoch": 3.0775862068965516, + "grad_norm": 0.2126464992761612, + "learning_rate": 3.0360946691713163e-06, + "loss": 0.0014, + "step": 357 + }, + { + "epoch": 3.086206896551724, + "grad_norm": 0.22430169582366943, + "learning_rate": 3.0232160069582335e-06, + "loss": 0.0012, + "step": 358 + }, + { + "epoch": 3.0948275862068964, + "grad_norm": 0.3447754681110382, + "learning_rate": 3.010322807328343e-06, + "loss": 0.0013, + "step": 359 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 0.37295961380004883, + "learning_rate": 2.99741542851575e-06, + "loss": 0.0031, + "step": 360 + }, + { + "epoch": 3.1120689655172415, + "grad_norm": 0.3267655372619629, + "learning_rate": 2.98449422914852e-06, + "loss": 0.002, + "step": 361 + }, + { + "epoch": 3.1206896551724137, + "grad_norm": 0.2811220586299896, + "learning_rate": 2.9715595682387243e-06, + "loss": 0.0033, + "step": 362 + }, + { + "epoch": 3.1293103448275863, + "grad_norm": 0.7849671244621277, + "learning_rate": 2.9586118051724532e-06, + "loss": 0.0026, + "step": 363 + }, + { + "epoch": 3.1379310344827585, + "grad_norm": 0.26505476236343384, + "learning_rate": 2.945651299699843e-06, + "loss": 0.0039, + "step": 364 + }, + { + "epoch": 3.146551724137931, + "grad_norm": 0.3053433299064636, + "learning_rate": 2.9326784119250705e-06, + "loss": 0.0062, + "step": 365 + }, + { + "epoch": 3.1551724137931036, + "grad_norm": 0.3546666204929352, + "learning_rate": 2.9196935022963524e-06, + "loss": 0.0056, + "step": 366 + }, + { + "epoch": 3.163793103448276, + "grad_norm": 0.19328542053699493, + "learning_rate": 2.9066969315959305e-06, + "loss": 0.0013, + "step": 367 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 0.2915872037410736, + "learning_rate": 2.893689060930045e-06, + "loss": 0.0008, + "step": 368 + }, + { + "epoch": 3.1810344827586206, + "grad_norm": 0.25301098823547363, + "learning_rate": 2.8806702517189035e-06, + "loss": 0.0018, + "step": 369 + }, + { + "epoch": 3.189655172413793, + "grad_norm": 0.2005927562713623, + "learning_rate": 2.8676408656866356e-06, + "loss": 0.0018, + "step": 370 + }, + { + "epoch": 3.1982758620689653, + "grad_norm": 0.1574263870716095, + "learning_rate": 2.8546012648512504e-06, + "loss": 0.0007, + "step": 371 + }, + { + "epoch": 3.206896551724138, + "grad_norm": 0.18230217695236206, + "learning_rate": 2.8415518115145673e-06, + "loss": 0.0008, + "step": 372 + }, + { + "epoch": 3.2155172413793105, + "grad_norm": 0.17362555861473083, + "learning_rate": 2.828492868252157e-06, + "loss": 0.0012, + "step": 373 + }, + { + "epoch": 3.2241379310344827, + "grad_norm": 0.12085027992725372, + "learning_rate": 2.815424797903267e-06, + "loss": 0.0008, + "step": 374 + }, + { + "epoch": 3.2327586206896552, + "grad_norm": 0.08503284305334091, + "learning_rate": 2.8023479635607344e-06, + "loss": 0.0006, + "step": 375 + }, + { + "epoch": 3.2413793103448274, + "grad_norm": 0.32579341530799866, + "learning_rate": 2.7892627285609035e-06, + "loss": 0.0023, + "step": 376 + }, + { + "epoch": 3.25, + "grad_norm": 0.11419796198606491, + "learning_rate": 2.7761694564735303e-06, + "loss": 0.0008, + "step": 377 + }, + { + "epoch": 3.2586206896551726, + "grad_norm": 0.2702169418334961, + "learning_rate": 2.763068511091678e-06, + "loss": 0.0018, + "step": 378 + }, + { + "epoch": 3.2672413793103448, + "grad_norm": 0.3495383560657501, + "learning_rate": 2.749960256421608e-06, + "loss": 0.0026, + "step": 379 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 0.1872117817401886, + "learning_rate": 2.7368450566726714e-06, + "loss": 0.0009, + "step": 380 + }, + { + "epoch": 3.2844827586206895, + "grad_norm": 0.11182069033384323, + "learning_rate": 2.7237232762471846e-06, + "loss": 0.0008, + "step": 381 + }, + { + "epoch": 3.293103448275862, + "grad_norm": 0.03204982727766037, + "learning_rate": 2.7105952797303083e-06, + "loss": 0.0002, + "step": 382 + }, + { + "epoch": 3.3017241379310347, + "grad_norm": 0.10610484331846237, + "learning_rate": 2.6974614318799118e-06, + "loss": 0.0006, + "step": 383 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 0.039000194519758224, + "learning_rate": 2.684322097616448e-06, + "loss": 0.0003, + "step": 384 + }, + { + "epoch": 3.3189655172413794, + "grad_norm": 0.25091490149497986, + "learning_rate": 2.671177642012803e-06, + "loss": 0.0018, + "step": 385 + }, + { + "epoch": 3.3275862068965516, + "grad_norm": 0.03331621363759041, + "learning_rate": 2.658028430284159e-06, + "loss": 0.0002, + "step": 386 + }, + { + "epoch": 3.336206896551724, + "grad_norm": 0.1930539906024933, + "learning_rate": 2.644874827777849e-06, + "loss": 0.0014, + "step": 387 + }, + { + "epoch": 3.344827586206897, + "grad_norm": 0.16782906651496887, + "learning_rate": 2.6317171999631992e-06, + "loss": 0.0013, + "step": 388 + }, + { + "epoch": 3.353448275862069, + "grad_norm": 0.08973520249128342, + "learning_rate": 2.6185559124213794e-06, + "loss": 0.0004, + "step": 389 + }, + { + "epoch": 3.3620689655172415, + "grad_norm": 0.08465900272130966, + "learning_rate": 2.605391330835243e-06, + "loss": 0.0005, + "step": 390 + }, + { + "epoch": 3.3706896551724137, + "grad_norm": 0.1823631227016449, + "learning_rate": 2.5922238209791696e-06, + "loss": 0.0023, + "step": 391 + }, + { + "epoch": 3.3793103448275863, + "grad_norm": 0.07805266231298447, + "learning_rate": 2.5790537487088975e-06, + "loss": 0.0004, + "step": 392 + }, + { + "epoch": 3.3879310344827585, + "grad_norm": 0.1784096211194992, + "learning_rate": 2.565881479951362e-06, + "loss": 0.0012, + "step": 393 + }, + { + "epoch": 3.396551724137931, + "grad_norm": 0.0848960429430008, + "learning_rate": 2.552707380694528e-06, + "loss": 0.0005, + "step": 394 + }, + { + "epoch": 3.405172413793103, + "grad_norm": 0.2038225382566452, + "learning_rate": 2.539531816977219e-06, + "loss": 0.0012, + "step": 395 + }, + { + "epoch": 3.413793103448276, + "grad_norm": 0.2176526039838791, + "learning_rate": 2.5263551548789494e-06, + "loss": 0.0021, + "step": 396 + }, + { + "epoch": 3.4224137931034484, + "grad_norm": 1.042067050933838, + "learning_rate": 2.51317776050975e-06, + "loss": 0.003, + "step": 397 + }, + { + "epoch": 3.4310344827586206, + "grad_norm": 0.08978473395109177, + "learning_rate": 2.5e-06, + "loss": 0.0007, + "step": 398 + }, + { + "epoch": 3.439655172413793, + "grad_norm": 0.07822535932064056, + "learning_rate": 2.4868222394902503e-06, + "loss": 0.0006, + "step": 399 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.11348029971122742, + "learning_rate": 2.4736448451210514e-06, + "loss": 0.0008, + "step": 400 + }, + { + "epoch": 3.456896551724138, + "grad_norm": 0.1684485524892807, + "learning_rate": 2.460468183022782e-06, + "loss": 0.0007, + "step": 401 + }, + { + "epoch": 3.4655172413793105, + "grad_norm": 0.2135872095823288, + "learning_rate": 2.447292619305473e-06, + "loss": 0.0015, + "step": 402 + }, + { + "epoch": 3.4741379310344827, + "grad_norm": 0.0703781470656395, + "learning_rate": 2.4341185200486387e-06, + "loss": 0.0005, + "step": 403 + }, + { + "epoch": 3.4827586206896552, + "grad_norm": 0.1052190512418747, + "learning_rate": 2.420946251291103e-06, + "loss": 0.0004, + "step": 404 + }, + { + "epoch": 3.4913793103448274, + "grad_norm": 0.10699443519115448, + "learning_rate": 2.4077761790208317e-06, + "loss": 0.0006, + "step": 405 + }, + { + "epoch": 3.5, + "grad_norm": 0.3566197454929352, + "learning_rate": 2.3946086691647576e-06, + "loss": 0.0004, + "step": 406 + }, + { + "epoch": 3.5086206896551726, + "grad_norm": 0.26467472314834595, + "learning_rate": 2.3814440875786215e-06, + "loss": 0.0011, + "step": 407 + }, + { + "epoch": 3.5172413793103448, + "grad_norm": 0.04453529417514801, + "learning_rate": 2.368282800036801e-06, + "loss": 0.0003, + "step": 408 + }, + { + "epoch": 3.5258620689655173, + "grad_norm": 0.3083684742450714, + "learning_rate": 2.3551251722221526e-06, + "loss": 0.0005, + "step": 409 + }, + { + "epoch": 3.5344827586206895, + "grad_norm": 0.11275946348905563, + "learning_rate": 2.3419715697158414e-06, + "loss": 0.0006, + "step": 410 + }, + { + "epoch": 3.543103448275862, + "grad_norm": 0.20643529295921326, + "learning_rate": 2.3288223579871984e-06, + "loss": 0.0012, + "step": 411 + }, + { + "epoch": 3.5517241379310347, + "grad_norm": 0.20463480055332184, + "learning_rate": 2.315677902383553e-06, + "loss": 0.0008, + "step": 412 + }, + { + "epoch": 3.560344827586207, + "grad_norm": 0.05099180340766907, + "learning_rate": 2.3025385681200882e-06, + "loss": 0.0001, + "step": 413 + }, + { + "epoch": 3.5689655172413794, + "grad_norm": 0.0880790427327156, + "learning_rate": 2.289404720269693e-06, + "loss": 0.0005, + "step": 414 + }, + { + "epoch": 3.5775862068965516, + "grad_norm": 0.36317208409309387, + "learning_rate": 2.276276723752816e-06, + "loss": 0.0027, + "step": 415 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 0.09729066491127014, + "learning_rate": 2.2631549433273294e-06, + "loss": 0.0003, + "step": 416 + }, + { + "epoch": 3.594827586206897, + "grad_norm": 0.13390637934207916, + "learning_rate": 2.2500397435783925e-06, + "loss": 0.0005, + "step": 417 + }, + { + "epoch": 3.603448275862069, + "grad_norm": 0.15281806886196136, + "learning_rate": 2.2369314889083234e-06, + "loss": 0.0009, + "step": 418 + }, + { + "epoch": 3.612068965517241, + "grad_norm": 0.05436816066503525, + "learning_rate": 2.2238305435264705e-06, + "loss": 0.0003, + "step": 419 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 0.20743894577026367, + "learning_rate": 2.2107372714390973e-06, + "loss": 0.0007, + "step": 420 + }, + { + "epoch": 3.6293103448275863, + "grad_norm": 0.0923997238278389, + "learning_rate": 2.1976520364392664e-06, + "loss": 0.0004, + "step": 421 + }, + { + "epoch": 3.637931034482759, + "grad_norm": 0.1782890260219574, + "learning_rate": 2.1845752020967335e-06, + "loss": 0.0005, + "step": 422 + }, + { + "epoch": 3.646551724137931, + "grad_norm": 0.2709210515022278, + "learning_rate": 2.1715071317478433e-06, + "loss": 0.0016, + "step": 423 + }, + { + "epoch": 3.655172413793103, + "grad_norm": 0.03978186100721359, + "learning_rate": 2.158448188485433e-06, + "loss": 0.0002, + "step": 424 + }, + { + "epoch": 3.663793103448276, + "grad_norm": 0.0696793794631958, + "learning_rate": 2.1453987351487504e-06, + "loss": 0.0003, + "step": 425 + }, + { + "epoch": 3.6724137931034484, + "grad_norm": 0.2356308102607727, + "learning_rate": 2.1323591343133644e-06, + "loss": 0.0013, + "step": 426 + }, + { + "epoch": 3.6810344827586206, + "grad_norm": 0.03700773045420647, + "learning_rate": 2.119329748281098e-06, + "loss": 0.0003, + "step": 427 + }, + { + "epoch": 3.689655172413793, + "grad_norm": 0.054152730852365494, + "learning_rate": 2.106310939069956e-06, + "loss": 0.0002, + "step": 428 + }, + { + "epoch": 3.6982758620689653, + "grad_norm": 0.16630911827087402, + "learning_rate": 2.0933030684040703e-06, + "loss": 0.0014, + "step": 429 + }, + { + "epoch": 3.706896551724138, + "grad_norm": 0.07690928876399994, + "learning_rate": 2.080306497703648e-06, + "loss": 0.0004, + "step": 430 + }, + { + "epoch": 3.7155172413793105, + "grad_norm": 0.27894601225852966, + "learning_rate": 2.0673215880749307e-06, + "loss": 0.0011, + "step": 431 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 0.14092902839183807, + "learning_rate": 2.054348700300158e-06, + "loss": 0.0005, + "step": 432 + }, + { + "epoch": 3.7327586206896552, + "grad_norm": 0.044833749532699585, + "learning_rate": 2.0413881948275476e-06, + "loss": 0.0003, + "step": 433 + }, + { + "epoch": 3.7413793103448274, + "grad_norm": 0.21889886260032654, + "learning_rate": 2.0284404317612765e-06, + "loss": 0.002, + "step": 434 + }, + { + "epoch": 3.75, + "grad_norm": 0.07229331135749817, + "learning_rate": 2.01550577085148e-06, + "loss": 0.0003, + "step": 435 + }, + { + "epoch": 3.7586206896551726, + "grad_norm": 0.011932128109037876, + "learning_rate": 2.0025845714842514e-06, + "loss": 0.0001, + "step": 436 + }, + { + "epoch": 3.7672413793103448, + "grad_norm": 0.054304156452417374, + "learning_rate": 1.9896771926716574e-06, + "loss": 0.0004, + "step": 437 + }, + { + "epoch": 3.7758620689655173, + "grad_norm": 0.05861916020512581, + "learning_rate": 1.9767839930417673e-06, + "loss": 0.0003, + "step": 438 + }, + { + "epoch": 3.7844827586206895, + "grad_norm": 0.030074656009674072, + "learning_rate": 1.963905330828684e-06, + "loss": 0.0001, + "step": 439 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 0.09362868964672089, + "learning_rate": 1.951041563862593e-06, + "loss": 0.0004, + "step": 440 + }, + { + "epoch": 3.8017241379310347, + "grad_norm": 0.021378953009843826, + "learning_rate": 1.9381930495598176e-06, + "loss": 0.0001, + "step": 441 + }, + { + "epoch": 3.810344827586207, + "grad_norm": 0.08444344252347946, + "learning_rate": 1.9253601449128915e-06, + "loss": 0.0005, + "step": 442 + }, + { + "epoch": 3.8189655172413794, + "grad_norm": 0.026262691244482994, + "learning_rate": 1.9125432064806376e-06, + "loss": 0.0002, + "step": 443 + }, + { + "epoch": 3.8275862068965516, + "grad_norm": 0.11125782132148743, + "learning_rate": 1.899742590378263e-06, + "loss": 0.0004, + "step": 444 + }, + { + "epoch": 3.836206896551724, + "grad_norm": 0.12721066176891327, + "learning_rate": 1.8869586522674615e-06, + "loss": 0.0004, + "step": 445 + }, + { + "epoch": 3.844827586206897, + "grad_norm": 0.023635324090719223, + "learning_rate": 1.8741917473465341e-06, + "loss": 0.0002, + "step": 446 + }, + { + "epoch": 3.853448275862069, + "grad_norm": 0.5198646783828735, + "learning_rate": 1.861442230340519e-06, + "loss": 0.0016, + "step": 447 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 0.3609335720539093, + "learning_rate": 1.8487104554913362e-06, + "loss": 0.0013, + "step": 448 + }, + { + "epoch": 3.8706896551724137, + "grad_norm": 0.27512818574905396, + "learning_rate": 1.8359967765479465e-06, + "loss": 0.0011, + "step": 449 + }, + { + "epoch": 3.8793103448275863, + "grad_norm": 0.09016284346580505, + "learning_rate": 1.823301546756515e-06, + "loss": 0.0003, + "step": 450 + }, + { + "epoch": 3.887931034482759, + "grad_norm": 0.11690771579742432, + "learning_rate": 1.8106251188506082e-06, + "loss": 0.0006, + "step": 451 + }, + { + "epoch": 3.896551724137931, + "grad_norm": 0.09307470917701721, + "learning_rate": 1.7979678450413845e-06, + "loss": 0.0003, + "step": 452 + }, + { + "epoch": 3.905172413793103, + "grad_norm": 0.04047633707523346, + "learning_rate": 1.7853300770078124e-06, + "loss": 0.0002, + "step": 453 + }, + { + "epoch": 3.913793103448276, + "grad_norm": 0.10549729317426682, + "learning_rate": 1.7727121658868934e-06, + "loss": 0.0006, + "step": 454 + }, + { + "epoch": 3.9224137931034484, + "grad_norm": 0.1153617799282074, + "learning_rate": 1.7601144622639154e-06, + "loss": 0.0004, + "step": 455 + }, + { + "epoch": 3.9310344827586206, + "grad_norm": 0.3109404742717743, + "learning_rate": 1.7475373161627034e-06, + "loss": 0.0012, + "step": 456 + }, + { + "epoch": 3.939655172413793, + "grad_norm": 0.11449936032295227, + "learning_rate": 1.7349810770358977e-06, + "loss": 0.0004, + "step": 457 + }, + { + "epoch": 3.9482758620689653, + "grad_norm": 0.12818413972854614, + "learning_rate": 1.7224460937552449e-06, + "loss": 0.0009, + "step": 458 + }, + { + "epoch": 3.956896551724138, + "grad_norm": 0.05156997591257095, + "learning_rate": 1.709932714601901e-06, + "loss": 0.0003, + "step": 459 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 0.14590847492218018, + "learning_rate": 1.6974412872567598e-06, + "loss": 0.0008, + "step": 460 + }, + { + "epoch": 3.9741379310344827, + "grad_norm": 0.750783383846283, + "learning_rate": 1.6849721587907887e-06, + "loss": 0.0019, + "step": 461 + }, + { + "epoch": 3.9827586206896552, + "grad_norm": 0.1131512001156807, + "learning_rate": 1.672525675655387e-06, + "loss": 0.0007, + "step": 462 + }, + { + "epoch": 3.9913793103448274, + "grad_norm": 0.42991429567337036, + "learning_rate": 1.6601021836727597e-06, + "loss": 0.0008, + "step": 463 + }, + { + "epoch": 4.0, + "grad_norm": 0.11112646013498306, + "learning_rate": 1.647702028026308e-06, + "loss": 0.0005, + "step": 464 + }, + { + "epoch": 4.008620689655173, + "grad_norm": 0.08795543015003204, + "learning_rate": 1.63532555325104e-06, + "loss": 0.0005, + "step": 465 + }, + { + "epoch": 4.017241379310345, + "grad_norm": 0.0381544753909111, + "learning_rate": 1.6229731032239986e-06, + "loss": 0.0002, + "step": 466 + }, + { + "epoch": 4.025862068965517, + "grad_norm": 0.025873109698295593, + "learning_rate": 1.6106450211547015e-06, + "loss": 0.0001, + "step": 467 + }, + { + "epoch": 4.0344827586206895, + "grad_norm": 0.0740472748875618, + "learning_rate": 1.5983416495756116e-06, + "loss": 0.0003, + "step": 468 + }, + { + "epoch": 4.043103448275862, + "grad_norm": 0.02801835909485817, + "learning_rate": 1.586063330332619e-06, + "loss": 0.0002, + "step": 469 + }, + { + "epoch": 4.051724137931035, + "grad_norm": 0.03600400686264038, + "learning_rate": 1.573810404575539e-06, + "loss": 0.0002, + "step": 470 + }, + { + "epoch": 4.060344827586207, + "grad_norm": 0.07549732178449631, + "learning_rate": 1.5615832127486368e-06, + "loss": 0.0004, + "step": 471 + }, + { + "epoch": 4.068965517241379, + "grad_norm": 0.0136073287576437, + "learning_rate": 1.549382094581166e-06, + "loss": 0.0001, + "step": 472 + }, + { + "epoch": 4.077586206896552, + "grad_norm": 0.03338685631752014, + "learning_rate": 1.537207389077932e-06, + "loss": 0.0002, + "step": 473 + }, + { + "epoch": 4.086206896551724, + "grad_norm": 0.018992526456713676, + "learning_rate": 1.5250594345098709e-06, + "loss": 0.0001, + "step": 474 + }, + { + "epoch": 4.094827586206897, + "grad_norm": 0.08795375376939774, + "learning_rate": 1.5129385684046512e-06, + "loss": 0.0004, + "step": 475 + }, + { + "epoch": 4.103448275862069, + "grad_norm": 0.014034583233296871, + "learning_rate": 1.5008451275372948e-06, + "loss": 0.0001, + "step": 476 + }, + { + "epoch": 4.112068965517241, + "grad_norm": 0.10508130490779877, + "learning_rate": 1.4887794479208224e-06, + "loss": 0.0003, + "step": 477 + }, + { + "epoch": 4.120689655172414, + "grad_norm": 0.29925230145454407, + "learning_rate": 1.4767418647969134e-06, + "loss": 0.0009, + "step": 478 + }, + { + "epoch": 4.129310344827586, + "grad_norm": 0.034915853291749954, + "learning_rate": 1.464732712626597e-06, + "loss": 0.0002, + "step": 479 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 0.05039063096046448, + "learning_rate": 1.4527523250809544e-06, + "loss": 0.0002, + "step": 480 + }, + { + "epoch": 4.146551724137931, + "grad_norm": 0.01586967520415783, + "learning_rate": 1.4408010350318475e-06, + "loss": 0.0001, + "step": 481 + }, + { + "epoch": 4.155172413793103, + "grad_norm": 0.12373235076665878, + "learning_rate": 1.428879174542674e-06, + "loss": 0.0006, + "step": 482 + }, + { + "epoch": 4.163793103448276, + "grad_norm": 0.061160214245319366, + "learning_rate": 1.4169870748591388e-06, + "loss": 0.0002, + "step": 483 + }, + { + "epoch": 4.172413793103448, + "grad_norm": 0.0263685192912817, + "learning_rate": 1.4051250664000515e-06, + "loss": 0.0001, + "step": 484 + }, + { + "epoch": 4.181034482758621, + "grad_norm": 0.18039947748184204, + "learning_rate": 1.3932934787481437e-06, + "loss": 0.0006, + "step": 485 + }, + { + "epoch": 4.189655172413793, + "grad_norm": 0.03929466754198074, + "learning_rate": 1.381492640640914e-06, + "loss": 0.0003, + "step": 486 + }, + { + "epoch": 4.198275862068965, + "grad_norm": 0.04226909577846527, + "learning_rate": 1.3697228799614926e-06, + "loss": 0.0002, + "step": 487 + }, + { + "epoch": 4.206896551724138, + "grad_norm": 0.08892680704593658, + "learning_rate": 1.357984523729533e-06, + "loss": 0.0004, + "step": 488 + }, + { + "epoch": 4.2155172413793105, + "grad_norm": 0.034578073769807816, + "learning_rate": 1.3462778980921214e-06, + "loss": 0.0001, + "step": 489 + }, + { + "epoch": 4.224137931034483, + "grad_norm": 0.05692879110574722, + "learning_rate": 1.3346033283147196e-06, + "loss": 0.0002, + "step": 490 + }, + { + "epoch": 4.232758620689655, + "grad_norm": 0.05728021264076233, + "learning_rate": 1.3229611387721291e-06, + "loss": 0.0002, + "step": 491 + }, + { + "epoch": 4.241379310344827, + "grad_norm": 0.0304628424346447, + "learning_rate": 1.3113516529394705e-06, + "loss": 0.0002, + "step": 492 + }, + { + "epoch": 4.25, + "grad_norm": 0.03307940065860748, + "learning_rate": 1.2997751933832038e-06, + "loss": 0.0002, + "step": 493 + }, + { + "epoch": 4.258620689655173, + "grad_norm": 0.03960256651043892, + "learning_rate": 1.2882320817521587e-06, + "loss": 0.0001, + "step": 494 + }, + { + "epoch": 4.267241379310345, + "grad_norm": 0.09631030261516571, + "learning_rate": 1.2767226387686065e-06, + "loss": 0.0004, + "step": 495 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.048445411026477814, + "learning_rate": 1.2652471842193415e-06, + "loss": 0.0002, + "step": 496 + }, + { + "epoch": 4.2844827586206895, + "grad_norm": 0.019632607698440552, + "learning_rate": 1.2538060369467989e-06, + "loss": 0.0001, + "step": 497 + }, + { + "epoch": 4.293103448275862, + "grad_norm": 0.05726362392306328, + "learning_rate": 1.2423995148401954e-06, + "loss": 0.0002, + "step": 498 + }, + { + "epoch": 4.301724137931035, + "grad_norm": 0.008523290976881981, + "learning_rate": 1.231027934826697e-06, + "loss": 0.0001, + "step": 499 + }, + { + "epoch": 4.310344827586207, + "grad_norm": 0.1245892345905304, + "learning_rate": 1.2196916128626126e-06, + "loss": 0.0006, + "step": 500 + }, + { + "epoch": 4.318965517241379, + "grad_norm": 0.018376415595412254, + "learning_rate": 1.2083908639246169e-06, + "loss": 0.0001, + "step": 501 + }, + { + "epoch": 4.327586206896552, + "grad_norm": 0.027295252308249474, + "learning_rate": 1.1971260020009944e-06, + "loss": 0.0002, + "step": 502 + }, + { + "epoch": 4.336206896551724, + "grad_norm": 0.020805874839425087, + "learning_rate": 1.1858973400829208e-06, + "loss": 0.0002, + "step": 503 + }, + { + "epoch": 4.344827586206897, + "grad_norm": 0.015192719176411629, + "learning_rate": 1.174705190155766e-06, + "loss": 0.0001, + "step": 504 + }, + { + "epoch": 4.353448275862069, + "grad_norm": 0.03198520466685295, + "learning_rate": 1.163549863190421e-06, + "loss": 0.0002, + "step": 505 + }, + { + "epoch": 4.362068965517241, + "grad_norm": 0.038549166172742844, + "learning_rate": 1.152431669134663e-06, + "loss": 0.0002, + "step": 506 + }, + { + "epoch": 4.370689655172414, + "grad_norm": 0.01120319589972496, + "learning_rate": 1.1413509169045376e-06, + "loss": 0.0001, + "step": 507 + }, + { + "epoch": 4.379310344827586, + "grad_norm": 0.07101059705018997, + "learning_rate": 1.1303079143757831e-06, + "loss": 0.0002, + "step": 508 + }, + { + "epoch": 4.387931034482759, + "grad_norm": 0.03514993190765381, + "learning_rate": 1.1193029683752712e-06, + "loss": 0.0002, + "step": 509 + }, + { + "epoch": 4.396551724137931, + "grad_norm": 0.015733392909169197, + "learning_rate": 1.1083363846724824e-06, + "loss": 0.0001, + "step": 510 + }, + { + "epoch": 4.405172413793103, + "grad_norm": 0.07725036144256592, + "learning_rate": 1.097408467971012e-06, + "loss": 0.0002, + "step": 511 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 0.03870030492544174, + "learning_rate": 1.0865195219001028e-06, + "loss": 0.0001, + "step": 512 + }, + { + "epoch": 4.422413793103448, + "grad_norm": 0.041263483464717865, + "learning_rate": 1.0756698490062085e-06, + "loss": 0.0001, + "step": 513 + }, + { + "epoch": 4.431034482758621, + "grad_norm": 0.028052903711795807, + "learning_rate": 1.0648597507445884e-06, + "loss": 0.0001, + "step": 514 + }, + { + "epoch": 4.439655172413793, + "grad_norm": 0.020984163507819176, + "learning_rate": 1.0540895274709325e-06, + "loss": 0.0001, + "step": 515 + }, + { + "epoch": 4.448275862068965, + "grad_norm": 0.006086215376853943, + "learning_rate": 1.043359478433012e-06, + "loss": 0.0001, + "step": 516 + }, + { + "epoch": 4.456896551724138, + "grad_norm": 0.022190986201167107, + "learning_rate": 1.0326699017623689e-06, + "loss": 0.0001, + "step": 517 + }, + { + "epoch": 4.4655172413793105, + "grad_norm": 0.02403254061937332, + "learning_rate": 1.022021094466034e-06, + "loss": 0.0001, + "step": 518 + }, + { + "epoch": 4.474137931034483, + "grad_norm": 0.017441941425204277, + "learning_rate": 1.0114133524182696e-06, + "loss": 0.0001, + "step": 519 + }, + { + "epoch": 4.482758620689655, + "grad_norm": 0.029313763603568077, + "learning_rate": 1.0008469703523493e-06, + "loss": 0.0001, + "step": 520 + }, + { + "epoch": 4.491379310344827, + "grad_norm": 0.016915587708353996, + "learning_rate": 9.903222418523739e-07, + "loss": 0.0001, + "step": 521 + }, + { + "epoch": 4.5, + "grad_norm": 0.03886621445417404, + "learning_rate": 9.798394593451092e-07, + "loss": 0.0001, + "step": 522 + }, + { + "epoch": 4.508620689655173, + "grad_norm": 0.007353832945227623, + "learning_rate": 9.693989140918635e-07, + "loss": 0.0001, + "step": 523 + }, + { + "epoch": 4.517241379310345, + "grad_norm": 0.00810723565518856, + "learning_rate": 9.590008961803942e-07, + "loss": 0.0001, + "step": 524 + }, + { + "epoch": 4.525862068965517, + "grad_norm": 0.21131834387779236, + "learning_rate": 9.486456945168476e-07, + "loss": 0.0003, + "step": 525 + }, + { + "epoch": 4.5344827586206895, + "grad_norm": 0.034072235226631165, + "learning_rate": 9.383335968177324e-07, + "loss": 0.0001, + "step": 526 + }, + { + "epoch": 4.543103448275862, + "grad_norm": 0.035911768674850464, + "learning_rate": 9.280648896019245e-07, + "loss": 0.0001, + "step": 527 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.027105217799544334, + "learning_rate": 9.178398581827086e-07, + "loss": 0.0002, + "step": 528 + }, + { + "epoch": 4.560344827586206, + "grad_norm": 0.020529363304376602, + "learning_rate": 9.07658786659846e-07, + "loss": 0.0001, + "step": 529 + }, + { + "epoch": 4.568965517241379, + "grad_norm": 0.03402241691946983, + "learning_rate": 8.975219579116865e-07, + "loss": 0.0001, + "step": 530 + }, + { + "epoch": 4.577586206896552, + "grad_norm": 0.03538055345416069, + "learning_rate": 8.874296535873044e-07, + "loss": 0.0002, + "step": 531 + }, + { + "epoch": 4.586206896551724, + "grad_norm": 0.011194778606295586, + "learning_rate": 8.77382154098679e-07, + "loss": 0.0001, + "step": 532 + }, + { + "epoch": 4.594827586206897, + "grad_norm": 0.029824107885360718, + "learning_rate": 8.673797386128932e-07, + "loss": 0.0001, + "step": 533 + }, + { + "epoch": 4.603448275862069, + "grad_norm": 0.04908660054206848, + "learning_rate": 8.574226850443873e-07, + "loss": 0.0001, + "step": 534 + }, + { + "epoch": 4.612068965517241, + "grad_norm": 0.0360528863966465, + "learning_rate": 8.475112700472307e-07, + "loss": 0.0002, + "step": 535 + }, + { + "epoch": 4.620689655172414, + "grad_norm": 0.05588826164603233, + "learning_rate": 8.376457690074386e-07, + "loss": 0.0002, + "step": 536 + }, + { + "epoch": 4.629310344827586, + "grad_norm": 0.23618784546852112, + "learning_rate": 8.278264560353183e-07, + "loss": 0.0011, + "step": 537 + }, + { + "epoch": 4.637931034482759, + "grad_norm": 0.050062019377946854, + "learning_rate": 8.180536039578546e-07, + "loss": 0.0002, + "step": 538 + }, + { + "epoch": 4.646551724137931, + "grad_norm": 0.027577703818678856, + "learning_rate": 8.083274843111282e-07, + "loss": 0.0001, + "step": 539 + }, + { + "epoch": 4.655172413793103, + "grad_norm": 0.01449573878198862, + "learning_rate": 7.986483673327724e-07, + "loss": 0.0001, + "step": 540 + }, + { + "epoch": 4.663793103448276, + "grad_norm": 0.020338334143161774, + "learning_rate": 7.89016521954464e-07, + "loss": 0.0001, + "step": 541 + }, + { + "epoch": 4.672413793103448, + "grad_norm": 0.009396952576935291, + "learning_rate": 7.794322157944489e-07, + "loss": 0.0001, + "step": 542 + }, + { + "epoch": 4.681034482758621, + "grad_norm": 0.012877953238785267, + "learning_rate": 7.698957151501113e-07, + "loss": 0.0001, + "step": 543 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 0.019344555214047432, + "learning_rate": 7.604072849905708e-07, + "loss": 0.0001, + "step": 544 + }, + { + "epoch": 4.698275862068965, + "grad_norm": 0.2258051335811615, + "learning_rate": 7.509671889493215e-07, + "loss": 0.0005, + "step": 545 + }, + { + "epoch": 4.706896551724138, + "grad_norm": 0.022811248898506165, + "learning_rate": 7.415756893169063e-07, + "loss": 0.0001, + "step": 546 + }, + { + "epoch": 4.7155172413793105, + "grad_norm": 0.015631260350346565, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0001, + "step": 547 + }, + { + "epoch": 4.724137931034483, + "grad_norm": 0.014984537847340107, + "learning_rate": 7.229395216823132e-07, + "loss": 0.0001, + "step": 548 + }, + { + "epoch": 4.732758620689655, + "grad_norm": 0.010196071118116379, + "learning_rate": 7.136953714810682e-07, + "loss": 0.0001, + "step": 549 + }, + { + "epoch": 4.741379310344827, + "grad_norm": 0.02455035224556923, + "learning_rate": 7.045008532761366e-07, + "loss": 0.0001, + "step": 550 + }, + { + "epoch": 4.75, + "grad_norm": 0.010271081700921059, + "learning_rate": 6.95356222534748e-07, + "loss": 0.0001, + "step": 551 + }, + { + "epoch": 4.758620689655173, + "grad_norm": 0.029762504622340202, + "learning_rate": 6.862617333380214e-07, + "loss": 0.0002, + "step": 552 + }, + { + "epoch": 4.767241379310345, + "grad_norm": 0.0284776221960783, + "learning_rate": 6.772176383739065e-07, + "loss": 0.0001, + "step": 553 + }, + { + "epoch": 4.775862068965517, + "grad_norm": 0.046974748373031616, + "learning_rate": 6.682241889301636e-07, + "loss": 0.0002, + "step": 554 + }, + { + "epoch": 4.7844827586206895, + "grad_norm": 0.04300546646118164, + "learning_rate": 6.592816348873785e-07, + "loss": 0.0001, + "step": 555 + }, + { + "epoch": 4.793103448275862, + "grad_norm": 0.10295592993497849, + "learning_rate": 6.503902247120239e-07, + "loss": 0.0002, + "step": 556 + }, + { + "epoch": 4.801724137931035, + "grad_norm": 0.027378961443901062, + "learning_rate": 6.41550205449554e-07, + "loss": 0.0001, + "step": 557 + }, + { + "epoch": 4.810344827586206, + "grad_norm": 0.05989762395620346, + "learning_rate": 6.327618227175389e-07, + "loss": 0.0002, + "step": 558 + }, + { + "epoch": 4.818965517241379, + "grad_norm": 0.019639194011688232, + "learning_rate": 6.240253206988422e-07, + "loss": 0.0001, + "step": 559 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 0.010606248863041401, + "learning_rate": 6.153409421348358e-07, + "loss": 0.0001, + "step": 560 + }, + { + "epoch": 4.836206896551724, + "grad_norm": 0.010039673186838627, + "learning_rate": 6.067089283186555e-07, + "loss": 0.0001, + "step": 561 + }, + { + "epoch": 4.844827586206897, + "grad_norm": 0.01142662763595581, + "learning_rate": 5.981295190884962e-07, + "loss": 0.0001, + "step": 562 + }, + { + "epoch": 4.853448275862069, + "grad_norm": 0.02358933351933956, + "learning_rate": 5.89602952820949e-07, + "loss": 0.0001, + "step": 563 + }, + { + "epoch": 4.862068965517241, + "grad_norm": 0.06325159221887589, + "learning_rate": 5.811294664243752e-07, + "loss": 0.0002, + "step": 564 + }, + { + "epoch": 4.870689655172414, + "grad_norm": 0.022270936518907547, + "learning_rate": 5.727092953323299e-07, + "loss": 0.0001, + "step": 565 + }, + { + "epoch": 4.879310344827586, + "grad_norm": 0.04418841376900673, + "learning_rate": 5.64342673497014e-07, + "loss": 0.0002, + "step": 566 + }, + { + "epoch": 4.887931034482759, + "grad_norm": 0.048862069845199585, + "learning_rate": 5.560298333827782e-07, + "loss": 0.0002, + "step": 567 + }, + { + "epoch": 4.896551724137931, + "grad_norm": 0.022158561274409294, + "learning_rate": 5.4777100595966e-07, + "loss": 0.0002, + "step": 568 + }, + { + "epoch": 4.905172413793103, + "grad_norm": 0.00710646528750658, + "learning_rate": 5.395664206969712e-07, + "loss": 0.0001, + "step": 569 + }, + { + "epoch": 4.913793103448276, + "grad_norm": 0.03146041929721832, + "learning_rate": 5.314163055569188e-07, + "loss": 0.0002, + "step": 570 + }, + { + "epoch": 4.922413793103448, + "grad_norm": 0.048966363072395325, + "learning_rate": 5.23320886988272e-07, + "loss": 0.0002, + "step": 571 + }, + { + "epoch": 4.931034482758621, + "grad_norm": 0.0071070631965994835, + "learning_rate": 5.1528038992007e-07, + "loss": 0.0001, + "step": 572 + }, + { + "epoch": 4.939655172413794, + "grad_norm": 0.06218729913234711, + "learning_rate": 5.07295037755374e-07, + "loss": 0.0001, + "step": 573 + }, + { + "epoch": 4.948275862068965, + "grad_norm": 0.021700145676732063, + "learning_rate": 4.993650523650575e-07, + "loss": 0.0001, + "step": 574 + }, + { + "epoch": 4.956896551724138, + "grad_norm": 0.006461620330810547, + "learning_rate": 4.914906540816436e-07, + "loss": 0.0001, + "step": 575 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.00985821895301342, + "learning_rate": 4.836720616931831e-07, + "loss": 0.0001, + "step": 576 + }, + { + "epoch": 4.974137931034483, + "grad_norm": 0.011432650499045849, + "learning_rate": 4.7590949243717323e-07, + "loss": 0.0001, + "step": 577 + }, + { + "epoch": 4.982758620689655, + "grad_norm": 0.007266015280038118, + "learning_rate": 4.682031619945238e-07, + "loss": 0.0001, + "step": 578 + }, + { + "epoch": 4.991379310344827, + "grad_norm": 0.009189185686409473, + "learning_rate": 4.605532844835667e-07, + "loss": 0.0001, + "step": 579 + }, + { + "epoch": 5.0, + "grad_norm": 0.08564585447311401, + "learning_rate": 4.5296007245410225e-07, + "loss": 0.0003, + "step": 580 + } + ], + "logging_steps": 1, + "max_steps": 696, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 116, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2992829423248998e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}