{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5855562784645413, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006506180871828237, "grad_norm": 3.778571605682373, "learning_rate": 0.0001, "loss": 4.706, "step": 1 }, { "epoch": 0.0013012361743656475, "grad_norm": 0.7331739068031311, "learning_rate": 0.0001, "loss": 2.6402, "step": 2 }, { "epoch": 0.001951854261548471, "grad_norm": 0.5679969191551208, "learning_rate": 0.0001, "loss": 2.5315, "step": 3 }, { "epoch": 0.002602472348731295, "grad_norm": 0.6543067693710327, "learning_rate": 0.0001, "loss": 2.5226, "step": 4 }, { "epoch": 0.0032530904359141183, "grad_norm": 0.42487671971321106, "learning_rate": 0.0001, "loss": 2.1375, "step": 5 }, { "epoch": 0.003903708523096942, "grad_norm": 0.48795655369758606, "learning_rate": 0.0001, "loss": 2.253, "step": 6 }, { "epoch": 0.004554326610279766, "grad_norm": 0.6054234504699707, "learning_rate": 0.0001, "loss": 2.3411, "step": 7 }, { "epoch": 0.00520494469746259, "grad_norm": 0.3039970397949219, "learning_rate": 0.0001, "loss": 2.1293, "step": 8 }, { "epoch": 0.005855562784645413, "grad_norm": 0.6592361330986023, "learning_rate": 0.0001, "loss": 3.1615, "step": 9 }, { "epoch": 0.006506180871828237, "grad_norm": 0.4017999470233917, "learning_rate": 0.0001, "loss": 2.5068, "step": 10 }, { "epoch": 0.0071567989590110605, "grad_norm": 0.31507641077041626, "learning_rate": 0.0001, "loss": 2.1894, "step": 11 }, { "epoch": 0.007807417046193884, "grad_norm": 0.33226895332336426, "learning_rate": 0.0001, "loss": 2.2006, "step": 12 }, { "epoch": 0.008458035133376708, "grad_norm": 0.2632739841938019, "learning_rate": 0.0001, "loss": 2.0998, "step": 13 }, { "epoch": 0.009108653220559532, "grad_norm": 0.2794795036315918, "learning_rate": 0.0001, "loss": 2.113, "step": 14 }, { "epoch": 0.009759271307742356, "grad_norm": 0.29168492555618286, "learning_rate": 0.0001, "loss": 2.354, "step": 15 }, { "epoch": 0.01040988939492518, "grad_norm": 0.2537970244884491, "learning_rate": 0.0001, "loss": 2.2939, "step": 16 }, { "epoch": 0.011060507482108002, "grad_norm": 0.5140053033828735, "learning_rate": 0.0001, "loss": 2.6237, "step": 17 }, { "epoch": 0.011711125569290826, "grad_norm": 0.3093675971031189, "learning_rate": 0.0001, "loss": 2.3502, "step": 18 }, { "epoch": 0.01236174365647365, "grad_norm": 0.29241421818733215, "learning_rate": 0.0001, "loss": 2.5365, "step": 19 }, { "epoch": 0.013012361743656473, "grad_norm": 0.3164322078227997, "learning_rate": 0.0001, "loss": 2.396, "step": 20 }, { "epoch": 0.013662979830839297, "grad_norm": 0.24512743949890137, "learning_rate": 0.0001, "loss": 2.2759, "step": 21 }, { "epoch": 0.014313597918022121, "grad_norm": 0.24328342080116272, "learning_rate": 0.0001, "loss": 2.2103, "step": 22 }, { "epoch": 0.014964216005204945, "grad_norm": 0.2563220262527466, "learning_rate": 0.0001, "loss": 2.4836, "step": 23 }, { "epoch": 0.015614834092387769, "grad_norm": 0.33601588010787964, "learning_rate": 0.0001, "loss": 2.4446, "step": 24 }, { "epoch": 0.01626545217957059, "grad_norm": 0.28699007630348206, "learning_rate": 0.0001, "loss": 2.8504, "step": 25 }, { "epoch": 0.016916070266753416, "grad_norm": 0.3181653618812561, "learning_rate": 0.0001, "loss": 2.3042, "step": 26 }, { "epoch": 0.01756668835393624, "grad_norm": 0.2349390834569931, "learning_rate": 0.0001, "loss": 2.1024, "step": 27 }, { "epoch": 0.018217306441119064, "grad_norm": 0.2751820981502533, "learning_rate": 0.0001, "loss": 2.2646, "step": 28 }, { "epoch": 0.018867924528301886, "grad_norm": 0.25547271966934204, "learning_rate": 0.0001, "loss": 2.1928, "step": 29 }, { "epoch": 0.01951854261548471, "grad_norm": 0.283507764339447, "learning_rate": 0.0001, "loss": 2.3073, "step": 30 }, { "epoch": 0.020169160702667534, "grad_norm": 0.3354213237762451, "learning_rate": 0.0001, "loss": 2.6273, "step": 31 }, { "epoch": 0.02081977878985036, "grad_norm": 0.40484553575515747, "learning_rate": 0.0001, "loss": 2.4919, "step": 32 }, { "epoch": 0.02147039687703318, "grad_norm": 0.34319421648979187, "learning_rate": 0.0001, "loss": 2.8381, "step": 33 }, { "epoch": 0.022121014964216004, "grad_norm": 0.32958984375, "learning_rate": 0.0001, "loss": 2.3062, "step": 34 }, { "epoch": 0.02277163305139883, "grad_norm": 0.4503105878829956, "learning_rate": 0.0001, "loss": 2.4647, "step": 35 }, { "epoch": 0.02342225113858165, "grad_norm": 0.5084238052368164, "learning_rate": 0.0001, "loss": 3.0047, "step": 36 }, { "epoch": 0.024072869225764477, "grad_norm": 0.5192400813102722, "learning_rate": 0.0001, "loss": 2.2899, "step": 37 }, { "epoch": 0.0247234873129473, "grad_norm": 0.4197874665260315, "learning_rate": 0.0001, "loss": 2.4057, "step": 38 }, { "epoch": 0.025374105400130124, "grad_norm": 0.5170285105705261, "learning_rate": 0.0001, "loss": 3.2918, "step": 39 }, { "epoch": 0.026024723487312947, "grad_norm": 0.2491147667169571, "learning_rate": 0.0001, "loss": 2.1957, "step": 40 }, { "epoch": 0.026675341574495772, "grad_norm": 0.6597635746002197, "learning_rate": 0.0001, "loss": 2.7474, "step": 41 }, { "epoch": 0.027325959661678594, "grad_norm": 0.40205034613609314, "learning_rate": 0.0001, "loss": 2.4561, "step": 42 }, { "epoch": 0.02797657774886142, "grad_norm": 0.27388331294059753, "learning_rate": 0.0001, "loss": 2.0477, "step": 43 }, { "epoch": 0.028627195836044242, "grad_norm": 0.9163908958435059, "learning_rate": 0.0001, "loss": 3.334, "step": 44 }, { "epoch": 0.029277813923227064, "grad_norm": 0.2747696042060852, "learning_rate": 0.0001, "loss": 2.1604, "step": 45 }, { "epoch": 0.02992843201040989, "grad_norm": 0.36308085918426514, "learning_rate": 0.0001, "loss": 2.693, "step": 46 }, { "epoch": 0.03057905009759271, "grad_norm": 0.6159886121749878, "learning_rate": 0.0001, "loss": 2.5515, "step": 47 }, { "epoch": 0.031229668184775537, "grad_norm": 0.4801373779773712, "learning_rate": 0.0001, "loss": 2.809, "step": 48 }, { "epoch": 0.03188028627195836, "grad_norm": 0.32580915093421936, "learning_rate": 0.0001, "loss": 2.5236, "step": 49 }, { "epoch": 0.03253090435914118, "grad_norm": 0.3028671443462372, "learning_rate": 0.0001, "loss": 2.2685, "step": 50 }, { "epoch": 0.03318152244632401, "grad_norm": 0.5660931468009949, "learning_rate": 0.0001, "loss": 2.2564, "step": 51 }, { "epoch": 0.03383214053350683, "grad_norm": 0.24634602665901184, "learning_rate": 0.0001, "loss": 2.1355, "step": 52 }, { "epoch": 0.034482758620689655, "grad_norm": 0.24830913543701172, "learning_rate": 0.0001, "loss": 2.0425, "step": 53 }, { "epoch": 0.03513337670787248, "grad_norm": 0.23614570498466492, "learning_rate": 0.0001, "loss": 2.1975, "step": 54 }, { "epoch": 0.035783994795055306, "grad_norm": 0.2624325156211853, "learning_rate": 0.0001, "loss": 2.3071, "step": 55 }, { "epoch": 0.03643461288223813, "grad_norm": 0.3967755436897278, "learning_rate": 0.0001, "loss": 2.6088, "step": 56 }, { "epoch": 0.03708523096942095, "grad_norm": 0.22147373855113983, "learning_rate": 0.0001, "loss": 2.003, "step": 57 }, { "epoch": 0.03773584905660377, "grad_norm": 0.47795867919921875, "learning_rate": 0.0001, "loss": 2.1473, "step": 58 }, { "epoch": 0.038386467143786594, "grad_norm": 0.43953707814216614, "learning_rate": 0.0001, "loss": 2.6595, "step": 59 }, { "epoch": 0.03903708523096942, "grad_norm": 0.29031845927238464, "learning_rate": 0.0001, "loss": 2.3173, "step": 60 }, { "epoch": 0.039687703318152245, "grad_norm": 0.2491024285554886, "learning_rate": 0.0001, "loss": 2.0575, "step": 61 }, { "epoch": 0.04033832140533507, "grad_norm": 0.3025687634944916, "learning_rate": 0.0001, "loss": 2.0965, "step": 62 }, { "epoch": 0.04098893949251789, "grad_norm": 0.26097819209098816, "learning_rate": 0.0001, "loss": 2.2583, "step": 63 }, { "epoch": 0.04163955757970072, "grad_norm": 0.2413238286972046, "learning_rate": 0.0001, "loss": 2.2441, "step": 64 }, { "epoch": 0.04229017566688354, "grad_norm": 0.2332315295934677, "learning_rate": 0.0001, "loss": 2.185, "step": 65 }, { "epoch": 0.04294079375406636, "grad_norm": 0.4037252366542816, "learning_rate": 0.0001, "loss": 2.3875, "step": 66 }, { "epoch": 0.043591411841249185, "grad_norm": 0.34149354696273804, "learning_rate": 0.0001, "loss": 2.3835, "step": 67 }, { "epoch": 0.04424202992843201, "grad_norm": 0.23793481290340424, "learning_rate": 0.0001, "loss": 2.3521, "step": 68 }, { "epoch": 0.044892648015614836, "grad_norm": 0.24252744019031525, "learning_rate": 0.0001, "loss": 2.0984, "step": 69 }, { "epoch": 0.04554326610279766, "grad_norm": 0.2870447635650635, "learning_rate": 0.0001, "loss": 2.5408, "step": 70 }, { "epoch": 0.04619388418998048, "grad_norm": 0.5050077438354492, "learning_rate": 0.0001, "loss": 2.7091, "step": 71 }, { "epoch": 0.0468445022771633, "grad_norm": 0.2391565591096878, "learning_rate": 0.0001, "loss": 2.1601, "step": 72 }, { "epoch": 0.04749512036434613, "grad_norm": 0.20647507905960083, "learning_rate": 0.0001, "loss": 1.9582, "step": 73 }, { "epoch": 0.048145738451528954, "grad_norm": 0.26072338223457336, "learning_rate": 0.0001, "loss": 2.3577, "step": 74 }, { "epoch": 0.048796356538711776, "grad_norm": 0.28378504514694214, "learning_rate": 0.0001, "loss": 2.349, "step": 75 }, { "epoch": 0.0494469746258946, "grad_norm": 0.2536943256855011, "learning_rate": 0.0001, "loss": 2.375, "step": 76 }, { "epoch": 0.05009759271307743, "grad_norm": 0.29276445508003235, "learning_rate": 0.0001, "loss": 2.5003, "step": 77 }, { "epoch": 0.05074821080026025, "grad_norm": 0.2649310231208801, "learning_rate": 0.0001, "loss": 2.3247, "step": 78 }, { "epoch": 0.05139882888744307, "grad_norm": 0.38125383853912354, "learning_rate": 0.0001, "loss": 2.5405, "step": 79 }, { "epoch": 0.05204944697462589, "grad_norm": 0.40980008244514465, "learning_rate": 0.0001, "loss": 2.212, "step": 80 }, { "epoch": 0.052700065061808715, "grad_norm": 0.5363492965698242, "learning_rate": 0.0001, "loss": 2.6499, "step": 81 }, { "epoch": 0.053350683148991544, "grad_norm": 0.34647300839424133, "learning_rate": 0.0001, "loss": 2.6302, "step": 82 }, { "epoch": 0.054001301236174366, "grad_norm": 0.27607980370521545, "learning_rate": 0.0001, "loss": 2.1819, "step": 83 }, { "epoch": 0.05465191932335719, "grad_norm": 0.27654680609703064, "learning_rate": 0.0001, "loss": 2.1763, "step": 84 }, { "epoch": 0.05530253741054001, "grad_norm": 0.24596217274665833, "learning_rate": 0.0001, "loss": 2.2585, "step": 85 }, { "epoch": 0.05595315549772284, "grad_norm": 0.24279890954494476, "learning_rate": 0.0001, "loss": 2.4247, "step": 86 }, { "epoch": 0.05660377358490566, "grad_norm": 0.2918747365474701, "learning_rate": 0.0001, "loss": 2.3986, "step": 87 }, { "epoch": 0.057254391672088484, "grad_norm": 0.26778745651245117, "learning_rate": 0.0001, "loss": 2.3592, "step": 88 }, { "epoch": 0.057905009759271306, "grad_norm": 0.39637815952301025, "learning_rate": 0.0001, "loss": 2.8006, "step": 89 }, { "epoch": 0.05855562784645413, "grad_norm": 0.2676962614059448, "learning_rate": 0.0001, "loss": 2.2384, "step": 90 }, { "epoch": 0.05920624593363696, "grad_norm": 0.3044937252998352, "learning_rate": 0.0001, "loss": 2.7762, "step": 91 }, { "epoch": 0.05985686402081978, "grad_norm": 0.23922136425971985, "learning_rate": 0.0001, "loss": 2.0873, "step": 92 }, { "epoch": 0.0605074821080026, "grad_norm": 0.25385046005249023, "learning_rate": 0.0001, "loss": 2.2708, "step": 93 }, { "epoch": 0.06115810019518542, "grad_norm": 0.378401517868042, "learning_rate": 0.0001, "loss": 3.0583, "step": 94 }, { "epoch": 0.06180871828236825, "grad_norm": 0.37193092703819275, "learning_rate": 0.0001, "loss": 2.3632, "step": 95 }, { "epoch": 0.062459336369551074, "grad_norm": 0.3757643699645996, "learning_rate": 0.0001, "loss": 2.4071, "step": 96 }, { "epoch": 0.0631099544567339, "grad_norm": 0.272833913564682, "learning_rate": 0.0001, "loss": 2.3989, "step": 97 }, { "epoch": 0.06376057254391672, "grad_norm": 0.26533326506614685, "learning_rate": 0.0001, "loss": 2.1716, "step": 98 }, { "epoch": 0.06441119063109954, "grad_norm": 0.5787199139595032, "learning_rate": 0.0001, "loss": 2.9445, "step": 99 }, { "epoch": 0.06506180871828236, "grad_norm": 0.29046157002449036, "learning_rate": 0.0001, "loss": 2.3325, "step": 100 }, { "epoch": 0.06571242680546518, "grad_norm": 0.531452476978302, "learning_rate": 0.0001, "loss": 2.7445, "step": 101 }, { "epoch": 0.06636304489264802, "grad_norm": 0.3969165086746216, "learning_rate": 0.0001, "loss": 2.7126, "step": 102 }, { "epoch": 0.06701366297983084, "grad_norm": 0.24183356761932373, "learning_rate": 0.0001, "loss": 1.9971, "step": 103 }, { "epoch": 0.06766428106701367, "grad_norm": 0.3268399238586426, "learning_rate": 0.0001, "loss": 2.1055, "step": 104 }, { "epoch": 0.06831489915419649, "grad_norm": 0.2625877559185028, "learning_rate": 0.0001, "loss": 1.9946, "step": 105 }, { "epoch": 0.06896551724137931, "grad_norm": 0.2720443308353424, "learning_rate": 0.0001, "loss": 2.0764, "step": 106 }, { "epoch": 0.06961613532856213, "grad_norm": 0.20969334244728088, "learning_rate": 0.0001, "loss": 1.8687, "step": 107 }, { "epoch": 0.07026675341574495, "grad_norm": 0.26211223006248474, "learning_rate": 0.0001, "loss": 2.2042, "step": 108 }, { "epoch": 0.07091737150292778, "grad_norm": 0.27889683842658997, "learning_rate": 0.0001, "loss": 2.3146, "step": 109 }, { "epoch": 0.07156798959011061, "grad_norm": 0.2657179832458496, "learning_rate": 0.0001, "loss": 2.1021, "step": 110 }, { "epoch": 0.07221860767729343, "grad_norm": 0.26620885729789734, "learning_rate": 0.0001, "loss": 2.3488, "step": 111 }, { "epoch": 0.07286922576447626, "grad_norm": 0.4223373830318451, "learning_rate": 0.0001, "loss": 2.5289, "step": 112 }, { "epoch": 0.07351984385165908, "grad_norm": 0.35398781299591064, "learning_rate": 0.0001, "loss": 2.5702, "step": 113 }, { "epoch": 0.0741704619388419, "grad_norm": 0.23328129947185516, "learning_rate": 0.0001, "loss": 2.1292, "step": 114 }, { "epoch": 0.07482108002602472, "grad_norm": 0.33508536219596863, "learning_rate": 0.0001, "loss": 2.2049, "step": 115 }, { "epoch": 0.07547169811320754, "grad_norm": 0.2646953761577606, "learning_rate": 0.0001, "loss": 2.3445, "step": 116 }, { "epoch": 0.07612231620039037, "grad_norm": 0.27866706252098083, "learning_rate": 0.0001, "loss": 2.2472, "step": 117 }, { "epoch": 0.07677293428757319, "grad_norm": 0.35688602924346924, "learning_rate": 0.0001, "loss": 2.5045, "step": 118 }, { "epoch": 0.07742355237475602, "grad_norm": 0.24262933433055878, "learning_rate": 0.0001, "loss": 2.4565, "step": 119 }, { "epoch": 0.07807417046193885, "grad_norm": 0.44757333397865295, "learning_rate": 0.0001, "loss": 2.1619, "step": 120 }, { "epoch": 0.07872478854912167, "grad_norm": 0.3279111385345459, "learning_rate": 0.0001, "loss": 2.3996, "step": 121 }, { "epoch": 0.07937540663630449, "grad_norm": 0.25862693786621094, "learning_rate": 0.0001, "loss": 2.3214, "step": 122 }, { "epoch": 0.08002602472348731, "grad_norm": 0.30093592405319214, "learning_rate": 0.0001, "loss": 2.6446, "step": 123 }, { "epoch": 0.08067664281067013, "grad_norm": 0.25440871715545654, "learning_rate": 0.0001, "loss": 2.1181, "step": 124 }, { "epoch": 0.08132726089785296, "grad_norm": 0.19935627281665802, "learning_rate": 0.0001, "loss": 2.0904, "step": 125 }, { "epoch": 0.08197787898503578, "grad_norm": 0.27385473251342773, "learning_rate": 0.0001, "loss": 2.0829, "step": 126 }, { "epoch": 0.0826284970722186, "grad_norm": 0.24417711794376373, "learning_rate": 0.0001, "loss": 2.0019, "step": 127 }, { "epoch": 0.08327911515940144, "grad_norm": 0.27386653423309326, "learning_rate": 0.0001, "loss": 2.2743, "step": 128 }, { "epoch": 0.08392973324658426, "grad_norm": 0.22413575649261475, "learning_rate": 0.0001, "loss": 2.1584, "step": 129 }, { "epoch": 0.08458035133376708, "grad_norm": 0.27748343348503113, "learning_rate": 0.0001, "loss": 2.1428, "step": 130 }, { "epoch": 0.0852309694209499, "grad_norm": 0.18890976905822754, "learning_rate": 0.0001, "loss": 1.9474, "step": 131 }, { "epoch": 0.08588158750813273, "grad_norm": 0.3067719340324402, "learning_rate": 0.0001, "loss": 2.287, "step": 132 }, { "epoch": 0.08653220559531555, "grad_norm": 0.35126858949661255, "learning_rate": 0.0001, "loss": 2.5086, "step": 133 }, { "epoch": 0.08718282368249837, "grad_norm": 0.19619591534137726, "learning_rate": 0.0001, "loss": 2.0132, "step": 134 }, { "epoch": 0.08783344176968119, "grad_norm": 0.360569566488266, "learning_rate": 0.0001, "loss": 2.607, "step": 135 }, { "epoch": 0.08848405985686401, "grad_norm": 0.22566738724708557, "learning_rate": 0.0001, "loss": 2.0942, "step": 136 }, { "epoch": 0.08913467794404685, "grad_norm": 0.27346086502075195, "learning_rate": 0.0001, "loss": 2.3139, "step": 137 }, { "epoch": 0.08978529603122967, "grad_norm": 0.2500152289867401, "learning_rate": 0.0001, "loss": 2.0815, "step": 138 }, { "epoch": 0.0904359141184125, "grad_norm": 0.22101153433322906, "learning_rate": 0.0001, "loss": 2.374, "step": 139 }, { "epoch": 0.09108653220559532, "grad_norm": 0.2173723727464676, "learning_rate": 0.0001, "loss": 2.0084, "step": 140 }, { "epoch": 0.09173715029277814, "grad_norm": 0.28956499695777893, "learning_rate": 0.0001, "loss": 2.6283, "step": 141 }, { "epoch": 0.09238776837996096, "grad_norm": 0.27032795548439026, "learning_rate": 0.0001, "loss": 2.142, "step": 142 }, { "epoch": 0.09303838646714378, "grad_norm": 0.24320480227470398, "learning_rate": 0.0001, "loss": 2.1402, "step": 143 }, { "epoch": 0.0936890045543266, "grad_norm": 0.3127799332141876, "learning_rate": 0.0001, "loss": 2.6671, "step": 144 }, { "epoch": 0.09433962264150944, "grad_norm": 0.30706024169921875, "learning_rate": 0.0001, "loss": 2.3026, "step": 145 }, { "epoch": 0.09499024072869226, "grad_norm": 0.2378646731376648, "learning_rate": 0.0001, "loss": 2.0422, "step": 146 }, { "epoch": 0.09564085881587508, "grad_norm": 0.24755406379699707, "learning_rate": 0.0001, "loss": 2.2574, "step": 147 }, { "epoch": 0.09629147690305791, "grad_norm": 0.34464696049690247, "learning_rate": 0.0001, "loss": 2.2817, "step": 148 }, { "epoch": 0.09694209499024073, "grad_norm": 0.30485469102859497, "learning_rate": 0.0001, "loss": 2.7303, "step": 149 }, { "epoch": 0.09759271307742355, "grad_norm": 0.1860698163509369, "learning_rate": 0.0001, "loss": 1.8582, "step": 150 }, { "epoch": 0.09824333116460637, "grad_norm": 0.23853841423988342, "learning_rate": 0.0001, "loss": 2.1378, "step": 151 }, { "epoch": 0.0988939492517892, "grad_norm": 0.20248261094093323, "learning_rate": 0.0001, "loss": 2.1888, "step": 152 }, { "epoch": 0.09954456733897202, "grad_norm": 0.3582792282104492, "learning_rate": 0.0001, "loss": 2.6726, "step": 153 }, { "epoch": 0.10019518542615485, "grad_norm": 0.2576686441898346, "learning_rate": 0.0001, "loss": 2.4494, "step": 154 }, { "epoch": 0.10084580351333768, "grad_norm": 0.306029349565506, "learning_rate": 0.0001, "loss": 2.2273, "step": 155 }, { "epoch": 0.1014964216005205, "grad_norm": 0.31375500559806824, "learning_rate": 0.0001, "loss": 2.2474, "step": 156 }, { "epoch": 0.10214703968770332, "grad_norm": 0.253250390291214, "learning_rate": 0.0001, "loss": 2.0142, "step": 157 }, { "epoch": 0.10279765777488614, "grad_norm": 0.3098273277282715, "learning_rate": 0.0001, "loss": 2.2516, "step": 158 }, { "epoch": 0.10344827586206896, "grad_norm": 0.3239591717720032, "learning_rate": 0.0001, "loss": 2.2432, "step": 159 }, { "epoch": 0.10409889394925179, "grad_norm": 0.24929773807525635, "learning_rate": 0.0001, "loss": 2.2495, "step": 160 }, { "epoch": 0.10474951203643461, "grad_norm": 0.3203783929347992, "learning_rate": 0.0001, "loss": 2.68, "step": 161 }, { "epoch": 0.10540013012361743, "grad_norm": 0.38844674825668335, "learning_rate": 0.0001, "loss": 2.7457, "step": 162 }, { "epoch": 0.10605074821080027, "grad_norm": 0.21753644943237305, "learning_rate": 0.0001, "loss": 2.1284, "step": 163 }, { "epoch": 0.10670136629798309, "grad_norm": 0.20610418915748596, "learning_rate": 0.0001, "loss": 1.8377, "step": 164 }, { "epoch": 0.10735198438516591, "grad_norm": 0.3555772304534912, "learning_rate": 0.0001, "loss": 2.3599, "step": 165 }, { "epoch": 0.10800260247234873, "grad_norm": 0.3971005380153656, "learning_rate": 0.0001, "loss": 2.2771, "step": 166 }, { "epoch": 0.10865322055953155, "grad_norm": 0.28628769516944885, "learning_rate": 0.0001, "loss": 2.2438, "step": 167 }, { "epoch": 0.10930383864671438, "grad_norm": 0.38728833198547363, "learning_rate": 0.0001, "loss": 2.4103, "step": 168 }, { "epoch": 0.1099544567338972, "grad_norm": 0.26340189576148987, "learning_rate": 0.0001, "loss": 2.6832, "step": 169 }, { "epoch": 0.11060507482108002, "grad_norm": 0.20119386911392212, "learning_rate": 0.0001, "loss": 1.9622, "step": 170 }, { "epoch": 0.11125569290826284, "grad_norm": 0.2929171621799469, "learning_rate": 0.0001, "loss": 2.2762, "step": 171 }, { "epoch": 0.11190631099544568, "grad_norm": 0.422146201133728, "learning_rate": 0.0001, "loss": 2.4015, "step": 172 }, { "epoch": 0.1125569290826285, "grad_norm": 0.29050537943840027, "learning_rate": 0.0001, "loss": 2.4399, "step": 173 }, { "epoch": 0.11320754716981132, "grad_norm": 0.2646816074848175, "learning_rate": 0.0001, "loss": 2.3058, "step": 174 }, { "epoch": 0.11385816525699415, "grad_norm": 0.2643061578273773, "learning_rate": 0.0001, "loss": 2.1892, "step": 175 }, { "epoch": 0.11450878334417697, "grad_norm": 0.5878323316574097, "learning_rate": 0.0001, "loss": 3.2198, "step": 176 }, { "epoch": 0.11515940143135979, "grad_norm": 0.36881884932518005, "learning_rate": 0.0001, "loss": 2.4112, "step": 177 }, { "epoch": 0.11581001951854261, "grad_norm": 0.25198304653167725, "learning_rate": 0.0001, "loss": 2.1667, "step": 178 }, { "epoch": 0.11646063760572543, "grad_norm": 0.34164664149284363, "learning_rate": 0.0001, "loss": 2.6248, "step": 179 }, { "epoch": 0.11711125569290826, "grad_norm": 0.41471973061561584, "learning_rate": 0.0001, "loss": 2.5616, "step": 180 }, { "epoch": 0.11776187378009109, "grad_norm": 0.26372480392456055, "learning_rate": 0.0001, "loss": 2.2904, "step": 181 }, { "epoch": 0.11841249186727391, "grad_norm": 0.2271176278591156, "learning_rate": 0.0001, "loss": 2.0312, "step": 182 }, { "epoch": 0.11906310995445674, "grad_norm": 0.2106996774673462, "learning_rate": 0.0001, "loss": 1.9661, "step": 183 }, { "epoch": 0.11971372804163956, "grad_norm": 0.22870291769504547, "learning_rate": 0.0001, "loss": 1.9052, "step": 184 }, { "epoch": 0.12036434612882238, "grad_norm": 0.41253864765167236, "learning_rate": 0.0001, "loss": 2.3747, "step": 185 }, { "epoch": 0.1210149642160052, "grad_norm": 0.3258817791938782, "learning_rate": 0.0001, "loss": 2.5401, "step": 186 }, { "epoch": 0.12166558230318802, "grad_norm": 0.3461870551109314, "learning_rate": 0.0001, "loss": 2.8027, "step": 187 }, { "epoch": 0.12231620039037085, "grad_norm": 0.3704046607017517, "learning_rate": 0.0001, "loss": 2.799, "step": 188 }, { "epoch": 0.12296681847755368, "grad_norm": 0.30265969038009644, "learning_rate": 0.0001, "loss": 2.4287, "step": 189 }, { "epoch": 0.1236174365647365, "grad_norm": 0.4215582013130188, "learning_rate": 0.0001, "loss": 2.6857, "step": 190 }, { "epoch": 0.12426805465191933, "grad_norm": 0.3003520965576172, "learning_rate": 0.0001, "loss": 2.4155, "step": 191 }, { "epoch": 0.12491867273910215, "grad_norm": 0.412749320268631, "learning_rate": 0.0001, "loss": 2.6352, "step": 192 }, { "epoch": 0.12556929082628496, "grad_norm": 0.2772350013256073, "learning_rate": 0.0001, "loss": 2.2452, "step": 193 }, { "epoch": 0.1262199089134678, "grad_norm": 0.21457143127918243, "learning_rate": 0.0001, "loss": 2.0172, "step": 194 }, { "epoch": 0.12687052700065063, "grad_norm": 0.40995845198631287, "learning_rate": 0.0001, "loss": 2.6218, "step": 195 }, { "epoch": 0.12752114508783344, "grad_norm": 0.2253209501504898, "learning_rate": 0.0001, "loss": 2.2319, "step": 196 }, { "epoch": 0.12817176317501627, "grad_norm": 0.36564287543296814, "learning_rate": 0.0001, "loss": 2.4585, "step": 197 }, { "epoch": 0.12882238126219908, "grad_norm": 0.41084784269332886, "learning_rate": 0.0001, "loss": 2.6326, "step": 198 }, { "epoch": 0.12947299934938192, "grad_norm": 0.36012157797813416, "learning_rate": 0.0001, "loss": 2.0168, "step": 199 }, { "epoch": 0.13012361743656473, "grad_norm": 0.5138425230979919, "learning_rate": 0.0001, "loss": 2.3377, "step": 200 }, { "epoch": 0.13077423552374756, "grad_norm": 0.2799031436443329, "learning_rate": 0.0001, "loss": 2.532, "step": 201 }, { "epoch": 0.13142485361093037, "grad_norm": 0.3078779876232147, "learning_rate": 0.0001, "loss": 2.044, "step": 202 }, { "epoch": 0.1320754716981132, "grad_norm": 0.31270912289619446, "learning_rate": 0.0001, "loss": 1.8576, "step": 203 }, { "epoch": 0.13272608978529604, "grad_norm": 0.23117204010486603, "learning_rate": 0.0001, "loss": 2.1908, "step": 204 }, { "epoch": 0.13337670787247885, "grad_norm": 0.2531285285949707, "learning_rate": 0.0001, "loss": 2.143, "step": 205 }, { "epoch": 0.1340273259596617, "grad_norm": 0.28053218126296997, "learning_rate": 0.0001, "loss": 2.6902, "step": 206 }, { "epoch": 0.1346779440468445, "grad_norm": 0.2600589692592621, "learning_rate": 0.0001, "loss": 2.0355, "step": 207 }, { "epoch": 0.13532856213402733, "grad_norm": 0.2725912630558014, "learning_rate": 0.0001, "loss": 2.3949, "step": 208 }, { "epoch": 0.13597918022121014, "grad_norm": 0.6166338324546814, "learning_rate": 0.0001, "loss": 2.8146, "step": 209 }, { "epoch": 0.13662979830839297, "grad_norm": 0.4028575122356415, "learning_rate": 0.0001, "loss": 2.888, "step": 210 }, { "epoch": 0.1372804163955758, "grad_norm": 0.23181548714637756, "learning_rate": 0.0001, "loss": 2.1406, "step": 211 }, { "epoch": 0.13793103448275862, "grad_norm": 0.24338063597679138, "learning_rate": 0.0001, "loss": 2.1564, "step": 212 }, { "epoch": 0.13858165256994145, "grad_norm": 0.233146533370018, "learning_rate": 0.0001, "loss": 2.1695, "step": 213 }, { "epoch": 0.13923227065712426, "grad_norm": 0.21236726641654968, "learning_rate": 0.0001, "loss": 1.9272, "step": 214 }, { "epoch": 0.1398828887443071, "grad_norm": 0.25471317768096924, "learning_rate": 0.0001, "loss": 2.3447, "step": 215 }, { "epoch": 0.1405335068314899, "grad_norm": 0.35532835125923157, "learning_rate": 0.0001, "loss": 2.4328, "step": 216 }, { "epoch": 0.14118412491867274, "grad_norm": 0.32900944352149963, "learning_rate": 0.0001, "loss": 2.385, "step": 217 }, { "epoch": 0.14183474300585555, "grad_norm": 0.45404863357543945, "learning_rate": 0.0001, "loss": 2.8053, "step": 218 }, { "epoch": 0.1424853610930384, "grad_norm": 0.33968400955200195, "learning_rate": 0.0001, "loss": 2.4524, "step": 219 }, { "epoch": 0.14313597918022122, "grad_norm": 0.3250170946121216, "learning_rate": 0.0001, "loss": 2.6173, "step": 220 }, { "epoch": 0.14378659726740403, "grad_norm": 0.34765559434890747, "learning_rate": 0.0001, "loss": 2.8468, "step": 221 }, { "epoch": 0.14443721535458687, "grad_norm": 0.2274564653635025, "learning_rate": 0.0001, "loss": 2.1305, "step": 222 }, { "epoch": 0.14508783344176968, "grad_norm": 0.42719507217407227, "learning_rate": 0.0001, "loss": 2.3682, "step": 223 }, { "epoch": 0.1457384515289525, "grad_norm": 0.2848481833934784, "learning_rate": 0.0001, "loss": 2.0923, "step": 224 }, { "epoch": 0.14638906961613532, "grad_norm": 0.266548752784729, "learning_rate": 0.0001, "loss": 2.0393, "step": 225 }, { "epoch": 0.14703968770331816, "grad_norm": 0.24076099693775177, "learning_rate": 0.0001, "loss": 2.2674, "step": 226 }, { "epoch": 0.14769030579050096, "grad_norm": 0.23347622156143188, "learning_rate": 0.0001, "loss": 1.9455, "step": 227 }, { "epoch": 0.1483409238776838, "grad_norm": 0.3925648033618927, "learning_rate": 0.0001, "loss": 2.7117, "step": 228 }, { "epoch": 0.14899154196486664, "grad_norm": 0.27654924988746643, "learning_rate": 0.0001, "loss": 2.1306, "step": 229 }, { "epoch": 0.14964216005204944, "grad_norm": 0.2853853702545166, "learning_rate": 0.0001, "loss": 2.4369, "step": 230 }, { "epoch": 0.15029277813923228, "grad_norm": 0.4509859085083008, "learning_rate": 0.0001, "loss": 2.6047, "step": 231 }, { "epoch": 0.1509433962264151, "grad_norm": 0.2515909671783447, "learning_rate": 0.0001, "loss": 2.2065, "step": 232 }, { "epoch": 0.15159401431359792, "grad_norm": 0.5977367162704468, "learning_rate": 0.0001, "loss": 2.7133, "step": 233 }, { "epoch": 0.15224463240078073, "grad_norm": 0.30381399393081665, "learning_rate": 0.0001, "loss": 2.343, "step": 234 }, { "epoch": 0.15289525048796357, "grad_norm": 0.27204832434654236, "learning_rate": 0.0001, "loss": 2.2908, "step": 235 }, { "epoch": 0.15354586857514638, "grad_norm": 0.6246710419654846, "learning_rate": 0.0001, "loss": 2.7862, "step": 236 }, { "epoch": 0.1541964866623292, "grad_norm": 0.4803178012371063, "learning_rate": 0.0001, "loss": 3.4388, "step": 237 }, { "epoch": 0.15484710474951205, "grad_norm": 0.3038940727710724, "learning_rate": 0.0001, "loss": 2.7409, "step": 238 }, { "epoch": 0.15549772283669486, "grad_norm": 0.2494591474533081, "learning_rate": 0.0001, "loss": 2.2601, "step": 239 }, { "epoch": 0.1561483409238777, "grad_norm": 0.23808616399765015, "learning_rate": 0.0001, "loss": 2.1319, "step": 240 }, { "epoch": 0.1567989590110605, "grad_norm": 0.3111306130886078, "learning_rate": 0.0001, "loss": 2.7414, "step": 241 }, { "epoch": 0.15744957709824334, "grad_norm": 0.22197599709033966, "learning_rate": 0.0001, "loss": 2.1346, "step": 242 }, { "epoch": 0.15810019518542615, "grad_norm": 0.2681500315666199, "learning_rate": 0.0001, "loss": 2.3779, "step": 243 }, { "epoch": 0.15875081327260898, "grad_norm": 0.2612643241882324, "learning_rate": 0.0001, "loss": 2.5743, "step": 244 }, { "epoch": 0.1594014313597918, "grad_norm": 0.201397106051445, "learning_rate": 0.0001, "loss": 2.0312, "step": 245 }, { "epoch": 0.16005204944697463, "grad_norm": 0.25662410259246826, "learning_rate": 0.0001, "loss": 2.5085, "step": 246 }, { "epoch": 0.16070266753415746, "grad_norm": 0.21460294723510742, "learning_rate": 0.0001, "loss": 2.1099, "step": 247 }, { "epoch": 0.16135328562134027, "grad_norm": 0.19971312582492828, "learning_rate": 0.0001, "loss": 2.1024, "step": 248 }, { "epoch": 0.1620039037085231, "grad_norm": 0.1986059844493866, "learning_rate": 0.0001, "loss": 1.9306, "step": 249 }, { "epoch": 0.16265452179570591, "grad_norm": 0.21961884200572968, "learning_rate": 0.0001, "loss": 2.1218, "step": 250 }, { "epoch": 0.16330513988288875, "grad_norm": 0.20071017742156982, "learning_rate": 0.0001, "loss": 2.0581, "step": 251 }, { "epoch": 0.16395575797007156, "grad_norm": 0.32734909653663635, "learning_rate": 0.0001, "loss": 2.6229, "step": 252 }, { "epoch": 0.1646063760572544, "grad_norm": 0.21822451055049896, "learning_rate": 0.0001, "loss": 1.9954, "step": 253 }, { "epoch": 0.1652569941444372, "grad_norm": 0.3013177216053009, "learning_rate": 0.0001, "loss": 2.454, "step": 254 }, { "epoch": 0.16590761223162004, "grad_norm": 0.31199347972869873, "learning_rate": 0.0001, "loss": 2.815, "step": 255 }, { "epoch": 0.16655823031880287, "grad_norm": 0.2255464345216751, "learning_rate": 0.0001, "loss": 2.0232, "step": 256 }, { "epoch": 0.16720884840598568, "grad_norm": 0.21208804845809937, "learning_rate": 0.0001, "loss": 1.9663, "step": 257 }, { "epoch": 0.16785946649316852, "grad_norm": 0.2432132512331009, "learning_rate": 0.0001, "loss": 2.4189, "step": 258 }, { "epoch": 0.16851008458035133, "grad_norm": 0.21116623282432556, "learning_rate": 0.0001, "loss": 2.0761, "step": 259 }, { "epoch": 0.16916070266753416, "grad_norm": 0.18722975254058838, "learning_rate": 0.0001, "loss": 1.9537, "step": 260 }, { "epoch": 0.16981132075471697, "grad_norm": 0.2683362662792206, "learning_rate": 0.0001, "loss": 2.4483, "step": 261 }, { "epoch": 0.1704619388418998, "grad_norm": 0.2739648222923279, "learning_rate": 0.0001, "loss": 2.3754, "step": 262 }, { "epoch": 0.17111255692908262, "grad_norm": 0.1836375594139099, "learning_rate": 0.0001, "loss": 2.0103, "step": 263 }, { "epoch": 0.17176317501626545, "grad_norm": 0.34002602100372314, "learning_rate": 0.0001, "loss": 2.2626, "step": 264 }, { "epoch": 0.1724137931034483, "grad_norm": 0.19341516494750977, "learning_rate": 0.0001, "loss": 1.9751, "step": 265 }, { "epoch": 0.1730644111906311, "grad_norm": 0.25080743432044983, "learning_rate": 0.0001, "loss": 2.2162, "step": 266 }, { "epoch": 0.17371502927781393, "grad_norm": 0.2362661212682724, "learning_rate": 0.0001, "loss": 2.0226, "step": 267 }, { "epoch": 0.17436564736499674, "grad_norm": 0.25844064354896545, "learning_rate": 0.0001, "loss": 2.3176, "step": 268 }, { "epoch": 0.17501626545217958, "grad_norm": 0.3904498517513275, "learning_rate": 0.0001, "loss": 2.4871, "step": 269 }, { "epoch": 0.17566688353936238, "grad_norm": 0.22143317759037018, "learning_rate": 0.0001, "loss": 2.2073, "step": 270 }, { "epoch": 0.17631750162654522, "grad_norm": 0.20974211394786835, "learning_rate": 0.0001, "loss": 2.1393, "step": 271 }, { "epoch": 0.17696811971372803, "grad_norm": 0.24463056027889252, "learning_rate": 0.0001, "loss": 2.0203, "step": 272 }, { "epoch": 0.17761873780091086, "grad_norm": 0.23296399414539337, "learning_rate": 0.0001, "loss": 2.1096, "step": 273 }, { "epoch": 0.1782693558880937, "grad_norm": 0.4122619926929474, "learning_rate": 0.0001, "loss": 3.1512, "step": 274 }, { "epoch": 0.1789199739752765, "grad_norm": 0.2744470536708832, "learning_rate": 0.0001, "loss": 2.2211, "step": 275 }, { "epoch": 0.17957059206245934, "grad_norm": 0.21010619401931763, "learning_rate": 0.0001, "loss": 2.2203, "step": 276 }, { "epoch": 0.18022121014964215, "grad_norm": 0.27855056524276733, "learning_rate": 0.0001, "loss": 2.2903, "step": 277 }, { "epoch": 0.180871828236825, "grad_norm": 0.2909989058971405, "learning_rate": 0.0001, "loss": 2.237, "step": 278 }, { "epoch": 0.1815224463240078, "grad_norm": 0.21754448115825653, "learning_rate": 0.0001, "loss": 2.0138, "step": 279 }, { "epoch": 0.18217306441119063, "grad_norm": 0.35209745168685913, "learning_rate": 0.0001, "loss": 2.652, "step": 280 }, { "epoch": 0.18282368249837344, "grad_norm": 0.29994750022888184, "learning_rate": 0.0001, "loss": 2.1868, "step": 281 }, { "epoch": 0.18347430058555628, "grad_norm": 0.2645902633666992, "learning_rate": 0.0001, "loss": 2.2925, "step": 282 }, { "epoch": 0.1841249186727391, "grad_norm": 0.3492202162742615, "learning_rate": 0.0001, "loss": 2.4176, "step": 283 }, { "epoch": 0.18477553675992192, "grad_norm": 0.256651371717453, "learning_rate": 0.0001, "loss": 2.3414, "step": 284 }, { "epoch": 0.18542615484710476, "grad_norm": 0.23287786543369293, "learning_rate": 0.0001, "loss": 2.5488, "step": 285 }, { "epoch": 0.18607677293428757, "grad_norm": 0.26059290766716003, "learning_rate": 0.0001, "loss": 2.4551, "step": 286 }, { "epoch": 0.1867273910214704, "grad_norm": 0.2482365071773529, "learning_rate": 0.0001, "loss": 2.0818, "step": 287 }, { "epoch": 0.1873780091086532, "grad_norm": 0.23024773597717285, "learning_rate": 0.0001, "loss": 2.2592, "step": 288 }, { "epoch": 0.18802862719583605, "grad_norm": 0.2590011656284332, "learning_rate": 0.0001, "loss": 2.4177, "step": 289 }, { "epoch": 0.18867924528301888, "grad_norm": 0.19760870933532715, "learning_rate": 0.0001, "loss": 2.0731, "step": 290 }, { "epoch": 0.1893298633702017, "grad_norm": 0.20266428589820862, "learning_rate": 0.0001, "loss": 2.1221, "step": 291 }, { "epoch": 0.18998048145738453, "grad_norm": 0.20199884474277496, "learning_rate": 0.0001, "loss": 2.0489, "step": 292 }, { "epoch": 0.19063109954456733, "grad_norm": 0.23876360058784485, "learning_rate": 0.0001, "loss": 2.1392, "step": 293 }, { "epoch": 0.19128171763175017, "grad_norm": 0.23555997014045715, "learning_rate": 0.0001, "loss": 2.4116, "step": 294 }, { "epoch": 0.19193233571893298, "grad_norm": 0.5010725259780884, "learning_rate": 0.0001, "loss": 2.7444, "step": 295 }, { "epoch": 0.19258295380611581, "grad_norm": 0.37809622287750244, "learning_rate": 0.0001, "loss": 2.2635, "step": 296 }, { "epoch": 0.19323357189329862, "grad_norm": 0.499888151884079, "learning_rate": 0.0001, "loss": 2.1984, "step": 297 }, { "epoch": 0.19388418998048146, "grad_norm": 0.43810585141181946, "learning_rate": 0.0001, "loss": 3.084, "step": 298 }, { "epoch": 0.1945348080676643, "grad_norm": 0.35633769631385803, "learning_rate": 0.0001, "loss": 2.0351, "step": 299 }, { "epoch": 0.1951854261548471, "grad_norm": 0.3693079650402069, "learning_rate": 0.0001, "loss": 1.9525, "step": 300 }, { "epoch": 0.19583604424202994, "grad_norm": 0.36550503969192505, "learning_rate": 0.0001, "loss": 2.2469, "step": 301 }, { "epoch": 0.19648666232921275, "grad_norm": 0.2579827308654785, "learning_rate": 0.0001, "loss": 2.3585, "step": 302 }, { "epoch": 0.19713728041639558, "grad_norm": 0.2603841722011566, "learning_rate": 0.0001, "loss": 2.3959, "step": 303 }, { "epoch": 0.1977878985035784, "grad_norm": 0.33103683590888977, "learning_rate": 0.0001, "loss": 2.2197, "step": 304 }, { "epoch": 0.19843851659076123, "grad_norm": 0.2977697551250458, "learning_rate": 0.0001, "loss": 2.2569, "step": 305 }, { "epoch": 0.19908913467794404, "grad_norm": 0.2085130512714386, "learning_rate": 0.0001, "loss": 2.2284, "step": 306 }, { "epoch": 0.19973975276512687, "grad_norm": 0.409212201833725, "learning_rate": 0.0001, "loss": 2.7014, "step": 307 }, { "epoch": 0.2003903708523097, "grad_norm": 0.2447553277015686, "learning_rate": 0.0001, "loss": 2.2826, "step": 308 }, { "epoch": 0.20104098893949252, "grad_norm": 0.21881726384162903, "learning_rate": 0.0001, "loss": 1.8573, "step": 309 }, { "epoch": 0.20169160702667535, "grad_norm": 0.24484936892986298, "learning_rate": 0.0001, "loss": 2.318, "step": 310 }, { "epoch": 0.20234222511385816, "grad_norm": 0.3251173198223114, "learning_rate": 0.0001, "loss": 2.3346, "step": 311 }, { "epoch": 0.202992843201041, "grad_norm": 0.22313712537288666, "learning_rate": 0.0001, "loss": 1.9119, "step": 312 }, { "epoch": 0.2036434612882238, "grad_norm": 0.3086949288845062, "learning_rate": 0.0001, "loss": 2.1809, "step": 313 }, { "epoch": 0.20429407937540664, "grad_norm": 0.28272122144699097, "learning_rate": 0.0001, "loss": 2.3335, "step": 314 }, { "epoch": 0.20494469746258945, "grad_norm": 0.208637535572052, "learning_rate": 0.0001, "loss": 2.1947, "step": 315 }, { "epoch": 0.20559531554977228, "grad_norm": 0.2913041114807129, "learning_rate": 0.0001, "loss": 2.3009, "step": 316 }, { "epoch": 0.20624593363695512, "grad_norm": 0.2813785970211029, "learning_rate": 0.0001, "loss": 2.0133, "step": 317 }, { "epoch": 0.20689655172413793, "grad_norm": 0.2324337363243103, "learning_rate": 0.0001, "loss": 2.0827, "step": 318 }, { "epoch": 0.20754716981132076, "grad_norm": 0.25195491313934326, "learning_rate": 0.0001, "loss": 2.5201, "step": 319 }, { "epoch": 0.20819778789850357, "grad_norm": 0.3435034453868866, "learning_rate": 0.0001, "loss": 2.321, "step": 320 }, { "epoch": 0.2088484059856864, "grad_norm": 0.2735581696033478, "learning_rate": 0.0001, "loss": 2.2218, "step": 321 }, { "epoch": 0.20949902407286922, "grad_norm": 0.2250661551952362, "learning_rate": 0.0001, "loss": 1.9416, "step": 322 }, { "epoch": 0.21014964216005205, "grad_norm": 0.3160262107849121, "learning_rate": 0.0001, "loss": 2.5494, "step": 323 }, { "epoch": 0.21080026024723486, "grad_norm": 0.3669279217720032, "learning_rate": 0.0001, "loss": 2.7751, "step": 324 }, { "epoch": 0.2114508783344177, "grad_norm": 0.2052752673625946, "learning_rate": 0.0001, "loss": 2.0139, "step": 325 }, { "epoch": 0.21210149642160053, "grad_norm": 0.2906612455844879, "learning_rate": 0.0001, "loss": 2.227, "step": 326 }, { "epoch": 0.21275211450878334, "grad_norm": 0.30327048897743225, "learning_rate": 0.0001, "loss": 2.2905, "step": 327 }, { "epoch": 0.21340273259596618, "grad_norm": 0.33950623869895935, "learning_rate": 0.0001, "loss": 3.0731, "step": 328 }, { "epoch": 0.21405335068314899, "grad_norm": 0.31319788098335266, "learning_rate": 0.0001, "loss": 2.1374, "step": 329 }, { "epoch": 0.21470396877033182, "grad_norm": 0.21442054212093353, "learning_rate": 0.0001, "loss": 1.7588, "step": 330 }, { "epoch": 0.21535458685751463, "grad_norm": 0.23125174641609192, "learning_rate": 0.0001, "loss": 1.9295, "step": 331 }, { "epoch": 0.21600520494469747, "grad_norm": 0.23220308125019073, "learning_rate": 0.0001, "loss": 2.2606, "step": 332 }, { "epoch": 0.21665582303188027, "grad_norm": 0.24599219858646393, "learning_rate": 0.0001, "loss": 2.2687, "step": 333 }, { "epoch": 0.2173064411190631, "grad_norm": 0.22226236760616302, "learning_rate": 0.0001, "loss": 2.1428, "step": 334 }, { "epoch": 0.21795705920624595, "grad_norm": 0.2653510570526123, "learning_rate": 0.0001, "loss": 2.4381, "step": 335 }, { "epoch": 0.21860767729342875, "grad_norm": 0.23770929872989655, "learning_rate": 0.0001, "loss": 1.9655, "step": 336 }, { "epoch": 0.2192582953806116, "grad_norm": 0.1932332068681717, "learning_rate": 0.0001, "loss": 1.9465, "step": 337 }, { "epoch": 0.2199089134677944, "grad_norm": 0.181661456823349, "learning_rate": 0.0001, "loss": 1.9912, "step": 338 }, { "epoch": 0.22055953155497723, "grad_norm": 0.22275297343730927, "learning_rate": 0.0001, "loss": 2.1964, "step": 339 }, { "epoch": 0.22121014964216004, "grad_norm": 0.22086840867996216, "learning_rate": 0.0001, "loss": 2.2216, "step": 340 }, { "epoch": 0.22186076772934288, "grad_norm": 0.22807130217552185, "learning_rate": 0.0001, "loss": 2.2434, "step": 341 }, { "epoch": 0.2225113858165257, "grad_norm": 0.26616647839546204, "learning_rate": 0.0001, "loss": 2.442, "step": 342 }, { "epoch": 0.22316200390370852, "grad_norm": 0.2841719388961792, "learning_rate": 0.0001, "loss": 2.2358, "step": 343 }, { "epoch": 0.22381262199089136, "grad_norm": 0.23251943290233612, "learning_rate": 0.0001, "loss": 2.3436, "step": 344 }, { "epoch": 0.22446324007807417, "grad_norm": 0.20406994223594666, "learning_rate": 0.0001, "loss": 2.101, "step": 345 }, { "epoch": 0.225113858165257, "grad_norm": 0.18677304685115814, "learning_rate": 0.0001, "loss": 2.0596, "step": 346 }, { "epoch": 0.2257644762524398, "grad_norm": 0.22367873787879944, "learning_rate": 0.0001, "loss": 2.2051, "step": 347 }, { "epoch": 0.22641509433962265, "grad_norm": 0.2521246671676636, "learning_rate": 0.0001, "loss": 2.1718, "step": 348 }, { "epoch": 0.22706571242680545, "grad_norm": 0.23043319582939148, "learning_rate": 0.0001, "loss": 2.2818, "step": 349 }, { "epoch": 0.2277163305139883, "grad_norm": 0.22021251916885376, "learning_rate": 0.0001, "loss": 2.0337, "step": 350 }, { "epoch": 0.2283669486011711, "grad_norm": 0.18043603003025055, "learning_rate": 0.0001, "loss": 1.9434, "step": 351 }, { "epoch": 0.22901756668835394, "grad_norm": 0.4757142961025238, "learning_rate": 0.0001, "loss": 2.2467, "step": 352 }, { "epoch": 0.22966818477553677, "grad_norm": 0.30740290880203247, "learning_rate": 0.0001, "loss": 2.5296, "step": 353 }, { "epoch": 0.23031880286271958, "grad_norm": 0.23037666082382202, "learning_rate": 0.0001, "loss": 2.311, "step": 354 }, { "epoch": 0.23096942094990242, "grad_norm": 0.22314564883708954, "learning_rate": 0.0001, "loss": 2.0494, "step": 355 }, { "epoch": 0.23162003903708522, "grad_norm": 0.21417242288589478, "learning_rate": 0.0001, "loss": 2.2459, "step": 356 }, { "epoch": 0.23227065712426806, "grad_norm": 0.2895831763744354, "learning_rate": 0.0001, "loss": 2.2705, "step": 357 }, { "epoch": 0.23292127521145087, "grad_norm": 0.2110838145017624, "learning_rate": 0.0001, "loss": 2.1175, "step": 358 }, { "epoch": 0.2335718932986337, "grad_norm": 0.3999682664871216, "learning_rate": 0.0001, "loss": 2.6891, "step": 359 }, { "epoch": 0.2342225113858165, "grad_norm": 0.5169201493263245, "learning_rate": 0.0001, "loss": 2.5764, "step": 360 }, { "epoch": 0.23487312947299935, "grad_norm": 0.24382548034191132, "learning_rate": 0.0001, "loss": 2.1065, "step": 361 }, { "epoch": 0.23552374756018218, "grad_norm": 0.2830081582069397, "learning_rate": 0.0001, "loss": 2.1186, "step": 362 }, { "epoch": 0.236174365647365, "grad_norm": 0.23680554330348969, "learning_rate": 0.0001, "loss": 2.118, "step": 363 }, { "epoch": 0.23682498373454783, "grad_norm": 0.3790690302848816, "learning_rate": 0.0001, "loss": 2.3566, "step": 364 }, { "epoch": 0.23747560182173064, "grad_norm": 0.2664685845375061, "learning_rate": 0.0001, "loss": 2.2118, "step": 365 }, { "epoch": 0.23812621990891347, "grad_norm": 0.22439126670360565, "learning_rate": 0.0001, "loss": 2.0897, "step": 366 }, { "epoch": 0.23877683799609628, "grad_norm": 0.2559892237186432, "learning_rate": 0.0001, "loss": 2.2559, "step": 367 }, { "epoch": 0.23942745608327912, "grad_norm": 0.43989577889442444, "learning_rate": 0.0001, "loss": 2.5208, "step": 368 }, { "epoch": 0.24007807417046195, "grad_norm": 0.24543894827365875, "learning_rate": 0.0001, "loss": 2.1692, "step": 369 }, { "epoch": 0.24072869225764476, "grad_norm": 0.37020954489707947, "learning_rate": 0.0001, "loss": 2.1287, "step": 370 }, { "epoch": 0.2413793103448276, "grad_norm": 0.41815564036369324, "learning_rate": 0.0001, "loss": 2.5952, "step": 371 }, { "epoch": 0.2420299284320104, "grad_norm": 0.22579136490821838, "learning_rate": 0.0001, "loss": 2.2427, "step": 372 }, { "epoch": 0.24268054651919324, "grad_norm": 0.3004798889160156, "learning_rate": 0.0001, "loss": 2.2767, "step": 373 }, { "epoch": 0.24333116460637605, "grad_norm": 0.27470141649246216, "learning_rate": 0.0001, "loss": 2.092, "step": 374 }, { "epoch": 0.24398178269355889, "grad_norm": 0.25301867723464966, "learning_rate": 0.0001, "loss": 2.1816, "step": 375 }, { "epoch": 0.2446324007807417, "grad_norm": 0.21194620430469513, "learning_rate": 0.0001, "loss": 2.1322, "step": 376 }, { "epoch": 0.24528301886792453, "grad_norm": 0.28737103939056396, "learning_rate": 0.0001, "loss": 2.6685, "step": 377 }, { "epoch": 0.24593363695510737, "grad_norm": 0.28857922554016113, "learning_rate": 0.0001, "loss": 2.2219, "step": 378 }, { "epoch": 0.24658425504229017, "grad_norm": 0.29493409395217896, "learning_rate": 0.0001, "loss": 2.717, "step": 379 }, { "epoch": 0.247234873129473, "grad_norm": 0.33975929021835327, "learning_rate": 0.0001, "loss": 2.3499, "step": 380 }, { "epoch": 0.24788549121665582, "grad_norm": 0.21486152708530426, "learning_rate": 0.0001, "loss": 2.306, "step": 381 }, { "epoch": 0.24853610930383865, "grad_norm": 0.2686431109905243, "learning_rate": 0.0001, "loss": 2.0942, "step": 382 }, { "epoch": 0.24918672739102146, "grad_norm": 0.2812007963657379, "learning_rate": 0.0001, "loss": 2.3729, "step": 383 }, { "epoch": 0.2498373454782043, "grad_norm": 0.31875330209732056, "learning_rate": 0.0001, "loss": 2.5766, "step": 384 }, { "epoch": 0.2504879635653871, "grad_norm": 0.2624376714229584, "learning_rate": 0.0001, "loss": 2.2057, "step": 385 }, { "epoch": 0.2511385816525699, "grad_norm": 0.265286386013031, "learning_rate": 0.0001, "loss": 2.2405, "step": 386 }, { "epoch": 0.2517891997397528, "grad_norm": 0.3202246129512787, "learning_rate": 0.0001, "loss": 2.2817, "step": 387 }, { "epoch": 0.2524398178269356, "grad_norm": 0.22770161926746368, "learning_rate": 0.0001, "loss": 1.9564, "step": 388 }, { "epoch": 0.2530904359141184, "grad_norm": 0.3313138484954834, "learning_rate": 0.0001, "loss": 2.4424, "step": 389 }, { "epoch": 0.25374105400130126, "grad_norm": 0.2961839437484741, "learning_rate": 0.0001, "loss": 2.4122, "step": 390 }, { "epoch": 0.25439167208848407, "grad_norm": 0.24270308017730713, "learning_rate": 0.0001, "loss": 1.99, "step": 391 }, { "epoch": 0.2550422901756669, "grad_norm": 0.2306670844554901, "learning_rate": 0.0001, "loss": 2.3529, "step": 392 }, { "epoch": 0.2556929082628497, "grad_norm": 0.28387176990509033, "learning_rate": 0.0001, "loss": 2.0824, "step": 393 }, { "epoch": 0.25634352635003255, "grad_norm": 0.3105824291706085, "learning_rate": 0.0001, "loss": 2.437, "step": 394 }, { "epoch": 0.25699414443721535, "grad_norm": 0.1932361125946045, "learning_rate": 0.0001, "loss": 1.9747, "step": 395 }, { "epoch": 0.25764476252439816, "grad_norm": 0.31146278977394104, "learning_rate": 0.0001, "loss": 2.263, "step": 396 }, { "epoch": 0.258295380611581, "grad_norm": 0.24420365691184998, "learning_rate": 0.0001, "loss": 2.015, "step": 397 }, { "epoch": 0.25894599869876384, "grad_norm": 0.24144989252090454, "learning_rate": 0.0001, "loss": 2.2536, "step": 398 }, { "epoch": 0.25959661678594664, "grad_norm": 0.3478517532348633, "learning_rate": 0.0001, "loss": 2.5835, "step": 399 }, { "epoch": 0.26024723487312945, "grad_norm": 0.24381348490715027, "learning_rate": 0.0001, "loss": 2.2439, "step": 400 }, { "epoch": 0.2608978529603123, "grad_norm": 0.2834983468055725, "learning_rate": 0.0001, "loss": 2.3991, "step": 401 }, { "epoch": 0.2615484710474951, "grad_norm": 0.28689858317375183, "learning_rate": 0.0001, "loss": 1.9156, "step": 402 }, { "epoch": 0.26219908913467793, "grad_norm": 0.23692357540130615, "learning_rate": 0.0001, "loss": 2.0189, "step": 403 }, { "epoch": 0.26284970722186074, "grad_norm": 0.30104926228523254, "learning_rate": 0.0001, "loss": 2.4945, "step": 404 }, { "epoch": 0.2635003253090436, "grad_norm": 0.23472270369529724, "learning_rate": 0.0001, "loss": 1.8892, "step": 405 }, { "epoch": 0.2641509433962264, "grad_norm": 0.31508034467697144, "learning_rate": 0.0001, "loss": 2.4935, "step": 406 }, { "epoch": 0.2648015614834092, "grad_norm": 0.25103551149368286, "learning_rate": 0.0001, "loss": 2.4428, "step": 407 }, { "epoch": 0.2654521795705921, "grad_norm": 0.2387259602546692, "learning_rate": 0.0001, "loss": 2.0989, "step": 408 }, { "epoch": 0.2661027976577749, "grad_norm": 0.2606028616428375, "learning_rate": 0.0001, "loss": 1.9494, "step": 409 }, { "epoch": 0.2667534157449577, "grad_norm": 0.25114724040031433, "learning_rate": 0.0001, "loss": 2.2432, "step": 410 }, { "epoch": 0.2674040338321405, "grad_norm": 0.3072582483291626, "learning_rate": 0.0001, "loss": 2.3506, "step": 411 }, { "epoch": 0.2680546519193234, "grad_norm": 0.23917561769485474, "learning_rate": 0.0001, "loss": 2.2665, "step": 412 }, { "epoch": 0.2687052700065062, "grad_norm": 0.2120814174413681, "learning_rate": 0.0001, "loss": 1.9625, "step": 413 }, { "epoch": 0.269355888093689, "grad_norm": 0.22003813087940216, "learning_rate": 0.0001, "loss": 2.1179, "step": 414 }, { "epoch": 0.27000650618087185, "grad_norm": 0.33217060565948486, "learning_rate": 0.0001, "loss": 2.6353, "step": 415 }, { "epoch": 0.27065712426805466, "grad_norm": 0.2260630577802658, "learning_rate": 0.0001, "loss": 2.0355, "step": 416 }, { "epoch": 0.27130774235523747, "grad_norm": 0.30081093311309814, "learning_rate": 0.0001, "loss": 2.1825, "step": 417 }, { "epoch": 0.2719583604424203, "grad_norm": 0.27275893092155457, "learning_rate": 0.0001, "loss": 2.6183, "step": 418 }, { "epoch": 0.27260897852960314, "grad_norm": 0.4902358651161194, "learning_rate": 0.0001, "loss": 3.0888, "step": 419 }, { "epoch": 0.27325959661678595, "grad_norm": 0.21213112771511078, "learning_rate": 0.0001, "loss": 2.1172, "step": 420 }, { "epoch": 0.27391021470396876, "grad_norm": 0.35953450202941895, "learning_rate": 0.0001, "loss": 2.5109, "step": 421 }, { "epoch": 0.2745608327911516, "grad_norm": 0.2081584334373474, "learning_rate": 0.0001, "loss": 2.0894, "step": 422 }, { "epoch": 0.27521145087833443, "grad_norm": 0.20892906188964844, "learning_rate": 0.0001, "loss": 1.9643, "step": 423 }, { "epoch": 0.27586206896551724, "grad_norm": 0.30058735609054565, "learning_rate": 0.0001, "loss": 2.6503, "step": 424 }, { "epoch": 0.27651268705270005, "grad_norm": 0.32902124524116516, "learning_rate": 0.0001, "loss": 2.3271, "step": 425 }, { "epoch": 0.2771633051398829, "grad_norm": 0.2003614902496338, "learning_rate": 0.0001, "loss": 1.9881, "step": 426 }, { "epoch": 0.2778139232270657, "grad_norm": 0.33349111676216125, "learning_rate": 0.0001, "loss": 2.7625, "step": 427 }, { "epoch": 0.2784645413142485, "grad_norm": 0.25051257014274597, "learning_rate": 0.0001, "loss": 2.0825, "step": 428 }, { "epoch": 0.27911515940143133, "grad_norm": 0.3301559388637543, "learning_rate": 0.0001, "loss": 2.85, "step": 429 }, { "epoch": 0.2797657774886142, "grad_norm": 0.18224254250526428, "learning_rate": 0.0001, "loss": 1.9687, "step": 430 }, { "epoch": 0.280416395575797, "grad_norm": 0.21809989213943481, "learning_rate": 0.0001, "loss": 2.2596, "step": 431 }, { "epoch": 0.2810670136629798, "grad_norm": 0.2473779171705246, "learning_rate": 0.0001, "loss": 2.2042, "step": 432 }, { "epoch": 0.2817176317501627, "grad_norm": 0.20744885504245758, "learning_rate": 0.0001, "loss": 2.1546, "step": 433 }, { "epoch": 0.2823682498373455, "grad_norm": 0.2620698809623718, "learning_rate": 0.0001, "loss": 2.5195, "step": 434 }, { "epoch": 0.2830188679245283, "grad_norm": 0.291421115398407, "learning_rate": 0.0001, "loss": 2.4983, "step": 435 }, { "epoch": 0.2836694860117111, "grad_norm": 0.3294708728790283, "learning_rate": 0.0001, "loss": 2.3146, "step": 436 }, { "epoch": 0.28432010409889397, "grad_norm": 0.26191362738609314, "learning_rate": 0.0001, "loss": 2.2818, "step": 437 }, { "epoch": 0.2849707221860768, "grad_norm": 0.29155483841896057, "learning_rate": 0.0001, "loss": 2.4888, "step": 438 }, { "epoch": 0.2856213402732596, "grad_norm": 0.19482360780239105, "learning_rate": 0.0001, "loss": 2.0061, "step": 439 }, { "epoch": 0.28627195836044245, "grad_norm": 0.2594612240791321, "learning_rate": 0.0001, "loss": 2.1891, "step": 440 }, { "epoch": 0.28692257644762525, "grad_norm": 0.21656309068202972, "learning_rate": 0.0001, "loss": 1.7911, "step": 441 }, { "epoch": 0.28757319453480806, "grad_norm": 0.18664829432964325, "learning_rate": 0.0001, "loss": 1.9634, "step": 442 }, { "epoch": 0.28822381262199087, "grad_norm": 0.2178332507610321, "learning_rate": 0.0001, "loss": 2.32, "step": 443 }, { "epoch": 0.28887443070917374, "grad_norm": 0.351418673992157, "learning_rate": 0.0001, "loss": 3.0873, "step": 444 }, { "epoch": 0.28952504879635654, "grad_norm": 0.23604457080364227, "learning_rate": 0.0001, "loss": 2.46, "step": 445 }, { "epoch": 0.29017566688353935, "grad_norm": 0.2599848806858063, "learning_rate": 0.0001, "loss": 2.0207, "step": 446 }, { "epoch": 0.29082628497072216, "grad_norm": 0.340314120054245, "learning_rate": 0.0001, "loss": 2.279, "step": 447 }, { "epoch": 0.291476903057905, "grad_norm": 0.23228399455547333, "learning_rate": 0.0001, "loss": 2.3561, "step": 448 }, { "epoch": 0.29212752114508783, "grad_norm": 0.25504687428474426, "learning_rate": 0.0001, "loss": 2.2251, "step": 449 }, { "epoch": 0.29277813923227064, "grad_norm": 0.2465014010667801, "learning_rate": 0.0001, "loss": 2.1031, "step": 450 }, { "epoch": 0.2934287573194535, "grad_norm": 0.2188328504562378, "learning_rate": 0.0001, "loss": 2.1483, "step": 451 }, { "epoch": 0.2940793754066363, "grad_norm": 0.24546551704406738, "learning_rate": 0.0001, "loss": 2.2334, "step": 452 }, { "epoch": 0.2947299934938191, "grad_norm": 0.23416215181350708, "learning_rate": 0.0001, "loss": 2.1846, "step": 453 }, { "epoch": 0.29538061158100193, "grad_norm": 0.25267231464385986, "learning_rate": 0.0001, "loss": 2.2134, "step": 454 }, { "epoch": 0.2960312296681848, "grad_norm": 0.26632416248321533, "learning_rate": 0.0001, "loss": 2.5012, "step": 455 }, { "epoch": 0.2966818477553676, "grad_norm": 0.18289139866828918, "learning_rate": 0.0001, "loss": 2.0524, "step": 456 }, { "epoch": 0.2973324658425504, "grad_norm": 0.19033563137054443, "learning_rate": 0.0001, "loss": 2.0165, "step": 457 }, { "epoch": 0.2979830839297333, "grad_norm": 0.200730562210083, "learning_rate": 0.0001, "loss": 1.8021, "step": 458 }, { "epoch": 0.2986337020169161, "grad_norm": 0.2109062522649765, "learning_rate": 0.0001, "loss": 2.0655, "step": 459 }, { "epoch": 0.2992843201040989, "grad_norm": 0.23461318016052246, "learning_rate": 0.0001, "loss": 2.3335, "step": 460 }, { "epoch": 0.2999349381912817, "grad_norm": 0.2085726112127304, "learning_rate": 0.0001, "loss": 2.0061, "step": 461 }, { "epoch": 0.30058555627846456, "grad_norm": 0.2938329875469208, "learning_rate": 0.0001, "loss": 2.5245, "step": 462 }, { "epoch": 0.30123617436564737, "grad_norm": 0.22131232917308807, "learning_rate": 0.0001, "loss": 2.4115, "step": 463 }, { "epoch": 0.3018867924528302, "grad_norm": 0.3459152579307556, "learning_rate": 0.0001, "loss": 2.3896, "step": 464 }, { "epoch": 0.302537410540013, "grad_norm": 0.27464184165000916, "learning_rate": 0.0001, "loss": 2.6592, "step": 465 }, { "epoch": 0.30318802862719585, "grad_norm": 0.28379327058792114, "learning_rate": 0.0001, "loss": 2.1453, "step": 466 }, { "epoch": 0.30383864671437866, "grad_norm": 0.28283926844596863, "learning_rate": 0.0001, "loss": 2.1704, "step": 467 }, { "epoch": 0.30448926480156147, "grad_norm": 0.22243599593639374, "learning_rate": 0.0001, "loss": 2.1175, "step": 468 }, { "epoch": 0.30513988288874433, "grad_norm": 0.22331124544143677, "learning_rate": 0.0001, "loss": 1.8857, "step": 469 }, { "epoch": 0.30579050097592714, "grad_norm": 0.21995989978313446, "learning_rate": 0.0001, "loss": 2.1316, "step": 470 }, { "epoch": 0.30644111906310995, "grad_norm": 0.21140341460704803, "learning_rate": 0.0001, "loss": 2.0742, "step": 471 }, { "epoch": 0.30709173715029275, "grad_norm": 0.31053757667541504, "learning_rate": 0.0001, "loss": 2.615, "step": 472 }, { "epoch": 0.3077423552374756, "grad_norm": 0.2768484354019165, "learning_rate": 0.0001, "loss": 2.713, "step": 473 }, { "epoch": 0.3083929733246584, "grad_norm": 0.2538318336009979, "learning_rate": 0.0001, "loss": 2.1917, "step": 474 }, { "epoch": 0.30904359141184123, "grad_norm": 0.2105240672826767, "learning_rate": 0.0001, "loss": 2.2741, "step": 475 }, { "epoch": 0.3096942094990241, "grad_norm": 0.2915903925895691, "learning_rate": 0.0001, "loss": 2.115, "step": 476 }, { "epoch": 0.3103448275862069, "grad_norm": 0.30282047390937805, "learning_rate": 0.0001, "loss": 2.7806, "step": 477 }, { "epoch": 0.3109954456733897, "grad_norm": 0.2707601487636566, "learning_rate": 0.0001, "loss": 2.6137, "step": 478 }, { "epoch": 0.3116460637605725, "grad_norm": 0.34574300050735474, "learning_rate": 0.0001, "loss": 2.5957, "step": 479 }, { "epoch": 0.3122966818477554, "grad_norm": 0.22767509520053864, "learning_rate": 0.0001, "loss": 2.3543, "step": 480 }, { "epoch": 0.3129472999349382, "grad_norm": 0.25194215774536133, "learning_rate": 0.0001, "loss": 2.6586, "step": 481 }, { "epoch": 0.313597918022121, "grad_norm": 0.20427219569683075, "learning_rate": 0.0001, "loss": 1.9091, "step": 482 }, { "epoch": 0.3142485361093038, "grad_norm": 0.2993704378604889, "learning_rate": 0.0001, "loss": 2.4704, "step": 483 }, { "epoch": 0.3148991541964867, "grad_norm": 0.18951758742332458, "learning_rate": 0.0001, "loss": 2.1108, "step": 484 }, { "epoch": 0.3155497722836695, "grad_norm": 0.2622709572315216, "learning_rate": 0.0001, "loss": 2.4144, "step": 485 }, { "epoch": 0.3162003903708523, "grad_norm": 0.20735126733779907, "learning_rate": 0.0001, "loss": 2.3065, "step": 486 }, { "epoch": 0.31685100845803515, "grad_norm": 0.22782085835933685, "learning_rate": 0.0001, "loss": 2.4377, "step": 487 }, { "epoch": 0.31750162654521796, "grad_norm": 0.2568935453891754, "learning_rate": 0.0001, "loss": 2.1199, "step": 488 }, { "epoch": 0.31815224463240077, "grad_norm": 0.23917409777641296, "learning_rate": 0.0001, "loss": 2.2457, "step": 489 }, { "epoch": 0.3188028627195836, "grad_norm": 0.21531902253627777, "learning_rate": 0.0001, "loss": 2.0489, "step": 490 }, { "epoch": 0.31945348080676644, "grad_norm": 0.21461109817028046, "learning_rate": 0.0001, "loss": 2.1915, "step": 491 }, { "epoch": 0.32010409889394925, "grad_norm": 0.2458680123090744, "learning_rate": 0.0001, "loss": 2.3939, "step": 492 }, { "epoch": 0.32075471698113206, "grad_norm": 0.2617323696613312, "learning_rate": 0.0001, "loss": 2.5611, "step": 493 }, { "epoch": 0.3214053350683149, "grad_norm": 0.22562618553638458, "learning_rate": 0.0001, "loss": 2.2703, "step": 494 }, { "epoch": 0.32205595315549773, "grad_norm": 0.2290688008069992, "learning_rate": 0.0001, "loss": 2.3049, "step": 495 }, { "epoch": 0.32270657124268054, "grad_norm": 0.4118833541870117, "learning_rate": 0.0001, "loss": 2.9194, "step": 496 }, { "epoch": 0.32335718932986335, "grad_norm": 0.22502999007701874, "learning_rate": 0.0001, "loss": 2.2362, "step": 497 }, { "epoch": 0.3240078074170462, "grad_norm": 0.23599191009998322, "learning_rate": 0.0001, "loss": 2.35, "step": 498 }, { "epoch": 0.324658425504229, "grad_norm": 0.3065047860145569, "learning_rate": 0.0001, "loss": 2.3984, "step": 499 }, { "epoch": 0.32530904359141183, "grad_norm": 0.19241982698440552, "learning_rate": 0.0001, "loss": 1.8787, "step": 500 }, { "epoch": 0.3259596616785947, "grad_norm": 0.20695632696151733, "learning_rate": 0.0001, "loss": 1.9397, "step": 501 }, { "epoch": 0.3266102797657775, "grad_norm": 0.1998564749956131, "learning_rate": 0.0001, "loss": 2.1463, "step": 502 }, { "epoch": 0.3272608978529603, "grad_norm": 0.27775317430496216, "learning_rate": 0.0001, "loss": 2.7956, "step": 503 }, { "epoch": 0.3279115159401431, "grad_norm": 0.2393936961889267, "learning_rate": 0.0001, "loss": 2.3785, "step": 504 }, { "epoch": 0.328562134027326, "grad_norm": 0.20921163260936737, "learning_rate": 0.0001, "loss": 2.1909, "step": 505 }, { "epoch": 0.3292127521145088, "grad_norm": 0.25875911116600037, "learning_rate": 0.0001, "loss": 2.129, "step": 506 }, { "epoch": 0.3298633702016916, "grad_norm": 0.2382909208536148, "learning_rate": 0.0001, "loss": 2.3786, "step": 507 }, { "epoch": 0.3305139882888744, "grad_norm": 0.19657136499881744, "learning_rate": 0.0001, "loss": 1.951, "step": 508 }, { "epoch": 0.33116460637605727, "grad_norm": 0.23688004910945892, "learning_rate": 0.0001, "loss": 2.4348, "step": 509 }, { "epoch": 0.3318152244632401, "grad_norm": 0.1988734006881714, "learning_rate": 0.0001, "loss": 2.2352, "step": 510 }, { "epoch": 0.3324658425504229, "grad_norm": 0.2078763097524643, "learning_rate": 0.0001, "loss": 2.1376, "step": 511 }, { "epoch": 0.33311646063760575, "grad_norm": 0.18860888481140137, "learning_rate": 0.0001, "loss": 1.9367, "step": 512 }, { "epoch": 0.33376707872478856, "grad_norm": 0.30205249786376953, "learning_rate": 0.0001, "loss": 2.6822, "step": 513 }, { "epoch": 0.33441769681197137, "grad_norm": 0.2146618664264679, "learning_rate": 0.0001, "loss": 2.1927, "step": 514 }, { "epoch": 0.3350683148991542, "grad_norm": 0.19332504272460938, "learning_rate": 0.0001, "loss": 2.0442, "step": 515 }, { "epoch": 0.33571893298633704, "grad_norm": 0.2289431244134903, "learning_rate": 0.0001, "loss": 2.0152, "step": 516 }, { "epoch": 0.33636955107351985, "grad_norm": 0.21815945208072662, "learning_rate": 0.0001, "loss": 2.0015, "step": 517 }, { "epoch": 0.33702016916070265, "grad_norm": 0.2226189821958542, "learning_rate": 0.0001, "loss": 2.2989, "step": 518 }, { "epoch": 0.3376707872478855, "grad_norm": 0.22195078432559967, "learning_rate": 0.0001, "loss": 2.2237, "step": 519 }, { "epoch": 0.3383214053350683, "grad_norm": 0.1946515589952469, "learning_rate": 0.0001, "loss": 1.9459, "step": 520 }, { "epoch": 0.33897202342225113, "grad_norm": 0.21510568261146545, "learning_rate": 0.0001, "loss": 2.1305, "step": 521 }, { "epoch": 0.33962264150943394, "grad_norm": 0.23448903858661652, "learning_rate": 0.0001, "loss": 2.1838, "step": 522 }, { "epoch": 0.3402732595966168, "grad_norm": 0.19046911597251892, "learning_rate": 0.0001, "loss": 1.9739, "step": 523 }, { "epoch": 0.3409238776837996, "grad_norm": 0.2314033806324005, "learning_rate": 0.0001, "loss": 2.2053, "step": 524 }, { "epoch": 0.3415744957709824, "grad_norm": 0.2206612378358841, "learning_rate": 0.0001, "loss": 2.2566, "step": 525 }, { "epoch": 0.34222511385816523, "grad_norm": 0.19578076899051666, "learning_rate": 0.0001, "loss": 2.045, "step": 526 }, { "epoch": 0.3428757319453481, "grad_norm": 0.1787755936384201, "learning_rate": 0.0001, "loss": 1.8942, "step": 527 }, { "epoch": 0.3435263500325309, "grad_norm": 0.20091751217842102, "learning_rate": 0.0001, "loss": 2.1576, "step": 528 }, { "epoch": 0.3441769681197137, "grad_norm": 0.21869762241840363, "learning_rate": 0.0001, "loss": 2.1938, "step": 529 }, { "epoch": 0.3448275862068966, "grad_norm": 0.26101449131965637, "learning_rate": 0.0001, "loss": 2.3642, "step": 530 }, { "epoch": 0.3454782042940794, "grad_norm": 0.21874766051769257, "learning_rate": 0.0001, "loss": 2.4553, "step": 531 }, { "epoch": 0.3461288223812622, "grad_norm": 0.224325492978096, "learning_rate": 0.0001, "loss": 2.2959, "step": 532 }, { "epoch": 0.346779440468445, "grad_norm": 0.21268363296985626, "learning_rate": 0.0001, "loss": 2.1021, "step": 533 }, { "epoch": 0.34743005855562786, "grad_norm": 0.20979231595993042, "learning_rate": 0.0001, "loss": 2.0304, "step": 534 }, { "epoch": 0.34808067664281067, "grad_norm": 0.19552691280841827, "learning_rate": 0.0001, "loss": 1.9747, "step": 535 }, { "epoch": 0.3487312947299935, "grad_norm": 0.27929842472076416, "learning_rate": 0.0001, "loss": 2.445, "step": 536 }, { "epoch": 0.34938191281717634, "grad_norm": 0.19953188300132751, "learning_rate": 0.0001, "loss": 1.9766, "step": 537 }, { "epoch": 0.35003253090435915, "grad_norm": 0.29898926615715027, "learning_rate": 0.0001, "loss": 2.4818, "step": 538 }, { "epoch": 0.35068314899154196, "grad_norm": 0.18719644844532013, "learning_rate": 0.0001, "loss": 1.9046, "step": 539 }, { "epoch": 0.35133376707872477, "grad_norm": 0.2602563798427582, "learning_rate": 0.0001, "loss": 2.1539, "step": 540 }, { "epoch": 0.35198438516590763, "grad_norm": 0.23460406064987183, "learning_rate": 0.0001, "loss": 2.3826, "step": 541 }, { "epoch": 0.35263500325309044, "grad_norm": 0.2821134328842163, "learning_rate": 0.0001, "loss": 2.223, "step": 542 }, { "epoch": 0.35328562134027325, "grad_norm": 0.2641044557094574, "learning_rate": 0.0001, "loss": 2.2402, "step": 543 }, { "epoch": 0.35393623942745606, "grad_norm": 0.21963565051555634, "learning_rate": 0.0001, "loss": 2.3988, "step": 544 }, { "epoch": 0.3545868575146389, "grad_norm": 0.26475685834884644, "learning_rate": 0.0001, "loss": 2.3046, "step": 545 }, { "epoch": 0.35523747560182173, "grad_norm": 0.27148157358169556, "learning_rate": 0.0001, "loss": 2.5076, "step": 546 }, { "epoch": 0.35588809368900454, "grad_norm": 0.28925588726997375, "learning_rate": 0.0001, "loss": 2.8395, "step": 547 }, { "epoch": 0.3565387117761874, "grad_norm": 0.22953632473945618, "learning_rate": 0.0001, "loss": 2.1198, "step": 548 }, { "epoch": 0.3571893298633702, "grad_norm": 0.23960557579994202, "learning_rate": 0.0001, "loss": 2.3064, "step": 549 }, { "epoch": 0.357839947950553, "grad_norm": 0.3133333921432495, "learning_rate": 0.0001, "loss": 2.6034, "step": 550 }, { "epoch": 0.3584905660377358, "grad_norm": 0.21745215356349945, "learning_rate": 0.0001, "loss": 2.4553, "step": 551 }, { "epoch": 0.3591411841249187, "grad_norm": 0.23547130823135376, "learning_rate": 0.0001, "loss": 2.0469, "step": 552 }, { "epoch": 0.3597918022121015, "grad_norm": 0.2646094262599945, "learning_rate": 0.0001, "loss": 1.9016, "step": 553 }, { "epoch": 0.3604424202992843, "grad_norm": 0.3079530596733093, "learning_rate": 0.0001, "loss": 2.8979, "step": 554 }, { "epoch": 0.36109303838646717, "grad_norm": 0.38223740458488464, "learning_rate": 0.0001, "loss": 3.066, "step": 555 }, { "epoch": 0.36174365647365, "grad_norm": 0.2535337209701538, "learning_rate": 0.0001, "loss": 2.1327, "step": 556 }, { "epoch": 0.3623942745608328, "grad_norm": 0.2373637855052948, "learning_rate": 0.0001, "loss": 2.1141, "step": 557 }, { "epoch": 0.3630448926480156, "grad_norm": 0.19437271356582642, "learning_rate": 0.0001, "loss": 1.9753, "step": 558 }, { "epoch": 0.36369551073519846, "grad_norm": 0.20236878097057343, "learning_rate": 0.0001, "loss": 2.2516, "step": 559 }, { "epoch": 0.36434612882238127, "grad_norm": 0.21252363920211792, "learning_rate": 0.0001, "loss": 2.3645, "step": 560 }, { "epoch": 0.3649967469095641, "grad_norm": 0.21689258515834808, "learning_rate": 0.0001, "loss": 2.1145, "step": 561 }, { "epoch": 0.3656473649967469, "grad_norm": 0.22365228831768036, "learning_rate": 0.0001, "loss": 2.3083, "step": 562 }, { "epoch": 0.36629798308392975, "grad_norm": 0.21607807278633118, "learning_rate": 0.0001, "loss": 2.3199, "step": 563 }, { "epoch": 0.36694860117111255, "grad_norm": 0.1885683536529541, "learning_rate": 0.0001, "loss": 1.9303, "step": 564 }, { "epoch": 0.36759921925829536, "grad_norm": 0.20064905285835266, "learning_rate": 0.0001, "loss": 2.0661, "step": 565 }, { "epoch": 0.3682498373454782, "grad_norm": 0.23532240092754364, "learning_rate": 0.0001, "loss": 2.6942, "step": 566 }, { "epoch": 0.36890045543266103, "grad_norm": 0.22937807440757751, "learning_rate": 0.0001, "loss": 2.1962, "step": 567 }, { "epoch": 0.36955107351984384, "grad_norm": 0.2540866732597351, "learning_rate": 0.0001, "loss": 2.5012, "step": 568 }, { "epoch": 0.37020169160702665, "grad_norm": 0.23405294120311737, "learning_rate": 0.0001, "loss": 2.2439, "step": 569 }, { "epoch": 0.3708523096942095, "grad_norm": 0.24394820630550385, "learning_rate": 0.0001, "loss": 2.0741, "step": 570 }, { "epoch": 0.3715029277813923, "grad_norm": 0.2063736468553543, "learning_rate": 0.0001, "loss": 2.0864, "step": 571 }, { "epoch": 0.37215354586857513, "grad_norm": 0.3300686180591583, "learning_rate": 0.0001, "loss": 2.4983, "step": 572 }, { "epoch": 0.372804163955758, "grad_norm": 0.21294772624969482, "learning_rate": 0.0001, "loss": 2.2273, "step": 573 }, { "epoch": 0.3734547820429408, "grad_norm": 0.2629190981388092, "learning_rate": 0.0001, "loss": 2.1732, "step": 574 }, { "epoch": 0.3741054001301236, "grad_norm": 0.2141999751329422, "learning_rate": 0.0001, "loss": 2.3038, "step": 575 }, { "epoch": 0.3747560182173064, "grad_norm": 0.3467566668987274, "learning_rate": 0.0001, "loss": 2.7748, "step": 576 }, { "epoch": 0.3754066363044893, "grad_norm": 0.3112248182296753, "learning_rate": 0.0001, "loss": 2.2376, "step": 577 }, { "epoch": 0.3760572543916721, "grad_norm": 0.21217738091945648, "learning_rate": 0.0001, "loss": 1.9146, "step": 578 }, { "epoch": 0.3767078724788549, "grad_norm": 0.19359458982944489, "learning_rate": 0.0001, "loss": 2.0913, "step": 579 }, { "epoch": 0.37735849056603776, "grad_norm": 0.27635738253593445, "learning_rate": 0.0001, "loss": 2.2855, "step": 580 }, { "epoch": 0.37800910865322057, "grad_norm": 0.19366882741451263, "learning_rate": 0.0001, "loss": 2.0194, "step": 581 }, { "epoch": 0.3786597267404034, "grad_norm": 0.2016839236021042, "learning_rate": 0.0001, "loss": 2.1519, "step": 582 }, { "epoch": 0.3793103448275862, "grad_norm": 0.22154097259044647, "learning_rate": 0.0001, "loss": 1.9849, "step": 583 }, { "epoch": 0.37996096291476905, "grad_norm": 0.2089187502861023, "learning_rate": 0.0001, "loss": 2.3624, "step": 584 }, { "epoch": 0.38061158100195186, "grad_norm": 0.25050756335258484, "learning_rate": 0.0001, "loss": 2.1773, "step": 585 }, { "epoch": 0.38126219908913467, "grad_norm": 0.23007918894290924, "learning_rate": 0.0001, "loss": 2.2054, "step": 586 }, { "epoch": 0.3819128171763175, "grad_norm": 0.25022968649864197, "learning_rate": 0.0001, "loss": 2.219, "step": 587 }, { "epoch": 0.38256343526350034, "grad_norm": 0.2205193042755127, "learning_rate": 0.0001, "loss": 2.2049, "step": 588 }, { "epoch": 0.38321405335068315, "grad_norm": 0.21454961597919464, "learning_rate": 0.0001, "loss": 2.0683, "step": 589 }, { "epoch": 0.38386467143786596, "grad_norm": 0.2088347226381302, "learning_rate": 0.0001, "loss": 2.1301, "step": 590 }, { "epoch": 0.3845152895250488, "grad_norm": 0.20322394371032715, "learning_rate": 0.0001, "loss": 2.2098, "step": 591 }, { "epoch": 0.38516590761223163, "grad_norm": 0.231514111161232, "learning_rate": 0.0001, "loss": 2.5523, "step": 592 }, { "epoch": 0.38581652569941444, "grad_norm": 0.24791982769966125, "learning_rate": 0.0001, "loss": 2.2259, "step": 593 }, { "epoch": 0.38646714378659724, "grad_norm": 0.21148578822612762, "learning_rate": 0.0001, "loss": 2.0834, "step": 594 }, { "epoch": 0.3871177618737801, "grad_norm": 0.263713538646698, "learning_rate": 0.0001, "loss": 2.3101, "step": 595 }, { "epoch": 0.3877683799609629, "grad_norm": 0.22197774052619934, "learning_rate": 0.0001, "loss": 2.1173, "step": 596 }, { "epoch": 0.3884189980481457, "grad_norm": 0.2237439900636673, "learning_rate": 0.0001, "loss": 2.1109, "step": 597 }, { "epoch": 0.3890696161353286, "grad_norm": 0.27451419830322266, "learning_rate": 0.0001, "loss": 2.5311, "step": 598 }, { "epoch": 0.3897202342225114, "grad_norm": 0.18475750088691711, "learning_rate": 0.0001, "loss": 1.9241, "step": 599 }, { "epoch": 0.3903708523096942, "grad_norm": 0.20120149850845337, "learning_rate": 0.0001, "loss": 2.1033, "step": 600 }, { "epoch": 0.391021470396877, "grad_norm": 0.19626259803771973, "learning_rate": 0.0001, "loss": 2.1223, "step": 601 }, { "epoch": 0.3916720884840599, "grad_norm": 0.22795897722244263, "learning_rate": 0.0001, "loss": 2.2021, "step": 602 }, { "epoch": 0.3923227065712427, "grad_norm": 0.5195867419242859, "learning_rate": 0.0001, "loss": 3.1849, "step": 603 }, { "epoch": 0.3929733246584255, "grad_norm": 0.2636241614818573, "learning_rate": 0.0001, "loss": 2.0739, "step": 604 }, { "epoch": 0.3936239427456083, "grad_norm": 0.33922895789146423, "learning_rate": 0.0001, "loss": 2.31, "step": 605 }, { "epoch": 0.39427456083279117, "grad_norm": 0.17467042803764343, "learning_rate": 0.0001, "loss": 1.9201, "step": 606 }, { "epoch": 0.394925178919974, "grad_norm": 0.22457371652126312, "learning_rate": 0.0001, "loss": 1.9783, "step": 607 }, { "epoch": 0.3955757970071568, "grad_norm": 0.5104444026947021, "learning_rate": 0.0001, "loss": 2.3777, "step": 608 }, { "epoch": 0.39622641509433965, "grad_norm": 0.4531616270542145, "learning_rate": 0.0001, "loss": 2.8208, "step": 609 }, { "epoch": 0.39687703318152245, "grad_norm": 0.20649151504039764, "learning_rate": 0.0001, "loss": 2.1377, "step": 610 }, { "epoch": 0.39752765126870526, "grad_norm": 0.39769667387008667, "learning_rate": 0.0001, "loss": 2.2228, "step": 611 }, { "epoch": 0.39817826935588807, "grad_norm": 0.2832731008529663, "learning_rate": 0.0001, "loss": 1.9664, "step": 612 }, { "epoch": 0.39882888744307093, "grad_norm": 0.2754386067390442, "learning_rate": 0.0001, "loss": 2.5595, "step": 613 }, { "epoch": 0.39947950553025374, "grad_norm": 0.404364675283432, "learning_rate": 0.0001, "loss": 2.8133, "step": 614 }, { "epoch": 0.40013012361743655, "grad_norm": 0.30304789543151855, "learning_rate": 0.0001, "loss": 2.2729, "step": 615 }, { "epoch": 0.4007807417046194, "grad_norm": 0.2519910931587219, "learning_rate": 0.0001, "loss": 2.3655, "step": 616 }, { "epoch": 0.4014313597918022, "grad_norm": 0.2863995134830475, "learning_rate": 0.0001, "loss": 2.0774, "step": 617 }, { "epoch": 0.40208197787898503, "grad_norm": 0.393622487783432, "learning_rate": 0.0001, "loss": 2.5082, "step": 618 }, { "epoch": 0.40273259596616784, "grad_norm": 0.21836060285568237, "learning_rate": 0.0001, "loss": 1.9548, "step": 619 }, { "epoch": 0.4033832140533507, "grad_norm": 0.358052521944046, "learning_rate": 0.0001, "loss": 2.5158, "step": 620 }, { "epoch": 0.4040338321405335, "grad_norm": 0.237140953540802, "learning_rate": 0.0001, "loss": 2.2111, "step": 621 }, { "epoch": 0.4046844502277163, "grad_norm": 0.20998883247375488, "learning_rate": 0.0001, "loss": 2.1351, "step": 622 }, { "epoch": 0.4053350683148991, "grad_norm": 0.18059247732162476, "learning_rate": 0.0001, "loss": 1.9451, "step": 623 }, { "epoch": 0.405985686402082, "grad_norm": 0.17532669007778168, "learning_rate": 0.0001, "loss": 1.8591, "step": 624 }, { "epoch": 0.4066363044892648, "grad_norm": 0.24097976088523865, "learning_rate": 0.0001, "loss": 2.6534, "step": 625 }, { "epoch": 0.4072869225764476, "grad_norm": 0.19505445659160614, "learning_rate": 0.0001, "loss": 1.8952, "step": 626 }, { "epoch": 0.40793754066363047, "grad_norm": 0.232722207903862, "learning_rate": 0.0001, "loss": 2.2055, "step": 627 }, { "epoch": 0.4085881587508133, "grad_norm": 0.23899732530117035, "learning_rate": 0.0001, "loss": 2.5848, "step": 628 }, { "epoch": 0.4092387768379961, "grad_norm": 0.2411729097366333, "learning_rate": 0.0001, "loss": 2.5315, "step": 629 }, { "epoch": 0.4098893949251789, "grad_norm": 0.25042012333869934, "learning_rate": 0.0001, "loss": 2.4154, "step": 630 }, { "epoch": 0.41054001301236176, "grad_norm": 0.2764488160610199, "learning_rate": 0.0001, "loss": 2.0564, "step": 631 }, { "epoch": 0.41119063109954457, "grad_norm": 0.24761155247688293, "learning_rate": 0.0001, "loss": 2.3245, "step": 632 }, { "epoch": 0.4118412491867274, "grad_norm": 0.22376200556755066, "learning_rate": 0.0001, "loss": 2.1881, "step": 633 }, { "epoch": 0.41249186727391024, "grad_norm": 0.19060148298740387, "learning_rate": 0.0001, "loss": 1.9588, "step": 634 }, { "epoch": 0.41314248536109305, "grad_norm": 0.4157400131225586, "learning_rate": 0.0001, "loss": 2.9024, "step": 635 }, { "epoch": 0.41379310344827586, "grad_norm": 0.2557002007961273, "learning_rate": 0.0001, "loss": 1.9819, "step": 636 }, { "epoch": 0.41444372153545866, "grad_norm": 0.2908417880535126, "learning_rate": 0.0001, "loss": 2.112, "step": 637 }, { "epoch": 0.41509433962264153, "grad_norm": 0.32937270402908325, "learning_rate": 0.0001, "loss": 2.4976, "step": 638 }, { "epoch": 0.41574495770982434, "grad_norm": 0.20382268726825714, "learning_rate": 0.0001, "loss": 2.0448, "step": 639 }, { "epoch": 0.41639557579700714, "grad_norm": 0.23484939336776733, "learning_rate": 0.0001, "loss": 1.9514, "step": 640 }, { "epoch": 0.41704619388418995, "grad_norm": 0.23023058474063873, "learning_rate": 0.0001, "loss": 2.0768, "step": 641 }, { "epoch": 0.4176968119713728, "grad_norm": 0.22951190173625946, "learning_rate": 0.0001, "loss": 2.0764, "step": 642 }, { "epoch": 0.4183474300585556, "grad_norm": 0.18971513211727142, "learning_rate": 0.0001, "loss": 1.9693, "step": 643 }, { "epoch": 0.41899804814573843, "grad_norm": 0.24955709278583527, "learning_rate": 0.0001, "loss": 2.4898, "step": 644 }, { "epoch": 0.4196486662329213, "grad_norm": 0.3344306945800781, "learning_rate": 0.0001, "loss": 2.4779, "step": 645 }, { "epoch": 0.4202992843201041, "grad_norm": 0.21661825478076935, "learning_rate": 0.0001, "loss": 2.0472, "step": 646 }, { "epoch": 0.4209499024072869, "grad_norm": 0.1972419023513794, "learning_rate": 0.0001, "loss": 2.1712, "step": 647 }, { "epoch": 0.4216005204944697, "grad_norm": 0.21619470417499542, "learning_rate": 0.0001, "loss": 2.0739, "step": 648 }, { "epoch": 0.4222511385816526, "grad_norm": 0.2329091727733612, "learning_rate": 0.0001, "loss": 2.1362, "step": 649 }, { "epoch": 0.4229017566688354, "grad_norm": 0.22971969842910767, "learning_rate": 0.0001, "loss": 1.9898, "step": 650 }, { "epoch": 0.4235523747560182, "grad_norm": 0.20185063779354095, "learning_rate": 0.0001, "loss": 2.1008, "step": 651 }, { "epoch": 0.42420299284320107, "grad_norm": 0.2658546566963196, "learning_rate": 0.0001, "loss": 2.5734, "step": 652 }, { "epoch": 0.4248536109303839, "grad_norm": 0.23109374940395355, "learning_rate": 0.0001, "loss": 2.2569, "step": 653 }, { "epoch": 0.4255042290175667, "grad_norm": 0.25115352869033813, "learning_rate": 0.0001, "loss": 2.5967, "step": 654 }, { "epoch": 0.4261548471047495, "grad_norm": 0.20470669865608215, "learning_rate": 0.0001, "loss": 2.0302, "step": 655 }, { "epoch": 0.42680546519193235, "grad_norm": 0.2151513546705246, "learning_rate": 0.0001, "loss": 2.5183, "step": 656 }, { "epoch": 0.42745608327911516, "grad_norm": 0.2571411728858948, "learning_rate": 0.0001, "loss": 2.255, "step": 657 }, { "epoch": 0.42810670136629797, "grad_norm": 0.2414022833108902, "learning_rate": 0.0001, "loss": 2.4076, "step": 658 }, { "epoch": 0.42875731945348083, "grad_norm": 0.21041014790534973, "learning_rate": 0.0001, "loss": 2.0091, "step": 659 }, { "epoch": 0.42940793754066364, "grad_norm": 0.21241822838783264, "learning_rate": 0.0001, "loss": 2.355, "step": 660 }, { "epoch": 0.43005855562784645, "grad_norm": 0.21031403541564941, "learning_rate": 0.0001, "loss": 1.9887, "step": 661 }, { "epoch": 0.43070917371502926, "grad_norm": 0.19765952229499817, "learning_rate": 0.0001, "loss": 2.1555, "step": 662 }, { "epoch": 0.4313597918022121, "grad_norm": 0.24740834534168243, "learning_rate": 0.0001, "loss": 2.2349, "step": 663 }, { "epoch": 0.43201040988939493, "grad_norm": 0.22086234390735626, "learning_rate": 0.0001, "loss": 2.0948, "step": 664 }, { "epoch": 0.43266102797657774, "grad_norm": 0.21949239075183868, "learning_rate": 0.0001, "loss": 2.3905, "step": 665 }, { "epoch": 0.43331164606376055, "grad_norm": 0.20536834001541138, "learning_rate": 0.0001, "loss": 2.0547, "step": 666 }, { "epoch": 0.4339622641509434, "grad_norm": 0.2570655941963196, "learning_rate": 0.0001, "loss": 2.0261, "step": 667 }, { "epoch": 0.4346128822381262, "grad_norm": 0.3293687701225281, "learning_rate": 0.0001, "loss": 2.344, "step": 668 }, { "epoch": 0.435263500325309, "grad_norm": 0.22947120666503906, "learning_rate": 0.0001, "loss": 2.232, "step": 669 }, { "epoch": 0.4359141184124919, "grad_norm": 0.2425599992275238, "learning_rate": 0.0001, "loss": 2.309, "step": 670 }, { "epoch": 0.4365647364996747, "grad_norm": 0.2506352663040161, "learning_rate": 0.0001, "loss": 2.1249, "step": 671 }, { "epoch": 0.4372153545868575, "grad_norm": 0.19457192718982697, "learning_rate": 0.0001, "loss": 1.9461, "step": 672 }, { "epoch": 0.4378659726740403, "grad_norm": 0.3749271035194397, "learning_rate": 0.0001, "loss": 2.8532, "step": 673 }, { "epoch": 0.4385165907612232, "grad_norm": 0.25384366512298584, "learning_rate": 0.0001, "loss": 2.6495, "step": 674 }, { "epoch": 0.439167208848406, "grad_norm": 0.21413469314575195, "learning_rate": 0.0001, "loss": 2.084, "step": 675 }, { "epoch": 0.4398178269355888, "grad_norm": 0.228125661611557, "learning_rate": 0.0001, "loss": 2.2175, "step": 676 }, { "epoch": 0.44046844502277166, "grad_norm": 0.1948491632938385, "learning_rate": 0.0001, "loss": 1.9702, "step": 677 }, { "epoch": 0.44111906310995447, "grad_norm": 0.307992547750473, "learning_rate": 0.0001, "loss": 2.5884, "step": 678 }, { "epoch": 0.4417696811971373, "grad_norm": 0.23681728541851044, "learning_rate": 0.0001, "loss": 2.2104, "step": 679 }, { "epoch": 0.4424202992843201, "grad_norm": 0.23185166716575623, "learning_rate": 0.0001, "loss": 2.0823, "step": 680 }, { "epoch": 0.44307091737150295, "grad_norm": 0.2772667109966278, "learning_rate": 0.0001, "loss": 2.3729, "step": 681 }, { "epoch": 0.44372153545868576, "grad_norm": 0.18908965587615967, "learning_rate": 0.0001, "loss": 2.0585, "step": 682 }, { "epoch": 0.44437215354586856, "grad_norm": 0.2063988745212555, "learning_rate": 0.0001, "loss": 1.9474, "step": 683 }, { "epoch": 0.4450227716330514, "grad_norm": 0.19444917142391205, "learning_rate": 0.0001, "loss": 1.9269, "step": 684 }, { "epoch": 0.44567338972023424, "grad_norm": 0.2866727113723755, "learning_rate": 0.0001, "loss": 2.5145, "step": 685 }, { "epoch": 0.44632400780741704, "grad_norm": 0.24801641702651978, "learning_rate": 0.0001, "loss": 2.2954, "step": 686 }, { "epoch": 0.44697462589459985, "grad_norm": 0.2115658074617386, "learning_rate": 0.0001, "loss": 2.1956, "step": 687 }, { "epoch": 0.4476252439817827, "grad_norm": 0.3155558109283447, "learning_rate": 0.0001, "loss": 2.7396, "step": 688 }, { "epoch": 0.4482758620689655, "grad_norm": 0.22418133914470673, "learning_rate": 0.0001, "loss": 2.1066, "step": 689 }, { "epoch": 0.44892648015614833, "grad_norm": 0.2707614600658417, "learning_rate": 0.0001, "loss": 2.3353, "step": 690 }, { "epoch": 0.44957709824333114, "grad_norm": 0.22262880206108093, "learning_rate": 0.0001, "loss": 2.2143, "step": 691 }, { "epoch": 0.450227716330514, "grad_norm": 0.25256767868995667, "learning_rate": 0.0001, "loss": 2.2786, "step": 692 }, { "epoch": 0.4508783344176968, "grad_norm": 0.20360921323299408, "learning_rate": 0.0001, "loss": 2.0059, "step": 693 }, { "epoch": 0.4515289525048796, "grad_norm": 0.20573420822620392, "learning_rate": 0.0001, "loss": 2.0884, "step": 694 }, { "epoch": 0.4521795705920625, "grad_norm": 0.31812623143196106, "learning_rate": 0.0001, "loss": 2.5905, "step": 695 }, { "epoch": 0.4528301886792453, "grad_norm": 0.24690969288349152, "learning_rate": 0.0001, "loss": 2.5157, "step": 696 }, { "epoch": 0.4534808067664281, "grad_norm": 0.256793737411499, "learning_rate": 0.0001, "loss": 2.1548, "step": 697 }, { "epoch": 0.4541314248536109, "grad_norm": 0.2659960985183716, "learning_rate": 0.0001, "loss": 2.2977, "step": 698 }, { "epoch": 0.4547820429407938, "grad_norm": 0.23824195563793182, "learning_rate": 0.0001, "loss": 2.5946, "step": 699 }, { "epoch": 0.4554326610279766, "grad_norm": 0.2580608129501343, "learning_rate": 0.0001, "loss": 2.2608, "step": 700 }, { "epoch": 0.4560832791151594, "grad_norm": 0.270622193813324, "learning_rate": 0.0001, "loss": 2.5848, "step": 701 }, { "epoch": 0.4567338972023422, "grad_norm": 0.2170489877462387, "learning_rate": 0.0001, "loss": 2.4315, "step": 702 }, { "epoch": 0.45738451528952506, "grad_norm": 0.20716050267219543, "learning_rate": 0.0001, "loss": 2.1592, "step": 703 }, { "epoch": 0.45803513337670787, "grad_norm": 0.24847671389579773, "learning_rate": 0.0001, "loss": 2.3202, "step": 704 }, { "epoch": 0.4586857514638907, "grad_norm": 0.24049146473407745, "learning_rate": 0.0001, "loss": 2.1968, "step": 705 }, { "epoch": 0.45933636955107354, "grad_norm": 0.2079533487558365, "learning_rate": 0.0001, "loss": 2.2966, "step": 706 }, { "epoch": 0.45998698763825635, "grad_norm": 0.18255428969860077, "learning_rate": 0.0001, "loss": 1.9931, "step": 707 }, { "epoch": 0.46063760572543916, "grad_norm": 0.28015655279159546, "learning_rate": 0.0001, "loss": 2.2605, "step": 708 }, { "epoch": 0.46128822381262197, "grad_norm": 0.27453094720840454, "learning_rate": 0.0001, "loss": 2.2835, "step": 709 }, { "epoch": 0.46193884189980483, "grad_norm": 0.2751506268978119, "learning_rate": 0.0001, "loss": 2.665, "step": 710 }, { "epoch": 0.46258945998698764, "grad_norm": 0.2759210169315338, "learning_rate": 0.0001, "loss": 2.3593, "step": 711 }, { "epoch": 0.46324007807417045, "grad_norm": 0.2902829051017761, "learning_rate": 0.0001, "loss": 2.7421, "step": 712 }, { "epoch": 0.4638906961613533, "grad_norm": 0.24083854258060455, "learning_rate": 0.0001, "loss": 2.4644, "step": 713 }, { "epoch": 0.4645413142485361, "grad_norm": 0.23614934086799622, "learning_rate": 0.0001, "loss": 2.2939, "step": 714 }, { "epoch": 0.4651919323357189, "grad_norm": 0.1972537487745285, "learning_rate": 0.0001, "loss": 1.9391, "step": 715 }, { "epoch": 0.46584255042290174, "grad_norm": 0.2227838933467865, "learning_rate": 0.0001, "loss": 1.9396, "step": 716 }, { "epoch": 0.4664931685100846, "grad_norm": 0.3672918379306793, "learning_rate": 0.0001, "loss": 2.7508, "step": 717 }, { "epoch": 0.4671437865972674, "grad_norm": 0.2712246775627136, "learning_rate": 0.0001, "loss": 2.2838, "step": 718 }, { "epoch": 0.4677944046844502, "grad_norm": 0.2337927669286728, "learning_rate": 0.0001, "loss": 1.9807, "step": 719 }, { "epoch": 0.468445022771633, "grad_norm": 0.2051180601119995, "learning_rate": 0.0001, "loss": 2.0311, "step": 720 }, { "epoch": 0.4690956408588159, "grad_norm": 0.1965889185667038, "learning_rate": 0.0001, "loss": 2.1114, "step": 721 }, { "epoch": 0.4697462589459987, "grad_norm": 0.2106337547302246, "learning_rate": 0.0001, "loss": 2.0792, "step": 722 }, { "epoch": 0.4703968770331815, "grad_norm": 0.19918356835842133, "learning_rate": 0.0001, "loss": 2.1323, "step": 723 }, { "epoch": 0.47104749512036437, "grad_norm": 0.20124401152133942, "learning_rate": 0.0001, "loss": 2.0008, "step": 724 }, { "epoch": 0.4716981132075472, "grad_norm": 0.2172473967075348, "learning_rate": 0.0001, "loss": 2.3891, "step": 725 }, { "epoch": 0.47234873129473, "grad_norm": 0.2524811029434204, "learning_rate": 0.0001, "loss": 2.3343, "step": 726 }, { "epoch": 0.4729993493819128, "grad_norm": 0.22882957756519318, "learning_rate": 0.0001, "loss": 2.6723, "step": 727 }, { "epoch": 0.47364996746909566, "grad_norm": 0.2434161901473999, "learning_rate": 0.0001, "loss": 1.9549, "step": 728 }, { "epoch": 0.47430058555627846, "grad_norm": 0.19140364229679108, "learning_rate": 0.0001, "loss": 2.0468, "step": 729 }, { "epoch": 0.4749512036434613, "grad_norm": 0.22166937589645386, "learning_rate": 0.0001, "loss": 2.3432, "step": 730 }, { "epoch": 0.47560182173064414, "grad_norm": 0.2005748748779297, "learning_rate": 0.0001, "loss": 2.0616, "step": 731 }, { "epoch": 0.47625243981782694, "grad_norm": 0.3115980923175812, "learning_rate": 0.0001, "loss": 2.6153, "step": 732 }, { "epoch": 0.47690305790500975, "grad_norm": 0.27135169506073, "learning_rate": 0.0001, "loss": 2.3225, "step": 733 }, { "epoch": 0.47755367599219256, "grad_norm": 0.20748727023601532, "learning_rate": 0.0001, "loss": 1.834, "step": 734 }, { "epoch": 0.4782042940793754, "grad_norm": 0.4031495153903961, "learning_rate": 0.0001, "loss": 2.8177, "step": 735 }, { "epoch": 0.47885491216655823, "grad_norm": 0.2978368401527405, "learning_rate": 0.0001, "loss": 2.6178, "step": 736 }, { "epoch": 0.47950553025374104, "grad_norm": 0.3466270864009857, "learning_rate": 0.0001, "loss": 2.6031, "step": 737 }, { "epoch": 0.4801561483409239, "grad_norm": 0.20074127614498138, "learning_rate": 0.0001, "loss": 2.247, "step": 738 }, { "epoch": 0.4808067664281067, "grad_norm": 0.2393479198217392, "learning_rate": 0.0001, "loss": 2.1265, "step": 739 }, { "epoch": 0.4814573845152895, "grad_norm": 0.27758634090423584, "learning_rate": 0.0001, "loss": 2.5025, "step": 740 }, { "epoch": 0.48210800260247233, "grad_norm": 0.20123820006847382, "learning_rate": 0.0001, "loss": 2.0083, "step": 741 }, { "epoch": 0.4827586206896552, "grad_norm": 0.19012506306171417, "learning_rate": 0.0001, "loss": 2.0212, "step": 742 }, { "epoch": 0.483409238776838, "grad_norm": 0.19451047480106354, "learning_rate": 0.0001, "loss": 2.0295, "step": 743 }, { "epoch": 0.4840598568640208, "grad_norm": 0.3339052200317383, "learning_rate": 0.0001, "loss": 2.4813, "step": 744 }, { "epoch": 0.4847104749512036, "grad_norm": 0.2646152973175049, "learning_rate": 0.0001, "loss": 2.4302, "step": 745 }, { "epoch": 0.4853610930383865, "grad_norm": 0.23590324819087982, "learning_rate": 0.0001, "loss": 2.1723, "step": 746 }, { "epoch": 0.4860117111255693, "grad_norm": 0.28924039006233215, "learning_rate": 0.0001, "loss": 2.8005, "step": 747 }, { "epoch": 0.4866623292127521, "grad_norm": 0.21145464479923248, "learning_rate": 0.0001, "loss": 2.3501, "step": 748 }, { "epoch": 0.48731294729993496, "grad_norm": 0.22815656661987305, "learning_rate": 0.0001, "loss": 2.1997, "step": 749 }, { "epoch": 0.48796356538711777, "grad_norm": 0.24325215816497803, "learning_rate": 0.0001, "loss": 2.039, "step": 750 }, { "epoch": 0.4886141834743006, "grad_norm": 0.3235335052013397, "learning_rate": 0.0001, "loss": 2.4533, "step": 751 }, { "epoch": 0.4892648015614834, "grad_norm": 0.25513559579849243, "learning_rate": 0.0001, "loss": 2.3779, "step": 752 }, { "epoch": 0.48991541964866625, "grad_norm": 0.2905427813529968, "learning_rate": 0.0001, "loss": 1.9843, "step": 753 }, { "epoch": 0.49056603773584906, "grad_norm": 0.23760183155536652, "learning_rate": 0.0001, "loss": 2.1825, "step": 754 }, { "epoch": 0.49121665582303187, "grad_norm": 0.2170071303844452, "learning_rate": 0.0001, "loss": 1.9877, "step": 755 }, { "epoch": 0.49186727391021473, "grad_norm": 0.2555190920829773, "learning_rate": 0.0001, "loss": 2.457, "step": 756 }, { "epoch": 0.49251789199739754, "grad_norm": 0.2571033835411072, "learning_rate": 0.0001, "loss": 2.1152, "step": 757 }, { "epoch": 0.49316851008458035, "grad_norm": 0.23969238996505737, "learning_rate": 0.0001, "loss": 2.3439, "step": 758 }, { "epoch": 0.49381912817176316, "grad_norm": 0.1900262087583542, "learning_rate": 0.0001, "loss": 1.8999, "step": 759 }, { "epoch": 0.494469746258946, "grad_norm": 0.19621430337429047, "learning_rate": 0.0001, "loss": 2.0658, "step": 760 }, { "epoch": 0.4951203643461288, "grad_norm": 0.21956481039524078, "learning_rate": 0.0001, "loss": 2.5427, "step": 761 }, { "epoch": 0.49577098243331164, "grad_norm": 0.22567258775234222, "learning_rate": 0.0001, "loss": 2.2777, "step": 762 }, { "epoch": 0.49642160052049444, "grad_norm": 0.20233570039272308, "learning_rate": 0.0001, "loss": 2.0342, "step": 763 }, { "epoch": 0.4970722186076773, "grad_norm": 0.23662947118282318, "learning_rate": 0.0001, "loss": 2.3668, "step": 764 }, { "epoch": 0.4977228366948601, "grad_norm": 0.2625278830528259, "learning_rate": 0.0001, "loss": 2.6536, "step": 765 }, { "epoch": 0.4983734547820429, "grad_norm": 0.23235228657722473, "learning_rate": 0.0001, "loss": 2.1891, "step": 766 }, { "epoch": 0.4990240728692258, "grad_norm": 0.19439217448234558, "learning_rate": 0.0001, "loss": 1.9647, "step": 767 }, { "epoch": 0.4996746909564086, "grad_norm": 0.19810114800930023, "learning_rate": 0.0001, "loss": 1.9965, "step": 768 }, { "epoch": 0.5003253090435914, "grad_norm": 0.2525380253791809, "learning_rate": 0.0001, "loss": 2.2444, "step": 769 }, { "epoch": 0.5009759271307742, "grad_norm": 0.2409314513206482, "learning_rate": 0.0001, "loss": 2.1717, "step": 770 }, { "epoch": 0.501626545217957, "grad_norm": 0.25244686007499695, "learning_rate": 0.0001, "loss": 2.0126, "step": 771 }, { "epoch": 0.5022771633051398, "grad_norm": 0.19767141342163086, "learning_rate": 0.0001, "loss": 2.1384, "step": 772 }, { "epoch": 0.5029277813923227, "grad_norm": 0.39446812868118286, "learning_rate": 0.0001, "loss": 2.8039, "step": 773 }, { "epoch": 0.5035783994795056, "grad_norm": 0.2643390893936157, "learning_rate": 0.0001, "loss": 2.1524, "step": 774 }, { "epoch": 0.5042290175666884, "grad_norm": 0.27606508135795593, "learning_rate": 0.0001, "loss": 2.1802, "step": 775 }, { "epoch": 0.5048796356538712, "grad_norm": 0.364106148481369, "learning_rate": 0.0001, "loss": 2.9694, "step": 776 }, { "epoch": 0.505530253741054, "grad_norm": 0.23091645538806915, "learning_rate": 0.0001, "loss": 2.5471, "step": 777 }, { "epoch": 0.5061808718282368, "grad_norm": 0.19318193197250366, "learning_rate": 0.0001, "loss": 2.2082, "step": 778 }, { "epoch": 0.5068314899154196, "grad_norm": 0.28997862339019775, "learning_rate": 0.0001, "loss": 2.4399, "step": 779 }, { "epoch": 0.5074821080026025, "grad_norm": 0.22487197816371918, "learning_rate": 0.0001, "loss": 2.1946, "step": 780 }, { "epoch": 0.5081327260897853, "grad_norm": 0.24430596828460693, "learning_rate": 0.0001, "loss": 2.4456, "step": 781 }, { "epoch": 0.5087833441769681, "grad_norm": 0.21677151322364807, "learning_rate": 0.0001, "loss": 2.2082, "step": 782 }, { "epoch": 0.5094339622641509, "grad_norm": 0.47995632886886597, "learning_rate": 0.0001, "loss": 3.1358, "step": 783 }, { "epoch": 0.5100845803513337, "grad_norm": 0.19044414162635803, "learning_rate": 0.0001, "loss": 1.8924, "step": 784 }, { "epoch": 0.5107351984385166, "grad_norm": 0.19143608212471008, "learning_rate": 0.0001, "loss": 2.0459, "step": 785 }, { "epoch": 0.5113858165256994, "grad_norm": 0.22588413953781128, "learning_rate": 0.0001, "loss": 2.1369, "step": 786 }, { "epoch": 0.5120364346128823, "grad_norm": 0.2786167860031128, "learning_rate": 0.0001, "loss": 2.2029, "step": 787 }, { "epoch": 0.5126870527000651, "grad_norm": 0.24471627175807953, "learning_rate": 0.0001, "loss": 2.1248, "step": 788 }, { "epoch": 0.5133376707872479, "grad_norm": 0.17795225977897644, "learning_rate": 0.0001, "loss": 1.7926, "step": 789 }, { "epoch": 0.5139882888744307, "grad_norm": 0.2173709124326706, "learning_rate": 0.0001, "loss": 2.0538, "step": 790 }, { "epoch": 0.5146389069616135, "grad_norm": 0.2027692049741745, "learning_rate": 0.0001, "loss": 1.8568, "step": 791 }, { "epoch": 0.5152895250487963, "grad_norm": 0.2013595849275589, "learning_rate": 0.0001, "loss": 2.0501, "step": 792 }, { "epoch": 0.5159401431359791, "grad_norm": 0.21996662020683289, "learning_rate": 0.0001, "loss": 2.0374, "step": 793 }, { "epoch": 0.516590761223162, "grad_norm": 0.21435722708702087, "learning_rate": 0.0001, "loss": 2.1907, "step": 794 }, { "epoch": 0.5172413793103449, "grad_norm": 0.21512284874916077, "learning_rate": 0.0001, "loss": 2.315, "step": 795 }, { "epoch": 0.5178919973975277, "grad_norm": 0.19432400166988373, "learning_rate": 0.0001, "loss": 2.103, "step": 796 }, { "epoch": 0.5185426154847105, "grad_norm": 0.23112992942333221, "learning_rate": 0.0001, "loss": 2.328, "step": 797 }, { "epoch": 0.5191932335718933, "grad_norm": 0.19719737768173218, "learning_rate": 0.0001, "loss": 1.9569, "step": 798 }, { "epoch": 0.5198438516590761, "grad_norm": 0.2115892618894577, "learning_rate": 0.0001, "loss": 2.2533, "step": 799 }, { "epoch": 0.5204944697462589, "grad_norm": 0.24321842193603516, "learning_rate": 0.0001, "loss": 2.6597, "step": 800 }, { "epoch": 0.5211450878334418, "grad_norm": 0.18219350278377533, "learning_rate": 0.0001, "loss": 1.8709, "step": 801 }, { "epoch": 0.5217957059206246, "grad_norm": 0.18715021014213562, "learning_rate": 0.0001, "loss": 2.0021, "step": 802 }, { "epoch": 0.5224463240078074, "grad_norm": 0.25940024852752686, "learning_rate": 0.0001, "loss": 2.3742, "step": 803 }, { "epoch": 0.5230969420949902, "grad_norm": 0.18714728951454163, "learning_rate": 0.0001, "loss": 2.211, "step": 804 }, { "epoch": 0.523747560182173, "grad_norm": 0.20145951211452484, "learning_rate": 0.0001, "loss": 2.0047, "step": 805 }, { "epoch": 0.5243981782693559, "grad_norm": 0.18992845714092255, "learning_rate": 0.0001, "loss": 1.8559, "step": 806 }, { "epoch": 0.5250487963565387, "grad_norm": 0.2682324945926666, "learning_rate": 0.0001, "loss": 2.4791, "step": 807 }, { "epoch": 0.5256994144437215, "grad_norm": 0.33034664392471313, "learning_rate": 0.0001, "loss": 2.3089, "step": 808 }, { "epoch": 0.5263500325309044, "grad_norm": 0.18838956952095032, "learning_rate": 0.0001, "loss": 1.9462, "step": 809 }, { "epoch": 0.5270006506180872, "grad_norm": 0.42872169613838196, "learning_rate": 0.0001, "loss": 2.6874, "step": 810 }, { "epoch": 0.52765126870527, "grad_norm": 0.2108643501996994, "learning_rate": 0.0001, "loss": 2.3627, "step": 811 }, { "epoch": 0.5283018867924528, "grad_norm": 0.21745599806308746, "learning_rate": 0.0001, "loss": 2.1204, "step": 812 }, { "epoch": 0.5289525048796356, "grad_norm": 0.2577585279941559, "learning_rate": 0.0001, "loss": 1.9746, "step": 813 }, { "epoch": 0.5296031229668184, "grad_norm": 0.372471421957016, "learning_rate": 0.0001, "loss": 2.688, "step": 814 }, { "epoch": 0.5302537410540012, "grad_norm": 0.2425181120634079, "learning_rate": 0.0001, "loss": 2.1377, "step": 815 }, { "epoch": 0.5309043591411842, "grad_norm": 0.2638307511806488, "learning_rate": 0.0001, "loss": 2.1088, "step": 816 }, { "epoch": 0.531554977228367, "grad_norm": 0.2356933355331421, "learning_rate": 0.0001, "loss": 2.2291, "step": 817 }, { "epoch": 0.5322055953155498, "grad_norm": 0.23714864253997803, "learning_rate": 0.0001, "loss": 2.0929, "step": 818 }, { "epoch": 0.5328562134027326, "grad_norm": 0.19541950523853302, "learning_rate": 0.0001, "loss": 2.0883, "step": 819 }, { "epoch": 0.5335068314899154, "grad_norm": 0.3091617822647095, "learning_rate": 0.0001, "loss": 3.0127, "step": 820 }, { "epoch": 0.5341574495770982, "grad_norm": 0.2592740058898926, "learning_rate": 0.0001, "loss": 1.8307, "step": 821 }, { "epoch": 0.534808067664281, "grad_norm": 0.22505807876586914, "learning_rate": 0.0001, "loss": 2.462, "step": 822 }, { "epoch": 0.5354586857514639, "grad_norm": 0.22032824158668518, "learning_rate": 0.0001, "loss": 2.2718, "step": 823 }, { "epoch": 0.5361093038386467, "grad_norm": 0.2457459270954132, "learning_rate": 0.0001, "loss": 2.4213, "step": 824 }, { "epoch": 0.5367599219258296, "grad_norm": 0.24181683361530304, "learning_rate": 0.0001, "loss": 1.9347, "step": 825 }, { "epoch": 0.5374105400130124, "grad_norm": 0.29988738894462585, "learning_rate": 0.0001, "loss": 2.7697, "step": 826 }, { "epoch": 0.5380611581001952, "grad_norm": 0.24946388602256775, "learning_rate": 0.0001, "loss": 2.2117, "step": 827 }, { "epoch": 0.538711776187378, "grad_norm": 0.20339331030845642, "learning_rate": 0.0001, "loss": 1.9936, "step": 828 }, { "epoch": 0.5393623942745608, "grad_norm": 0.22250457108020782, "learning_rate": 0.0001, "loss": 2.0785, "step": 829 }, { "epoch": 0.5400130123617437, "grad_norm": 0.1869298666715622, "learning_rate": 0.0001, "loss": 2.0406, "step": 830 }, { "epoch": 0.5406636304489265, "grad_norm": 0.1873755156993866, "learning_rate": 0.0001, "loss": 1.9126, "step": 831 }, { "epoch": 0.5413142485361093, "grad_norm": 0.3135535418987274, "learning_rate": 0.0001, "loss": 2.2881, "step": 832 }, { "epoch": 0.5419648666232921, "grad_norm": 0.20596185326576233, "learning_rate": 0.0001, "loss": 2.0682, "step": 833 }, { "epoch": 0.5426154847104749, "grad_norm": 0.25786712765693665, "learning_rate": 0.0001, "loss": 2.0591, "step": 834 }, { "epoch": 0.5432661027976577, "grad_norm": 0.2592066824436188, "learning_rate": 0.0001, "loss": 2.052, "step": 835 }, { "epoch": 0.5439167208848406, "grad_norm": 0.20738951861858368, "learning_rate": 0.0001, "loss": 1.9726, "step": 836 }, { "epoch": 0.5445673389720235, "grad_norm": 0.21384763717651367, "learning_rate": 0.0001, "loss": 2.1897, "step": 837 }, { "epoch": 0.5452179570592063, "grad_norm": 0.22050943970680237, "learning_rate": 0.0001, "loss": 2.3597, "step": 838 }, { "epoch": 0.5458685751463891, "grad_norm": 0.1996280699968338, "learning_rate": 0.0001, "loss": 2.0492, "step": 839 }, { "epoch": 0.5465191932335719, "grad_norm": 0.2430533468723297, "learning_rate": 0.0001, "loss": 2.2774, "step": 840 }, { "epoch": 0.5471698113207547, "grad_norm": 0.22777177393436432, "learning_rate": 0.0001, "loss": 2.0779, "step": 841 }, { "epoch": 0.5478204294079375, "grad_norm": 0.22464539110660553, "learning_rate": 0.0001, "loss": 2.3316, "step": 842 }, { "epoch": 0.5484710474951203, "grad_norm": 0.17759400606155396, "learning_rate": 0.0001, "loss": 1.8407, "step": 843 }, { "epoch": 0.5491216655823032, "grad_norm": 0.22264355421066284, "learning_rate": 0.0001, "loss": 2.2869, "step": 844 }, { "epoch": 0.549772283669486, "grad_norm": 0.20819737017154694, "learning_rate": 0.0001, "loss": 2.1209, "step": 845 }, { "epoch": 0.5504229017566689, "grad_norm": 0.2194463461637497, "learning_rate": 0.0001, "loss": 2.1457, "step": 846 }, { "epoch": 0.5510735198438517, "grad_norm": 0.19314661622047424, "learning_rate": 0.0001, "loss": 2.1063, "step": 847 }, { "epoch": 0.5517241379310345, "grad_norm": 0.186354860663414, "learning_rate": 0.0001, "loss": 2.0833, "step": 848 }, { "epoch": 0.5523747560182173, "grad_norm": 0.1862732619047165, "learning_rate": 0.0001, "loss": 1.9441, "step": 849 }, { "epoch": 0.5530253741054001, "grad_norm": 0.24664181470870972, "learning_rate": 0.0001, "loss": 2.3277, "step": 850 }, { "epoch": 0.5536759921925829, "grad_norm": 0.20182165503501892, "learning_rate": 0.0001, "loss": 2.1902, "step": 851 }, { "epoch": 0.5543266102797658, "grad_norm": 0.2108999788761139, "learning_rate": 0.0001, "loss": 2.0826, "step": 852 }, { "epoch": 0.5549772283669486, "grad_norm": 0.25388890504837036, "learning_rate": 0.0001, "loss": 2.5149, "step": 853 }, { "epoch": 0.5556278464541314, "grad_norm": 0.2074718177318573, "learning_rate": 0.0001, "loss": 1.9135, "step": 854 }, { "epoch": 0.5562784645413142, "grad_norm": 0.1992723047733307, "learning_rate": 0.0001, "loss": 2.186, "step": 855 }, { "epoch": 0.556929082628497, "grad_norm": 0.18721085786819458, "learning_rate": 0.0001, "loss": 1.9453, "step": 856 }, { "epoch": 0.5575797007156799, "grad_norm": 0.21606992185115814, "learning_rate": 0.0001, "loss": 2.1703, "step": 857 }, { "epoch": 0.5582303188028627, "grad_norm": 0.2854723334312439, "learning_rate": 0.0001, "loss": 2.9538, "step": 858 }, { "epoch": 0.5588809368900456, "grad_norm": 0.21503040194511414, "learning_rate": 0.0001, "loss": 2.0194, "step": 859 }, { "epoch": 0.5595315549772284, "grad_norm": 0.2690679430961609, "learning_rate": 0.0001, "loss": 2.1562, "step": 860 }, { "epoch": 0.5601821730644112, "grad_norm": 0.2811613976955414, "learning_rate": 0.0001, "loss": 2.2475, "step": 861 }, { "epoch": 0.560832791151594, "grad_norm": 0.2551681697368622, "learning_rate": 0.0001, "loss": 2.5585, "step": 862 }, { "epoch": 0.5614834092387768, "grad_norm": 0.21423856914043427, "learning_rate": 0.0001, "loss": 2.1194, "step": 863 }, { "epoch": 0.5621340273259596, "grad_norm": 0.22121264040470123, "learning_rate": 0.0001, "loss": 1.9257, "step": 864 }, { "epoch": 0.5627846454131424, "grad_norm": 0.38684332370758057, "learning_rate": 0.0001, "loss": 2.5203, "step": 865 }, { "epoch": 0.5634352635003254, "grad_norm": 0.20299634337425232, "learning_rate": 0.0001, "loss": 2.0868, "step": 866 }, { "epoch": 0.5640858815875082, "grad_norm": 0.33485493063926697, "learning_rate": 0.0001, "loss": 2.457, "step": 867 }, { "epoch": 0.564736499674691, "grad_norm": 0.23778866231441498, "learning_rate": 0.0001, "loss": 1.9863, "step": 868 }, { "epoch": 0.5653871177618738, "grad_norm": 0.18562458455562592, "learning_rate": 0.0001, "loss": 1.915, "step": 869 }, { "epoch": 0.5660377358490566, "grad_norm": 0.3780176341533661, "learning_rate": 0.0001, "loss": 2.5518, "step": 870 }, { "epoch": 0.5666883539362394, "grad_norm": 0.1924014538526535, "learning_rate": 0.0001, "loss": 2.0665, "step": 871 }, { "epoch": 0.5673389720234222, "grad_norm": 0.19788160920143127, "learning_rate": 0.0001, "loss": 1.9408, "step": 872 }, { "epoch": 0.5679895901106051, "grad_norm": 0.2435147911310196, "learning_rate": 0.0001, "loss": 2.3716, "step": 873 }, { "epoch": 0.5686402081977879, "grad_norm": 0.2023211270570755, "learning_rate": 0.0001, "loss": 2.2786, "step": 874 }, { "epoch": 0.5692908262849707, "grad_norm": 0.29936715960502625, "learning_rate": 0.0001, "loss": 2.6689, "step": 875 }, { "epoch": 0.5699414443721535, "grad_norm": 0.18846483528614044, "learning_rate": 0.0001, "loss": 1.9436, "step": 876 }, { "epoch": 0.5705920624593364, "grad_norm": 0.44592785835266113, "learning_rate": 0.0001, "loss": 2.8648, "step": 877 }, { "epoch": 0.5712426805465192, "grad_norm": 0.221640944480896, "learning_rate": 0.0001, "loss": 2.1613, "step": 878 }, { "epoch": 0.571893298633702, "grad_norm": 0.22345726191997528, "learning_rate": 0.0001, "loss": 2.076, "step": 879 }, { "epoch": 0.5725439167208849, "grad_norm": 0.20094214379787445, "learning_rate": 0.0001, "loss": 2.0474, "step": 880 }, { "epoch": 0.5731945348080677, "grad_norm": 0.1997043937444687, "learning_rate": 0.0001, "loss": 1.9812, "step": 881 }, { "epoch": 0.5738451528952505, "grad_norm": 0.3758605420589447, "learning_rate": 0.0001, "loss": 2.8357, "step": 882 }, { "epoch": 0.5744957709824333, "grad_norm": 0.2940578758716583, "learning_rate": 0.0001, "loss": 2.4955, "step": 883 }, { "epoch": 0.5751463890696161, "grad_norm": 0.2434762865304947, "learning_rate": 0.0001, "loss": 2.0011, "step": 884 }, { "epoch": 0.5757970071567989, "grad_norm": 0.24335308372974396, "learning_rate": 0.0001, "loss": 2.5458, "step": 885 }, { "epoch": 0.5764476252439817, "grad_norm": 0.2063351422548294, "learning_rate": 0.0001, "loss": 1.9801, "step": 886 }, { "epoch": 0.5770982433311646, "grad_norm": 0.35102301836013794, "learning_rate": 0.0001, "loss": 2.5647, "step": 887 }, { "epoch": 0.5777488614183475, "grad_norm": 0.22332875430583954, "learning_rate": 0.0001, "loss": 2.0542, "step": 888 }, { "epoch": 0.5783994795055303, "grad_norm": 0.2073124796152115, "learning_rate": 0.0001, "loss": 1.9348, "step": 889 }, { "epoch": 0.5790500975927131, "grad_norm": 0.21079733967781067, "learning_rate": 0.0001, "loss": 1.9829, "step": 890 }, { "epoch": 0.5797007156798959, "grad_norm": 0.2842913866043091, "learning_rate": 0.0001, "loss": 2.7215, "step": 891 }, { "epoch": 0.5803513337670787, "grad_norm": 0.2807595133781433, "learning_rate": 0.0001, "loss": 2.1827, "step": 892 }, { "epoch": 0.5810019518542615, "grad_norm": 0.24955599009990692, "learning_rate": 0.0001, "loss": 2.6246, "step": 893 }, { "epoch": 0.5816525699414443, "grad_norm": 0.23281241953372955, "learning_rate": 0.0001, "loss": 2.3944, "step": 894 }, { "epoch": 0.5823031880286272, "grad_norm": 0.2617682218551636, "learning_rate": 0.0001, "loss": 2.6147, "step": 895 }, { "epoch": 0.58295380611581, "grad_norm": 0.1915360391139984, "learning_rate": 0.0001, "loss": 2.0095, "step": 896 }, { "epoch": 0.5836044242029929, "grad_norm": 0.20270249247550964, "learning_rate": 0.0001, "loss": 1.8983, "step": 897 }, { "epoch": 0.5842550422901757, "grad_norm": 0.21804624795913696, "learning_rate": 0.0001, "loss": 2.0425, "step": 898 }, { "epoch": 0.5849056603773585, "grad_norm": 0.25326576828956604, "learning_rate": 0.0001, "loss": 2.4875, "step": 899 }, { "epoch": 0.5855562784645413, "grad_norm": 0.21714434027671814, "learning_rate": 0.0001, "loss": 2.269, "step": 900 } ], "logging_steps": 1, "max_steps": 1537, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3576622208712704e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }