| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 782, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0025575447570332483, |
| "grad_norm": 15.392987058824561, |
| "learning_rate": 9.999959651660741e-06, |
| "loss": 1.0123, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.005115089514066497, |
| "grad_norm": 9.468512560286664, |
| "learning_rate": 9.999838607294157e-06, |
| "loss": 0.7403, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0076726342710997444, |
| "grad_norm": 9.243581351204085, |
| "learning_rate": 9.999636868853824e-06, |
| "loss": 0.5837, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.010230179028132993, |
| "grad_norm": 6.9496549198580935, |
| "learning_rate": 9.999354439595668e-06, |
| "loss": 0.4631, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01278772378516624, |
| "grad_norm": 5.707870970454734, |
| "learning_rate": 9.998991324077906e-06, |
| "loss": 0.3324, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.015345268542199489, |
| "grad_norm": 4.709511789953965, |
| "learning_rate": 9.998547528160987e-06, |
| "loss": 0.3558, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.017902813299232736, |
| "grad_norm": 3.5534524642812655, |
| "learning_rate": 9.998023059007477e-06, |
| "loss": 0.2853, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.020460358056265986, |
| "grad_norm": 4.702487004348668, |
| "learning_rate": 9.997417925081963e-06, |
| "loss": 0.2791, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.023017902813299233, |
| "grad_norm": 5.73219181182079, |
| "learning_rate": 9.996732136150902e-06, |
| "loss": 0.2954, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.02557544757033248, |
| "grad_norm": 3.782084054111381, |
| "learning_rate": 9.995965703282472e-06, |
| "loss": 0.2448, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.028132992327365727, |
| "grad_norm": 4.761454357292183, |
| "learning_rate": 9.995118638846394e-06, |
| "loss": 0.3343, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.030690537084398978, |
| "grad_norm": 4.862414967949473, |
| "learning_rate": 9.99419095651372e-06, |
| "loss": 0.3326, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.03324808184143223, |
| "grad_norm": 4.287681311749821, |
| "learning_rate": 9.993182671256633e-06, |
| "loss": 0.3317, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.03580562659846547, |
| "grad_norm": 4.548249117039148, |
| "learning_rate": 9.992093799348182e-06, |
| "loss": 0.2939, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.03836317135549872, |
| "grad_norm": 4.306374147946709, |
| "learning_rate": 9.990924358362037e-06, |
| "loss": 0.2409, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04092071611253197, |
| "grad_norm": 3.4314319732642176, |
| "learning_rate": 9.9896743671722e-06, |
| "loss": 0.3049, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.043478260869565216, |
| "grad_norm": 4.114297236669634, |
| "learning_rate": 9.988343845952697e-06, |
| "loss": 0.314, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.04603580562659847, |
| "grad_norm": 3.6558902621855167, |
| "learning_rate": 9.986932816177258e-06, |
| "loss": 0.2786, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.04859335038363171, |
| "grad_norm": 3.451023121802292, |
| "learning_rate": 9.985441300618966e-06, |
| "loss": 0.2949, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.05115089514066496, |
| "grad_norm": 3.8360280897630745, |
| "learning_rate": 9.98386932334989e-06, |
| "loss": 0.2868, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05370843989769821, |
| "grad_norm": 3.104220866344891, |
| "learning_rate": 9.982216909740703e-06, |
| "loss": 0.2371, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.056265984654731455, |
| "grad_norm": 3.2036209599473193, |
| "learning_rate": 9.980484086460258e-06, |
| "loss": 0.2853, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.058823529411764705, |
| "grad_norm": 3.997000948759866, |
| "learning_rate": 9.978670881475173e-06, |
| "loss": 0.2551, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.061381074168797956, |
| "grad_norm": 4.22698810804951, |
| "learning_rate": 9.976777324049374e-06, |
| "loss": 0.2897, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.0639386189258312, |
| "grad_norm": 3.666060568124115, |
| "learning_rate": 9.974803444743617e-06, |
| "loss": 0.2985, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06649616368286446, |
| "grad_norm": 3.7518838770389342, |
| "learning_rate": 9.972749275415005e-06, |
| "loss": 0.2834, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.06905370843989769, |
| "grad_norm": 3.870535915573906, |
| "learning_rate": 9.970614849216465e-06, |
| "loss": 0.2385, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.07161125319693094, |
| "grad_norm": 3.6607655718950056, |
| "learning_rate": 9.96840020059622e-06, |
| "loss": 0.3131, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0741687979539642, |
| "grad_norm": 3.475270449404676, |
| "learning_rate": 9.966105365297226e-06, |
| "loss": 0.2588, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.07672634271099744, |
| "grad_norm": 4.200264770629892, |
| "learning_rate": 9.963730380356599e-06, |
| "loss": 0.2746, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0792838874680307, |
| "grad_norm": 4.372784076877616, |
| "learning_rate": 9.96127528410502e-06, |
| "loss": 0.3288, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.08184143222506395, |
| "grad_norm": 3.6840961909503727, |
| "learning_rate": 9.958740116166113e-06, |
| "loss": 0.2797, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.08439897698209718, |
| "grad_norm": 3.660857860053387, |
| "learning_rate": 9.9561249174558e-06, |
| "loss": 0.2552, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.08695652173913043, |
| "grad_norm": 3.843432399530098, |
| "learning_rate": 9.953429730181653e-06, |
| "loss": 0.3066, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.08951406649616368, |
| "grad_norm": 3.863125954202528, |
| "learning_rate": 9.950654597842209e-06, |
| "loss": 0.281, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09207161125319693, |
| "grad_norm": 3.233755549464827, |
| "learning_rate": 9.947799565226253e-06, |
| "loss": 0.2684, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.09462915601023018, |
| "grad_norm": 4.19966407483748, |
| "learning_rate": 9.944864678412118e-06, |
| "loss": 0.2856, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.09718670076726342, |
| "grad_norm": 3.165753312848627, |
| "learning_rate": 9.94184998476693e-06, |
| "loss": 0.24, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.09974424552429667, |
| "grad_norm": 3.4710302005614277, |
| "learning_rate": 9.938755532945838e-06, |
| "loss": 0.293, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.10230179028132992, |
| "grad_norm": 3.2439371920078663, |
| "learning_rate": 9.93558137289124e-06, |
| "loss": 0.2786, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10485933503836317, |
| "grad_norm": 3.133329636506038, |
| "learning_rate": 9.932327555831972e-06, |
| "loss": 0.1905, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.10741687979539642, |
| "grad_norm": 3.041802191982791, |
| "learning_rate": 9.928994134282477e-06, |
| "loss": 0.2307, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.10997442455242967, |
| "grad_norm": 3.44216600685852, |
| "learning_rate": 9.925581162041967e-06, |
| "loss": 0.2273, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.11253196930946291, |
| "grad_norm": 3.2284774978139663, |
| "learning_rate": 9.922088694193546e-06, |
| "loss": 0.2279, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.11508951406649616, |
| "grad_norm": 2.7419938932220442, |
| "learning_rate": 9.918516787103322e-06, |
| "loss": 0.2278, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.11764705882352941, |
| "grad_norm": 3.7438191859843313, |
| "learning_rate": 9.91486549841951e-06, |
| "loss": 0.2512, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.12020460358056266, |
| "grad_norm": 3.435535432268392, |
| "learning_rate": 9.911134887071477e-06, |
| "loss": 0.2619, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.12276214833759591, |
| "grad_norm": 3.4627560471149756, |
| "learning_rate": 9.907325013268816e-06, |
| "loss": 0.2711, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.12531969309462915, |
| "grad_norm": 2.7213735709949196, |
| "learning_rate": 9.903435938500356e-06, |
| "loss": 0.1918, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.1278772378516624, |
| "grad_norm": 3.0775197585007343, |
| "learning_rate": 9.899467725533181e-06, |
| "loss": 0.2259, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13043478260869565, |
| "grad_norm": 3.6206763525611274, |
| "learning_rate": 9.895420438411616e-06, |
| "loss": 0.2572, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.1329923273657289, |
| "grad_norm": 3.898289061481641, |
| "learning_rate": 9.89129414245618e-06, |
| "loss": 0.2254, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.13554987212276215, |
| "grad_norm": 3.6029485399959778, |
| "learning_rate": 9.887088904262557e-06, |
| "loss": 0.2347, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.13810741687979539, |
| "grad_norm": 3.4988251793641676, |
| "learning_rate": 9.882804791700488e-06, |
| "loss": 0.2594, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.14066496163682865, |
| "grad_norm": 3.305043823936606, |
| "learning_rate": 9.878441873912712e-06, |
| "loss": 0.2443, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1432225063938619, |
| "grad_norm": 3.8391979100290206, |
| "learning_rate": 9.87400022131382e-06, |
| "loss": 0.2909, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.14578005115089515, |
| "grad_norm": 34.49457439956216, |
| "learning_rate": 9.869479905589136e-06, |
| "loss": 0.1952, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.1483375959079284, |
| "grad_norm": 5.486937250031213, |
| "learning_rate": 9.864880999693551e-06, |
| "loss": 0.1897, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.15089514066496162, |
| "grad_norm": 3.457559155846861, |
| "learning_rate": 9.860203577850353e-06, |
| "loss": 0.2333, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.1534526854219949, |
| "grad_norm": 3.551347913898386, |
| "learning_rate": 9.855447715550024e-06, |
| "loss": 0.232, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15601023017902813, |
| "grad_norm": 3.029321243204479, |
| "learning_rate": 9.850613489549018e-06, |
| "loss": 0.1929, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.1585677749360614, |
| "grad_norm": 3.994451029674619, |
| "learning_rate": 9.845700977868536e-06, |
| "loss": 0.2771, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.16112531969309463, |
| "grad_norm": 2.9207143827429594, |
| "learning_rate": 9.840710259793251e-06, |
| "loss": 0.1973, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.1636828644501279, |
| "grad_norm": 3.288151604488407, |
| "learning_rate": 9.835641415870038e-06, |
| "loss": 0.2833, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.16624040920716113, |
| "grad_norm": 3.1704268328889285, |
| "learning_rate": 9.830494527906671e-06, |
| "loss": 0.1916, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.16879795396419436, |
| "grad_norm": 3.0478461733192677, |
| "learning_rate": 9.825269678970502e-06, |
| "loss": 0.227, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.17135549872122763, |
| "grad_norm": 3.247187893935764, |
| "learning_rate": 9.819966953387122e-06, |
| "loss": 0.2379, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.17391304347826086, |
| "grad_norm": 3.7351933963260597, |
| "learning_rate": 9.814586436738998e-06, |
| "loss": 0.2284, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.17647058823529413, |
| "grad_norm": 3.562240712811416, |
| "learning_rate": 9.809128215864096e-06, |
| "loss": 0.2875, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.17902813299232737, |
| "grad_norm": 2.6707181487795264, |
| "learning_rate": 9.803592378854476e-06, |
| "loss": 0.2144, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1815856777493606, |
| "grad_norm": 2.932314569007143, |
| "learning_rate": 9.797979015054868e-06, |
| "loss": 0.2432, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.18414322250639387, |
| "grad_norm": 3.2103231515107105, |
| "learning_rate": 9.792288215061237e-06, |
| "loss": 0.2341, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.1867007672634271, |
| "grad_norm": 3.640482608271676, |
| "learning_rate": 9.786520070719313e-06, |
| "loss": 0.1941, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.18925831202046037, |
| "grad_norm": 2.9037943123818075, |
| "learning_rate": 9.780674675123113e-06, |
| "loss": 0.204, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.1918158567774936, |
| "grad_norm": 3.158767038474441, |
| "learning_rate": 9.77475212261344e-06, |
| "loss": 0.2453, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.19437340153452684, |
| "grad_norm": 3.1234088193628917, |
| "learning_rate": 9.768752508776358e-06, |
| "loss": 0.1783, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.1969309462915601, |
| "grad_norm": 2.8268077914257312, |
| "learning_rate": 9.762675930441647e-06, |
| "loss": 0.1986, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.19948849104859334, |
| "grad_norm": 2.9513681306595343, |
| "learning_rate": 9.756522485681247e-06, |
| "loss": 0.2365, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2020460358056266, |
| "grad_norm": 4.828985495990047, |
| "learning_rate": 9.750292273807666e-06, |
| "loss": 0.2922, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.20460358056265984, |
| "grad_norm": 4.12969908799372, |
| "learning_rate": 9.743985395372387e-06, |
| "loss": 0.2747, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2071611253196931, |
| "grad_norm": 3.8828623504287374, |
| "learning_rate": 9.737601952164238e-06, |
| "loss": 0.2756, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.20971867007672634, |
| "grad_norm": 3.1532945515057427, |
| "learning_rate": 9.73114204720775e-06, |
| "loss": 0.2399, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.21227621483375958, |
| "grad_norm": 2.768144038570148, |
| "learning_rate": 9.724605784761501e-06, |
| "loss": 0.2225, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.21483375959079284, |
| "grad_norm": 3.0936033820891544, |
| "learning_rate": 9.717993270316421e-06, |
| "loss": 0.1855, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.21739130434782608, |
| "grad_norm": 2.874646258205036, |
| "learning_rate": 9.711304610594104e-06, |
| "loss": 0.2511, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.21994884910485935, |
| "grad_norm": 2.631348523046017, |
| "learning_rate": 9.704539913545073e-06, |
| "loss": 0.2081, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.22250639386189258, |
| "grad_norm": 2.8254213731499336, |
| "learning_rate": 9.697699288347043e-06, |
| "loss": 0.1861, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.22506393861892582, |
| "grad_norm": 2.4651414344097526, |
| "learning_rate": 9.690782845403164e-06, |
| "loss": 0.182, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.22762148337595908, |
| "grad_norm": 2.837322369457593, |
| "learning_rate": 9.683790696340229e-06, |
| "loss": 0.211, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.23017902813299232, |
| "grad_norm": 3.370816547248047, |
| "learning_rate": 9.676722954006878e-06, |
| "loss": 0.2782, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.23273657289002558, |
| "grad_norm": 3.7835641834887843, |
| "learning_rate": 9.669579732471779e-06, |
| "loss": 0.3041, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.23529411764705882, |
| "grad_norm": 3.296489483167239, |
| "learning_rate": 9.66236114702178e-06, |
| "loss": 0.23, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.23785166240409208, |
| "grad_norm": 2.8890092480403564, |
| "learning_rate": 9.655067314160058e-06, |
| "loss": 0.2149, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.24040920716112532, |
| "grad_norm": 3.429189900304709, |
| "learning_rate": 9.647698351604227e-06, |
| "loss": 0.301, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.24296675191815856, |
| "grad_norm": 3.3850684920397245, |
| "learning_rate": 9.640254378284447e-06, |
| "loss": 0.2723, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.24552429667519182, |
| "grad_norm": 3.0904355382979016, |
| "learning_rate": 9.632735514341508e-06, |
| "loss": 0.2598, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.24808184143222506, |
| "grad_norm": 3.7186808278209, |
| "learning_rate": 9.625141881124874e-06, |
| "loss": 0.2948, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.2506393861892583, |
| "grad_norm": 3.150975002506145, |
| "learning_rate": 9.617473601190743e-06, |
| "loss": 0.2143, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.2531969309462916, |
| "grad_norm": 23.69858423180905, |
| "learning_rate": 9.609730798300056e-06, |
| "loss": 0.1867, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.2557544757033248, |
| "grad_norm": 2.9299967673432215, |
| "learning_rate": 9.601913597416513e-06, |
| "loss": 0.2168, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.25831202046035806, |
| "grad_norm": 4.058096387207032, |
| "learning_rate": 9.594022124704541e-06, |
| "loss": 0.3437, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.2608695652173913, |
| "grad_norm": 2.4370542742741614, |
| "learning_rate": 9.586056507527266e-06, |
| "loss": 0.2073, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.26342710997442453, |
| "grad_norm": 3.1921593576184355, |
| "learning_rate": 9.578016874444459e-06, |
| "loss": 0.216, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.2659846547314578, |
| "grad_norm": 2.1088920969603056, |
| "learning_rate": 9.569903355210457e-06, |
| "loss": 0.1807, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.26854219948849106, |
| "grad_norm": 3.2993263422325976, |
| "learning_rate": 9.561716080772072e-06, |
| "loss": 0.2389, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.2710997442455243, |
| "grad_norm": 3.7311120165895475, |
| "learning_rate": 9.55345518326647e-06, |
| "loss": 0.2998, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.27365728900255754, |
| "grad_norm": 4.097055377202607, |
| "learning_rate": 9.545120796019056e-06, |
| "loss": 0.2108, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.27621483375959077, |
| "grad_norm": 3.2362396004576657, |
| "learning_rate": 9.5367130535413e-06, |
| "loss": 0.2854, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.27877237851662406, |
| "grad_norm": 3.371909008316518, |
| "learning_rate": 9.528232091528578e-06, |
| "loss": 0.2293, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.2813299232736573, |
| "grad_norm": 2.5846598885848495, |
| "learning_rate": 9.519678046857987e-06, |
| "loss": 0.232, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.28388746803069054, |
| "grad_norm": 2.8625625809663573, |
| "learning_rate": 9.511051057586125e-06, |
| "loss": 0.2534, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.2864450127877238, |
| "grad_norm": 2.4251750007131037, |
| "learning_rate": 9.502351262946865e-06, |
| "loss": 0.224, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.289002557544757, |
| "grad_norm": 2.4385429235387477, |
| "learning_rate": 9.493578803349117e-06, |
| "loss": 0.1934, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.2915601023017903, |
| "grad_norm": 3.112915617413732, |
| "learning_rate": 9.48473382037455e-06, |
| "loss": 0.2592, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.29411764705882354, |
| "grad_norm": 3.1095060452385375, |
| "learning_rate": 9.475816456775313e-06, |
| "loss": 0.2748, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2966751918158568, |
| "grad_norm": 3.193274362044026, |
| "learning_rate": 9.466826856471728e-06, |
| "loss": 0.2443, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.29923273657289, |
| "grad_norm": 2.805529582638814, |
| "learning_rate": 9.457765164549979e-06, |
| "loss": 0.2351, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.30179028132992325, |
| "grad_norm": 4.497751524982449, |
| "learning_rate": 9.448631527259749e-06, |
| "loss": 0.3551, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.30434782608695654, |
| "grad_norm": 2.9074901684200163, |
| "learning_rate": 9.439426092011877e-06, |
| "loss": 0.2004, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3069053708439898, |
| "grad_norm": 2.974411031118463, |
| "learning_rate": 9.430149007375974e-06, |
| "loss": 0.2605, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.309462915601023, |
| "grad_norm": 2.8491332590658702, |
| "learning_rate": 9.42080042307802e-06, |
| "loss": 0.2337, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.31202046035805625, |
| "grad_norm": 2.3669589821998915, |
| "learning_rate": 9.411380489997962e-06, |
| "loss": 0.1974, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.3145780051150895, |
| "grad_norm": 3.347163072711235, |
| "learning_rate": 9.401889360167256e-06, |
| "loss": 0.2662, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.3171355498721228, |
| "grad_norm": 2.827025371885836, |
| "learning_rate": 9.392327186766434e-06, |
| "loss": 0.2275, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.319693094629156, |
| "grad_norm": 2.4402905299053983, |
| "learning_rate": 9.382694124122624e-06, |
| "loss": 0.2067, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.32225063938618925, |
| "grad_norm": 2.595701436925841, |
| "learning_rate": 9.372990327707057e-06, |
| "loss": 0.1936, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.3248081841432225, |
| "grad_norm": 2.972919332628706, |
| "learning_rate": 9.36321595413256e-06, |
| "loss": 0.2284, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.3273657289002558, |
| "grad_norm": 2.96761954478865, |
| "learning_rate": 9.353371161151032e-06, |
| "loss": 0.3038, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.329923273657289, |
| "grad_norm": 2.6927171651588337, |
| "learning_rate": 9.34345610765089e-06, |
| "loss": 0.1787, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.33248081841432225, |
| "grad_norm": 3.5431583100736863, |
| "learning_rate": 9.333470953654513e-06, |
| "loss": 0.3147, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3350383631713555, |
| "grad_norm": 2.632290215476919, |
| "learning_rate": 9.32341586031565e-06, |
| "loss": 0.2647, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.3375959079283887, |
| "grad_norm": 3.2469034068085736, |
| "learning_rate": 9.31329098991683e-06, |
| "loss": 0.2767, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.340153452685422, |
| "grad_norm": 3.3387196083764823, |
| "learning_rate": 9.303096505866734e-06, |
| "loss": 0.2513, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.34271099744245526, |
| "grad_norm": 2.6511165133843186, |
| "learning_rate": 9.292832572697566e-06, |
| "loss": 0.2379, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.3452685421994885, |
| "grad_norm": 2.270613271667338, |
| "learning_rate": 9.282499356062385e-06, |
| "loss": 0.2181, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.34782608695652173, |
| "grad_norm": 2.7845455382154585, |
| "learning_rate": 9.272097022732444e-06, |
| "loss": 0.1959, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.35038363171355497, |
| "grad_norm": 3.359968320145697, |
| "learning_rate": 9.261625740594494e-06, |
| "loss": 0.2432, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.35294117647058826, |
| "grad_norm": 3.225209792269358, |
| "learning_rate": 9.251085678648072e-06, |
| "loss": 0.2534, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.3554987212276215, |
| "grad_norm": 2.833166106658051, |
| "learning_rate": 9.240477007002777e-06, |
| "loss": 0.2156, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.35805626598465473, |
| "grad_norm": 3.4490482806712626, |
| "learning_rate": 9.22979989687552e-06, |
| "loss": 0.2963, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.36061381074168797, |
| "grad_norm": 3.1464710421043365, |
| "learning_rate": 9.219054520587766e-06, |
| "loss": 0.1821, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.3631713554987212, |
| "grad_norm": 2.5959703410215114, |
| "learning_rate": 9.208241051562753e-06, |
| "loss": 0.2277, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.3657289002557545, |
| "grad_norm": 3.044870699089069, |
| "learning_rate": 9.197359664322684e-06, |
| "loss": 0.2234, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.36828644501278773, |
| "grad_norm": 3.4399399781668403, |
| "learning_rate": 9.186410534485924e-06, |
| "loss": 0.2574, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.37084398976982097, |
| "grad_norm": 2.673893462327474, |
| "learning_rate": 9.175393838764153e-06, |
| "loss": 0.2054, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3734015345268542, |
| "grad_norm": 2.444687752494406, |
| "learning_rate": 9.164309754959523e-06, |
| "loss": 0.207, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.37595907928388744, |
| "grad_norm": 2.6282321631694248, |
| "learning_rate": 9.153158461961782e-06, |
| "loss": 0.1948, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.37851662404092073, |
| "grad_norm": 2.4890084427480588, |
| "learning_rate": 9.14194013974539e-06, |
| "loss": 0.186, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.38107416879795397, |
| "grad_norm": 2.5896447502938633, |
| "learning_rate": 9.130654969366619e-06, |
| "loss": 0.2275, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.3836317135549872, |
| "grad_norm": 2.564119331980231, |
| "learning_rate": 9.11930313296062e-06, |
| "loss": 0.213, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.38618925831202044, |
| "grad_norm": 3.224590304098589, |
| "learning_rate": 9.107884813738492e-06, |
| "loss": 0.283, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.3887468030690537, |
| "grad_norm": 2.715718216923276, |
| "learning_rate": 9.096400195984322e-06, |
| "loss": 0.1769, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.391304347826087, |
| "grad_norm": 2.9116689180007698, |
| "learning_rate": 9.08484946505221e-06, |
| "loss": 0.2214, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.3938618925831202, |
| "grad_norm": 2.855052145835007, |
| "learning_rate": 9.073232807363283e-06, |
| "loss": 0.2181, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.39641943734015345, |
| "grad_norm": 3.579582085441427, |
| "learning_rate": 9.061550410402677e-06, |
| "loss": 0.3031, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.3989769820971867, |
| "grad_norm": 3.3622889576332122, |
| "learning_rate": 9.049802462716521e-06, |
| "loss": 0.2345, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.40153452685422, |
| "grad_norm": 3.5682342482734684, |
| "learning_rate": 9.037989153908882e-06, |
| "loss": 0.2558, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.4040920716112532, |
| "grad_norm": 2.541343662604646, |
| "learning_rate": 9.026110674638722e-06, |
| "loss": 0.2171, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.40664961636828645, |
| "grad_norm": 2.7014269251115097, |
| "learning_rate": 9.0141672166168e-06, |
| "loss": 0.1616, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.4092071611253197, |
| "grad_norm": 3.069690328482181, |
| "learning_rate": 9.002158972602599e-06, |
| "loss": 0.3043, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4117647058823529, |
| "grad_norm": 2.922623620430193, |
| "learning_rate": 8.990086136401199e-06, |
| "loss": 0.2376, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.4143222506393862, |
| "grad_norm": 3.792442645821336, |
| "learning_rate": 8.977948902860154e-06, |
| "loss": 0.2899, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.41687979539641945, |
| "grad_norm": 2.831662241450309, |
| "learning_rate": 8.965747467866355e-06, |
| "loss": 0.1795, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.4194373401534527, |
| "grad_norm": 2.800496787119197, |
| "learning_rate": 8.953482028342853e-06, |
| "loss": 0.2936, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.4219948849104859, |
| "grad_norm": 2.5546124203822327, |
| "learning_rate": 8.9411527822457e-06, |
| "loss": 0.2064, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.42455242966751916, |
| "grad_norm": 3.0514576119776153, |
| "learning_rate": 8.92875992856073e-06, |
| "loss": 0.2617, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.42710997442455245, |
| "grad_norm": 2.283609395848259, |
| "learning_rate": 8.916303667300373e-06, |
| "loss": 0.181, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.4296675191815857, |
| "grad_norm": 3.1823314702001184, |
| "learning_rate": 8.903784199500412e-06, |
| "loss": 0.2184, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.4322250639386189, |
| "grad_norm": 3.087529114047624, |
| "learning_rate": 8.89120172721674e-06, |
| "loss": 0.218, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.43478260869565216, |
| "grad_norm": 3.2616256957739957, |
| "learning_rate": 8.8785564535221e-06, |
| "loss": 0.2138, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4373401534526854, |
| "grad_norm": 3.2597963602492928, |
| "learning_rate": 8.86584858250281e-06, |
| "loss": 0.2471, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.4398976982097187, |
| "grad_norm": 2.892437025795672, |
| "learning_rate": 8.853078319255466e-06, |
| "loss": 0.2423, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.4424552429667519, |
| "grad_norm": 2.7606976408563257, |
| "learning_rate": 8.840245869883635e-06, |
| "loss": 0.2559, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.44501278772378516, |
| "grad_norm": 2.172044285789016, |
| "learning_rate": 8.827351441494525e-06, |
| "loss": 0.204, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.4475703324808184, |
| "grad_norm": 3.0017261477691104, |
| "learning_rate": 8.814395242195642e-06, |
| "loss": 0.2776, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.45012787723785164, |
| "grad_norm": 2.648838834500726, |
| "learning_rate": 8.80137748109144e-06, |
| "loss": 0.2085, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.45268542199488493, |
| "grad_norm": 2.035533604532462, |
| "learning_rate": 8.78829836827993e-06, |
| "loss": 0.1803, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.45524296675191817, |
| "grad_norm": 2.2008376049863148, |
| "learning_rate": 8.77515811484931e-06, |
| "loss": 0.2172, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.4578005115089514, |
| "grad_norm": 2.151726038676321, |
| "learning_rate": 8.761956932874539e-06, |
| "loss": 0.1737, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.46035805626598464, |
| "grad_norm": 2.8305977619166263, |
| "learning_rate": 8.748695035413925e-06, |
| "loss": 0.2258, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4629156010230179, |
| "grad_norm": 2.2823582846606705, |
| "learning_rate": 8.735372636505681e-06, |
| "loss": 0.2186, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.46547314578005117, |
| "grad_norm": 2.423531563066667, |
| "learning_rate": 8.72198995116448e-06, |
| "loss": 0.2407, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.4680306905370844, |
| "grad_norm": 2.534940058147962, |
| "learning_rate": 8.708547195377968e-06, |
| "loss": 0.2939, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.47058823529411764, |
| "grad_norm": 2.8644782418220296, |
| "learning_rate": 8.695044586103297e-06, |
| "loss": 0.2317, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.4731457800511509, |
| "grad_norm": 3.673887217855088, |
| "learning_rate": 8.68148234126361e-06, |
| "loss": 0.2833, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.47570332480818417, |
| "grad_norm": 2.749104058931232, |
| "learning_rate": 8.667860679744529e-06, |
| "loss": 0.205, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.4782608695652174, |
| "grad_norm": 2.881858956427076, |
| "learning_rate": 8.65417982139062e-06, |
| "loss": 0.2373, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.48081841432225064, |
| "grad_norm": 2.441774845496025, |
| "learning_rate": 8.640439987001855e-06, |
| "loss": 0.1988, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.4833759590792839, |
| "grad_norm": 2.680220627075973, |
| "learning_rate": 8.626641398330027e-06, |
| "loss": 0.2128, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.4859335038363171, |
| "grad_norm": 2.1697481026879144, |
| "learning_rate": 8.612784278075195e-06, |
| "loss": 0.2085, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4884910485933504, |
| "grad_norm": 2.682355105066521, |
| "learning_rate": 8.598868849882074e-06, |
| "loss": 0.2354, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.49104859335038364, |
| "grad_norm": 2.470410385763528, |
| "learning_rate": 8.58489533833643e-06, |
| "loss": 0.1969, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.4936061381074169, |
| "grad_norm": 2.311090258410845, |
| "learning_rate": 8.570863968961456e-06, |
| "loss": 0.1628, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.4961636828644501, |
| "grad_norm": 2.069853143088041, |
| "learning_rate": 8.556774968214134e-06, |
| "loss": 0.2108, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.49872122762148335, |
| "grad_norm": 2.245346110916918, |
| "learning_rate": 8.542628563481577e-06, |
| "loss": 0.2197, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.5012787723785166, |
| "grad_norm": 2.5537303598871426, |
| "learning_rate": 8.52842498307736e-06, |
| "loss": 0.2529, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.5038363171355499, |
| "grad_norm": 2.7454118710249924, |
| "learning_rate": 8.514164456237835e-06, |
| "loss": 0.2372, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.5063938618925832, |
| "grad_norm": 2.6858558224066176, |
| "learning_rate": 8.499847213118431e-06, |
| "loss": 0.256, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.5089514066496164, |
| "grad_norm": 2.790784827198894, |
| "learning_rate": 8.485473484789944e-06, |
| "loss": 0.263, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.5115089514066496, |
| "grad_norm": 2.243606795545935, |
| "learning_rate": 8.471043503234796e-06, |
| "loss": 0.2317, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5140664961636828, |
| "grad_norm": 3.2990716039658685, |
| "learning_rate": 8.45655750134331e-06, |
| "loss": 0.2392, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.5166240409207161, |
| "grad_norm": 2.193537985146762, |
| "learning_rate": 8.442015712909926e-06, |
| "loss": 0.1975, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.5191815856777494, |
| "grad_norm": 2.3337401949778496, |
| "learning_rate": 8.427418372629456e-06, |
| "loss": 0.199, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.5217391304347826, |
| "grad_norm": 2.472600441004196, |
| "learning_rate": 8.412765716093273e-06, |
| "loss": 0.2034, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.5242966751918159, |
| "grad_norm": 1.9955637536104787, |
| "learning_rate": 8.398057979785515e-06, |
| "loss": 0.1746, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.5268542199488491, |
| "grad_norm": 2.4509873939651987, |
| "learning_rate": 8.383295401079284e-06, |
| "loss": 0.2152, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.5294117647058824, |
| "grad_norm": 2.8716129683382348, |
| "learning_rate": 8.368478218232787e-06, |
| "loss": 0.2504, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.5319693094629157, |
| "grad_norm": 2.243120617246336, |
| "learning_rate": 8.353606670385514e-06, |
| "loss": 0.1801, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.5345268542199488, |
| "grad_norm": 2.215427775181383, |
| "learning_rate": 8.338680997554372e-06, |
| "loss": 0.2024, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.5370843989769821, |
| "grad_norm": 2.2038368206477386, |
| "learning_rate": 8.3237014406298e-06, |
| "loss": 0.2008, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5396419437340153, |
| "grad_norm": 2.2158803371127354, |
| "learning_rate": 8.308668241371897e-06, |
| "loss": 0.1498, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.5421994884910486, |
| "grad_norm": 2.1939303716398544, |
| "learning_rate": 8.293581642406517e-06, |
| "loss": 0.1791, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.5447570332480819, |
| "grad_norm": 3.5427690009814103, |
| "learning_rate": 8.278441887221338e-06, |
| "loss": 0.2976, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.5473145780051151, |
| "grad_norm": 2.2422966046960426, |
| "learning_rate": 8.263249220161957e-06, |
| "loss": 0.1768, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.5498721227621484, |
| "grad_norm": 2.3319047512662, |
| "learning_rate": 8.248003886427927e-06, |
| "loss": 0.1948, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5524296675191815, |
| "grad_norm": 2.213024238241374, |
| "learning_rate": 8.232706132068806e-06, |
| "loss": 0.1195, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.5549872122762148, |
| "grad_norm": 3.1675916876159262, |
| "learning_rate": 8.217356203980187e-06, |
| "loss": 0.193, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.5575447570332481, |
| "grad_norm": 2.1931556698548826, |
| "learning_rate": 8.201954349899712e-06, |
| "loss": 0.2183, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.5601023017902813, |
| "grad_norm": 2.1548375973876848, |
| "learning_rate": 8.186500818403076e-06, |
| "loss": 0.1331, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.5626598465473146, |
| "grad_norm": 2.8937805193605017, |
| "learning_rate": 8.17099585890001e-06, |
| "loss": 0.2457, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5652173913043478, |
| "grad_norm": 2.1169776599974295, |
| "learning_rate": 8.155439721630265e-06, |
| "loss": 0.169, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.5677749360613811, |
| "grad_norm": 2.4695153062225788, |
| "learning_rate": 8.139832657659557e-06, |
| "loss": 0.2079, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.5703324808184144, |
| "grad_norm": 2.7432218121780942, |
| "learning_rate": 8.124174918875532e-06, |
| "loss": 0.2972, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.5728900255754475, |
| "grad_norm": 2.8180099396258296, |
| "learning_rate": 8.108466757983695e-06, |
| "loss": 0.2098, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.5754475703324808, |
| "grad_norm": 2.3696765683916508, |
| "learning_rate": 8.092708428503324e-06, |
| "loss": 0.1861, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.578005115089514, |
| "grad_norm": 3.7161214440329964, |
| "learning_rate": 8.076900184763394e-06, |
| "loss": 0.2049, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.5805626598465473, |
| "grad_norm": 2.353616152240531, |
| "learning_rate": 8.061042281898453e-06, |
| "loss": 0.2029, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.5831202046035806, |
| "grad_norm": 2.6071768383498917, |
| "learning_rate": 8.04513497584452e-06, |
| "loss": 0.207, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.5856777493606138, |
| "grad_norm": 2.871861928589528, |
| "learning_rate": 8.02917852333495e-06, |
| "loss": 0.274, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.5882352941176471, |
| "grad_norm": 2.6285736054402036, |
| "learning_rate": 8.013173181896283e-06, |
| "loss": 0.2626, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5907928388746803, |
| "grad_norm": 2.4148081233137306, |
| "learning_rate": 7.9971192098441e-06, |
| "loss": 0.1275, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.5933503836317136, |
| "grad_norm": 2.5465699530358643, |
| "learning_rate": 7.981016866278843e-06, |
| "loss": 0.1969, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.5959079283887468, |
| "grad_norm": 2.45509535000344, |
| "learning_rate": 7.964866411081645e-06, |
| "loss": 0.2047, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.59846547314578, |
| "grad_norm": 2.379257503910445, |
| "learning_rate": 7.94866810491012e-06, |
| "loss": 0.2049, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.6010230179028133, |
| "grad_norm": 2.62551198927104, |
| "learning_rate": 7.93242220919417e-06, |
| "loss": 0.2194, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.6035805626598465, |
| "grad_norm": 2.154755756722019, |
| "learning_rate": 7.916128986131761e-06, |
| "loss": 0.1858, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.6061381074168798, |
| "grad_norm": 2.15917280811157, |
| "learning_rate": 7.899788698684687e-06, |
| "loss": 0.197, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.6086956521739131, |
| "grad_norm": 2.9311613451724146, |
| "learning_rate": 7.883401610574338e-06, |
| "loss": 0.2576, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.6112531969309463, |
| "grad_norm": 1.9858065195675783, |
| "learning_rate": 7.866967986277423e-06, |
| "loss": 0.1448, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.6138107416879796, |
| "grad_norm": 2.4479250215085244, |
| "learning_rate": 7.850488091021726e-06, |
| "loss": 0.241, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6163682864450127, |
| "grad_norm": 2.9274230717517873, |
| "learning_rate": 7.833962190781809e-06, |
| "loss": 0.2245, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.618925831202046, |
| "grad_norm": 2.0046579280922185, |
| "learning_rate": 7.817390552274721e-06, |
| "loss": 0.1377, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.6214833759590793, |
| "grad_norm": 2.688691129256327, |
| "learning_rate": 7.800773442955703e-06, |
| "loss": 0.2236, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.6240409207161125, |
| "grad_norm": 2.4420882136374384, |
| "learning_rate": 7.784111131013858e-06, |
| "loss": 0.1967, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.6265984654731458, |
| "grad_norm": 2.395059320305546, |
| "learning_rate": 7.767403885367832e-06, |
| "loss": 0.1821, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.629156010230179, |
| "grad_norm": 3.1294024434588468, |
| "learning_rate": 7.750651975661471e-06, |
| "loss": 0.2713, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.6317135549872123, |
| "grad_norm": 2.4451446949008253, |
| "learning_rate": 7.733855672259472e-06, |
| "loss": 0.2237, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.6342710997442456, |
| "grad_norm": 2.0190388654625173, |
| "learning_rate": 7.717015246243012e-06, |
| "loss": 0.1576, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.6368286445012787, |
| "grad_norm": 2.8493759397799425, |
| "learning_rate": 7.700130969405377e-06, |
| "loss": 0.2196, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.639386189258312, |
| "grad_norm": 2.9296895450026006, |
| "learning_rate": 7.683203114247587e-06, |
| "loss": 0.2143, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6419437340153452, |
| "grad_norm": 2.711043376771317, |
| "learning_rate": 7.66623195397397e-06, |
| "loss": 0.2276, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.6445012787723785, |
| "grad_norm": 2.8063349249807827, |
| "learning_rate": 7.649217762487786e-06, |
| "loss": 0.243, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.6470588235294118, |
| "grad_norm": 2.25381162153375, |
| "learning_rate": 7.63216081438678e-06, |
| "loss": 0.1902, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.649616368286445, |
| "grad_norm": 2.0621493452179105, |
| "learning_rate": 7.615061384958764e-06, |
| "loss": 0.2048, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.6521739130434783, |
| "grad_norm": 2.8241949462654294, |
| "learning_rate": 7.597919750177168e-06, |
| "loss": 0.2255, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6547314578005116, |
| "grad_norm": 2.283822865220664, |
| "learning_rate": 7.580736186696593e-06, |
| "loss": 0.1865, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.6572890025575447, |
| "grad_norm": 2.9381754097047725, |
| "learning_rate": 7.563510971848339e-06, |
| "loss": 0.2054, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.659846547314578, |
| "grad_norm": 2.411395623614636, |
| "learning_rate": 7.546244383635929e-06, |
| "loss": 0.2523, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.6624040920716112, |
| "grad_norm": 2.3344681353933594, |
| "learning_rate": 7.528936700730627e-06, |
| "loss": 0.2203, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.6649616368286445, |
| "grad_norm": 2.201988420125249, |
| "learning_rate": 7.5115882024669375e-06, |
| "loss": 0.1944, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.6675191815856778, |
| "grad_norm": 2.560389049658497, |
| "learning_rate": 7.494199168838099e-06, |
| "loss": 0.2616, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.670076726342711, |
| "grad_norm": 2.8818559971982523, |
| "learning_rate": 7.476769880491561e-06, |
| "loss": 0.2532, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.6726342710997443, |
| "grad_norm": 2.3605699658219814, |
| "learning_rate": 7.459300618724462e-06, |
| "loss": 0.1874, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.6751918158567775, |
| "grad_norm": 3.6724541267723065, |
| "learning_rate": 7.44179166547908e-06, |
| "loss": 0.2658, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.6777493606138107, |
| "grad_norm": 2.6188857770286047, |
| "learning_rate": 7.42424330333829e-06, |
| "loss": 0.2566, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.680306905370844, |
| "grad_norm": 2.379062702399823, |
| "learning_rate": 7.406655815520998e-06, |
| "loss": 0.1988, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.6828644501278772, |
| "grad_norm": 2.623687689325637, |
| "learning_rate": 7.389029485877577e-06, |
| "loss": 0.2168, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.6854219948849105, |
| "grad_norm": 2.0590068225537355, |
| "learning_rate": 7.371364598885276e-06, |
| "loss": 0.1858, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.6879795396419437, |
| "grad_norm": 2.022043648648239, |
| "learning_rate": 7.353661439643638e-06, |
| "loss": 0.1643, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.690537084398977, |
| "grad_norm": 2.1352711030685865, |
| "learning_rate": 7.335920293869891e-06, |
| "loss": 0.1904, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6930946291560103, |
| "grad_norm": 2.1395975567372973, |
| "learning_rate": 7.318141447894344e-06, |
| "loss": 0.1301, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.6956521739130435, |
| "grad_norm": 2.358500093463299, |
| "learning_rate": 7.300325188655762e-06, |
| "loss": 0.233, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.6982097186700768, |
| "grad_norm": 2.7648498326794475, |
| "learning_rate": 7.28247180369673e-06, |
| "loss": 0.2265, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.7007672634271099, |
| "grad_norm": 3.2503327399881763, |
| "learning_rate": 7.264581581159024e-06, |
| "loss": 0.2357, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.7033248081841432, |
| "grad_norm": 2.5417867541200874, |
| "learning_rate": 7.246654809778951e-06, |
| "loss": 0.2498, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.7058823529411765, |
| "grad_norm": 2.3961400191083806, |
| "learning_rate": 7.2286917788826926e-06, |
| "loss": 0.1844, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.7084398976982097, |
| "grad_norm": 1.9891068084294126, |
| "learning_rate": 7.210692778381634e-06, |
| "loss": 0.193, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.710997442455243, |
| "grad_norm": 3.2392544755285453, |
| "learning_rate": 7.192658098767686e-06, |
| "loss": 0.2539, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.7135549872122762, |
| "grad_norm": 2.1786844692725773, |
| "learning_rate": 7.174588031108598e-06, |
| "loss": 0.1915, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.7161125319693095, |
| "grad_norm": 2.6661413865093713, |
| "learning_rate": 7.1564828670432595e-06, |
| "loss": 0.2342, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7186700767263428, |
| "grad_norm": 2.9051881715043426, |
| "learning_rate": 7.138342898776989e-06, |
| "loss": 0.2147, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.7212276214833759, |
| "grad_norm": 2.7853479622331436, |
| "learning_rate": 7.120168419076825e-06, |
| "loss": 0.2404, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.7237851662404092, |
| "grad_norm": 2.62022869243389, |
| "learning_rate": 7.101959721266798e-06, |
| "loss": 0.2654, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.7263427109974424, |
| "grad_norm": 2.8497861912336493, |
| "learning_rate": 7.083717099223192e-06, |
| "loss": 0.2363, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.7289002557544757, |
| "grad_norm": 1.927888471947111, |
| "learning_rate": 7.0654408473698084e-06, |
| "loss": 0.1749, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.731457800511509, |
| "grad_norm": 2.1417743769308846, |
| "learning_rate": 7.047131260673214e-06, |
| "loss": 0.1625, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.7340153452685422, |
| "grad_norm": 2.7428996805182893, |
| "learning_rate": 7.0287886346379755e-06, |
| "loss": 0.2409, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.7365728900255755, |
| "grad_norm": 2.882244454025018, |
| "learning_rate": 7.010413265301888e-06, |
| "loss": 0.2332, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.7391304347826086, |
| "grad_norm": 2.289617076560806, |
| "learning_rate": 6.9920054492312086e-06, |
| "loss": 0.2432, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.7416879795396419, |
| "grad_norm": 2.6660050033924674, |
| "learning_rate": 6.97356548351586e-06, |
| "loss": 0.2138, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.7442455242966752, |
| "grad_norm": 2.2449127788273984, |
| "learning_rate": 6.9550936657646386e-06, |
| "loss": 0.1914, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.7468030690537084, |
| "grad_norm": 2.578986965277836, |
| "learning_rate": 6.936590294100414e-06, |
| "loss": 0.1689, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.7493606138107417, |
| "grad_norm": 2.678033357388885, |
| "learning_rate": 6.918055667155311e-06, |
| "loss": 0.1851, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.7519181585677749, |
| "grad_norm": 2.2424142224493058, |
| "learning_rate": 6.899490084065897e-06, |
| "loss": 0.1656, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.7544757033248082, |
| "grad_norm": 2.601517395679842, |
| "learning_rate": 6.8808938444683505e-06, |
| "loss": 0.1847, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.7570332480818415, |
| "grad_norm": 2.6747405794799053, |
| "learning_rate": 6.862267248493624e-06, |
| "loss": 0.1879, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.7595907928388747, |
| "grad_norm": 2.7534418874581807, |
| "learning_rate": 6.843610596762606e-06, |
| "loss": 0.2088, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.7621483375959079, |
| "grad_norm": 2.4987719982507257, |
| "learning_rate": 6.824924190381257e-06, |
| "loss": 0.1564, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.7647058823529411, |
| "grad_norm": 3.011560955755456, |
| "learning_rate": 6.806208330935766e-06, |
| "loss": 0.2279, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.7672634271099744, |
| "grad_norm": 2.47182094190269, |
| "learning_rate": 6.7874633204876705e-06, |
| "loss": 0.1486, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7698209718670077, |
| "grad_norm": 2.325842317060422, |
| "learning_rate": 6.768689461568987e-06, |
| "loss": 0.188, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.7723785166240409, |
| "grad_norm": 1.9846873076028098, |
| "learning_rate": 6.7498870571773275e-06, |
| "loss": 0.1817, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.7749360613810742, |
| "grad_norm": 2.2567074601145594, |
| "learning_rate": 6.731056410771008e-06, |
| "loss": 0.1887, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.7774936061381074, |
| "grad_norm": 2.6561654913648347, |
| "learning_rate": 6.712197826264154e-06, |
| "loss": 0.2012, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.7800511508951407, |
| "grad_norm": 2.5575096912130033, |
| "learning_rate": 6.69331160802179e-06, |
| "loss": 0.193, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.782608695652174, |
| "grad_norm": 2.3711184342063443, |
| "learning_rate": 6.674398060854931e-06, |
| "loss": 0.1485, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.7851662404092071, |
| "grad_norm": 2.4200012568245115, |
| "learning_rate": 6.655457490015667e-06, |
| "loss": 0.2083, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.7877237851662404, |
| "grad_norm": 1.9707518284091383, |
| "learning_rate": 6.636490201192229e-06, |
| "loss": 0.1813, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.7902813299232737, |
| "grad_norm": 1.7181427371507578, |
| "learning_rate": 6.617496500504056e-06, |
| "loss": 0.0925, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.7928388746803069, |
| "grad_norm": 2.3369129146849468, |
| "learning_rate": 6.5984766944968636e-06, |
| "loss": 0.1802, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.7953964194373402, |
| "grad_norm": 2.631310750013353, |
| "learning_rate": 6.579431090137681e-06, |
| "loss": 0.195, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.7979539641943734, |
| "grad_norm": 2.8165495006266985, |
| "learning_rate": 6.560359994809916e-06, |
| "loss": 0.3059, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.8005115089514067, |
| "grad_norm": 2.5182286775381146, |
| "learning_rate": 6.541263716308375e-06, |
| "loss": 0.2114, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.80306905370844, |
| "grad_norm": 3.1928195988480477, |
| "learning_rate": 6.522142562834307e-06, |
| "loss": 0.2564, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.8056265984654731, |
| "grad_norm": 2.092609856779203, |
| "learning_rate": 6.502996842990431e-06, |
| "loss": 0.1982, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.8081841432225064, |
| "grad_norm": 2.664912965399812, |
| "learning_rate": 6.483826865775941e-06, |
| "loss": 0.2371, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.8107416879795396, |
| "grad_norm": 2.375675995524453, |
| "learning_rate": 6.46463294058154e-06, |
| "loss": 0.1792, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.8132992327365729, |
| "grad_norm": 2.6330711752053557, |
| "learning_rate": 6.445415377184427e-06, |
| "loss": 0.2179, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.8158567774936062, |
| "grad_norm": 2.377318815972245, |
| "learning_rate": 6.426174485743309e-06, |
| "loss": 0.1779, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.8184143222506394, |
| "grad_norm": 2.047002161592419, |
| "learning_rate": 6.4069105767933944e-06, |
| "loss": 0.1914, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.8209718670076727, |
| "grad_norm": 3.6604469943318816, |
| "learning_rate": 6.387623961241375e-06, |
| "loss": 0.269, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.8235294117647058, |
| "grad_norm": 2.292929507372217, |
| "learning_rate": 6.368314950360416e-06, |
| "loss": 0.1641, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.8260869565217391, |
| "grad_norm": 2.396691252809917, |
| "learning_rate": 6.348983855785122e-06, |
| "loss": 0.1481, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.8286445012787724, |
| "grad_norm": 2.223495270908185, |
| "learning_rate": 6.3296309895065215e-06, |
| "loss": 0.1846, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.8312020460358056, |
| "grad_norm": 2.77555766966037, |
| "learning_rate": 6.310256663867019e-06, |
| "loss": 0.1814, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.8337595907928389, |
| "grad_norm": 3.0829837194013336, |
| "learning_rate": 6.290861191555359e-06, |
| "loss": 0.1887, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.8363171355498721, |
| "grad_norm": 3.0603758797950085, |
| "learning_rate": 6.271444885601583e-06, |
| "loss": 0.2475, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.8388746803069054, |
| "grad_norm": 2.8775211385673667, |
| "learning_rate": 6.252008059371968e-06, |
| "loss": 0.2194, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.8414322250639387, |
| "grad_norm": 1.8794766662306952, |
| "learning_rate": 6.2325510265639785e-06, |
| "loss": 0.1518, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.8439897698209718, |
| "grad_norm": 2.7832036572553225, |
| "learning_rate": 6.213074101201202e-06, |
| "loss": 0.1865, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.8465473145780051, |
| "grad_norm": 2.3334923351434154, |
| "learning_rate": 6.193577597628268e-06, |
| "loss": 0.1868, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.8491048593350383, |
| "grad_norm": 2.02193771649545, |
| "learning_rate": 6.174061830505801e-06, |
| "loss": 0.1716, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.8516624040920716, |
| "grad_norm": 2.863073171004623, |
| "learning_rate": 6.154527114805312e-06, |
| "loss": 0.2744, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.8542199488491049, |
| "grad_norm": 2.265853554867697, |
| "learning_rate": 6.1349737658041385e-06, |
| "loss": 0.2205, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.8567774936061381, |
| "grad_norm": 2.3251872190009006, |
| "learning_rate": 6.115402099080345e-06, |
| "loss": 0.1638, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.8593350383631714, |
| "grad_norm": 2.203257515903664, |
| "learning_rate": 6.095812430507627e-06, |
| "loss": 0.1257, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.8618925831202046, |
| "grad_norm": 2.7857913202564677, |
| "learning_rate": 6.076205076250227e-06, |
| "loss": 0.2098, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.8644501278772379, |
| "grad_norm": 2.6357008969113305, |
| "learning_rate": 6.056580352757813e-06, |
| "loss": 0.2217, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.8670076726342711, |
| "grad_norm": 2.211934388483927, |
| "learning_rate": 6.036938576760388e-06, |
| "loss": 0.1638, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.8695652173913043, |
| "grad_norm": 1.8993360773251713, |
| "learning_rate": 6.0172800652631706e-06, |
| "loss": 0.1501, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8721227621483376, |
| "grad_norm": 2.6231453652485888, |
| "learning_rate": 5.997605135541472e-06, |
| "loss": 0.1895, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.8746803069053708, |
| "grad_norm": 2.5439694565298065, |
| "learning_rate": 5.977914105135594e-06, |
| "loss": 0.2258, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.8772378516624041, |
| "grad_norm": 1.9699119486573233, |
| "learning_rate": 5.9582072918456805e-06, |
| "loss": 0.1393, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.8797953964194374, |
| "grad_norm": 2.350677864064173, |
| "learning_rate": 5.938485013726612e-06, |
| "loss": 0.2093, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.8823529411764706, |
| "grad_norm": 2.134194035147575, |
| "learning_rate": 5.918747589082853e-06, |
| "loss": 0.1793, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.8849104859335039, |
| "grad_norm": 1.8936481603379223, |
| "learning_rate": 5.898995336463326e-06, |
| "loss": 0.1789, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.887468030690537, |
| "grad_norm": 2.1007267584688045, |
| "learning_rate": 5.879228574656269e-06, |
| "loss": 0.1288, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.8900255754475703, |
| "grad_norm": 3.051111060833368, |
| "learning_rate": 5.859447622684084e-06, |
| "loss": 0.2582, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.8925831202046036, |
| "grad_norm": 2.236462377355688, |
| "learning_rate": 5.839652799798197e-06, |
| "loss": 0.1863, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.8951406649616368, |
| "grad_norm": 3.1932643905276294, |
| "learning_rate": 5.819844425473899e-06, |
| "loss": 0.2649, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8976982097186701, |
| "grad_norm": 2.335957164933005, |
| "learning_rate": 5.800022819405194e-06, |
| "loss": 0.1919, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.9002557544757033, |
| "grad_norm": 2.393627657617242, |
| "learning_rate": 5.780188301499636e-06, |
| "loss": 0.2248, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.9028132992327366, |
| "grad_norm": 3.0419193632396593, |
| "learning_rate": 5.760341191873167e-06, |
| "loss": 0.2492, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.9053708439897699, |
| "grad_norm": 2.789673666006855, |
| "learning_rate": 5.740481810844952e-06, |
| "loss": 0.2148, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.907928388746803, |
| "grad_norm": 2.177256387807687, |
| "learning_rate": 5.720610478932211e-06, |
| "loss": 0.1572, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.9104859335038363, |
| "grad_norm": 2.5575844668599004, |
| "learning_rate": 5.700727516845038e-06, |
| "loss": 0.1791, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.9130434782608695, |
| "grad_norm": 2.159477204677751, |
| "learning_rate": 5.680833245481234e-06, |
| "loss": 0.2066, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.9156010230179028, |
| "grad_norm": 2.620579576653764, |
| "learning_rate": 5.660927985921122e-06, |
| "loss": 0.2152, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.9181585677749361, |
| "grad_norm": 2.4186482484527465, |
| "learning_rate": 5.641012059422369e-06, |
| "loss": 0.2289, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.9207161125319693, |
| "grad_norm": 2.47409420791087, |
| "learning_rate": 5.621085787414799e-06, |
| "loss": 0.2251, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.9232736572890026, |
| "grad_norm": 2.4391802732543675, |
| "learning_rate": 5.601149491495206e-06, |
| "loss": 0.2096, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.9258312020460358, |
| "grad_norm": 2.1522496402929763, |
| "learning_rate": 5.581203493422161e-06, |
| "loss": 0.2098, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.928388746803069, |
| "grad_norm": 2.051258819209543, |
| "learning_rate": 5.561248115110822e-06, |
| "loss": 0.1607, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.9309462915601023, |
| "grad_norm": 2.826498031720456, |
| "learning_rate": 5.541283678627742e-06, |
| "loss": 0.1787, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.9335038363171355, |
| "grad_norm": 1.9969316599555271, |
| "learning_rate": 5.521310506185661e-06, |
| "loss": 0.1489, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.9360613810741688, |
| "grad_norm": 2.718834305160498, |
| "learning_rate": 5.501328920138314e-06, |
| "loss": 0.2199, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.9386189258312021, |
| "grad_norm": 2.5390949603608517, |
| "learning_rate": 5.481339242975227e-06, |
| "loss": 0.1642, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.9411764705882353, |
| "grad_norm": 2.5326981562462043, |
| "learning_rate": 5.46134179731651e-06, |
| "loss": 0.197, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.9437340153452686, |
| "grad_norm": 2.3314348462475745, |
| "learning_rate": 5.441336905907653e-06, |
| "loss": 0.1948, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.9462915601023018, |
| "grad_norm": 2.953899718678035, |
| "learning_rate": 5.421324891614312e-06, |
| "loss": 0.2213, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.948849104859335, |
| "grad_norm": 1.8019263635252813, |
| "learning_rate": 5.4013060774171055e-06, |
| "loss": 0.1613, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.9514066496163683, |
| "grad_norm": 2.6407174472584263, |
| "learning_rate": 5.3812807864063946e-06, |
| "loss": 0.2338, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.9539641943734015, |
| "grad_norm": 2.222112415501572, |
| "learning_rate": 5.361249341777075e-06, |
| "loss": 0.1829, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.9565217391304348, |
| "grad_norm": 2.2070653031081724, |
| "learning_rate": 5.341212066823356e-06, |
| "loss": 0.1999, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.959079283887468, |
| "grad_norm": 2.7743426313075625, |
| "learning_rate": 5.321169284933543e-06, |
| "loss": 0.2185, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9616368286445013, |
| "grad_norm": 2.248190406589497, |
| "learning_rate": 5.3011213195848245e-06, |
| "loss": 0.2421, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.9641943734015346, |
| "grad_norm": 2.7068752652927213, |
| "learning_rate": 5.281068494338039e-06, |
| "loss": 0.1734, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.9667519181585678, |
| "grad_norm": 2.192826902317446, |
| "learning_rate": 5.26101113283247e-06, |
| "loss": 0.1515, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.969309462915601, |
| "grad_norm": 2.7221466363290046, |
| "learning_rate": 5.240949558780605e-06, |
| "loss": 0.1996, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.9718670076726342, |
| "grad_norm": 2.5816644472067947, |
| "learning_rate": 5.220884095962924e-06, |
| "loss": 0.2146, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9744245524296675, |
| "grad_norm": 2.6603299397494014, |
| "learning_rate": 5.200815068222666e-06, |
| "loss": 0.1963, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.9769820971867008, |
| "grad_norm": 2.043440881296807, |
| "learning_rate": 5.1807427994606065e-06, |
| "loss": 0.1299, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.979539641943734, |
| "grad_norm": 3.09946402567714, |
| "learning_rate": 5.1606676136298305e-06, |
| "loss": 0.2043, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.9820971867007673, |
| "grad_norm": 2.0882076768035605, |
| "learning_rate": 5.140589834730503e-06, |
| "loss": 0.1652, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.9846547314578005, |
| "grad_norm": 2.2528010881334164, |
| "learning_rate": 5.120509786804635e-06, |
| "loss": 0.2062, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.9872122762148338, |
| "grad_norm": 2.555764169894337, |
| "learning_rate": 5.100427793930862e-06, |
| "loss": 0.2093, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.989769820971867, |
| "grad_norm": 2.8157882383013435, |
| "learning_rate": 5.08034418021921e-06, |
| "loss": 0.2087, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.9923273657289002, |
| "grad_norm": 1.9976132604345895, |
| "learning_rate": 5.06025926980586e-06, |
| "loss": 0.1367, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.9948849104859335, |
| "grad_norm": 1.919401790892785, |
| "learning_rate": 5.040173386847926e-06, |
| "loss": 0.1615, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.9974424552429667, |
| "grad_norm": 2.262839581690866, |
| "learning_rate": 5.0200868555182155e-06, |
| "loss": 0.2022, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.153092207417964, |
| "learning_rate": 5e-06, |
| "loss": 0.172, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.0025575447570332, |
| "grad_norm": 1.5698547864039567, |
| "learning_rate": 4.979913144481785e-06, |
| "loss": 0.0923, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.0051150895140666, |
| "grad_norm": 1.7196677143098391, |
| "learning_rate": 4.959826613152074e-06, |
| "loss": 0.0905, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.0076726342710998, |
| "grad_norm": 1.6337325297432321, |
| "learning_rate": 4.939740730194141e-06, |
| "loss": 0.0937, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.010230179028133, |
| "grad_norm": 1.546693698592758, |
| "learning_rate": 4.919655819780792e-06, |
| "loss": 0.0719, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.0127877237851663, |
| "grad_norm": 1.210961794524791, |
| "learning_rate": 4.899572206069138e-06, |
| "loss": 0.0735, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.0153452685421995, |
| "grad_norm": 1.4546180756017038, |
| "learning_rate": 4.879490213195366e-06, |
| "loss": 0.0834, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.0179028132992327, |
| "grad_norm": 2.134512736663491, |
| "learning_rate": 4.8594101652694996e-06, |
| "loss": 0.1081, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.020460358056266, |
| "grad_norm": 1.7947252889220031, |
| "learning_rate": 4.839332386370171e-06, |
| "loss": 0.0797, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.0230179028132993, |
| "grad_norm": 1.4906221815114593, |
| "learning_rate": 4.819257200539394e-06, |
| "loss": 0.0809, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0255754475703325, |
| "grad_norm": 1.4752311089192027, |
| "learning_rate": 4.799184931777337e-06, |
| "loss": 0.0915, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.0281329923273657, |
| "grad_norm": 1.750489629718268, |
| "learning_rate": 4.779115904037079e-06, |
| "loss": 0.0646, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.030690537084399, |
| "grad_norm": 1.6512373102135207, |
| "learning_rate": 4.759050441219395e-06, |
| "loss": 0.0628, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.0332480818414322, |
| "grad_norm": 1.691044169536105, |
| "learning_rate": 4.738988867167531e-06, |
| "loss": 0.063, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.0358056265984654, |
| "grad_norm": 2.0322017046197156, |
| "learning_rate": 4.718931505661961e-06, |
| "loss": 0.0774, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.0383631713554988, |
| "grad_norm": 1.9548414770728504, |
| "learning_rate": 4.698878680415176e-06, |
| "loss": 0.0761, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.040920716112532, |
| "grad_norm": 1.8154231893157877, |
| "learning_rate": 4.678830715066458e-06, |
| "loss": 0.0663, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.0434782608695652, |
| "grad_norm": 1.908769325692329, |
| "learning_rate": 4.6587879331766465e-06, |
| "loss": 0.0797, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.0460358056265984, |
| "grad_norm": 1.8323259285844344, |
| "learning_rate": 4.638750658222927e-06, |
| "loss": 0.0924, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.0485933503836318, |
| "grad_norm": 1.5414803826730832, |
| "learning_rate": 4.618719213593605e-06, |
| "loss": 0.0634, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.051150895140665, |
| "grad_norm": 1.6613587791845974, |
| "learning_rate": 4.598693922582896e-06, |
| "loss": 0.0746, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.0537084398976981, |
| "grad_norm": 2.2412707215008387, |
| "learning_rate": 4.5786751083856895e-06, |
| "loss": 0.0867, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.0562659846547315, |
| "grad_norm": 1.9062228570199347, |
| "learning_rate": 4.558663094092348e-06, |
| "loss": 0.0911, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.0588235294117647, |
| "grad_norm": 2.0731838014599084, |
| "learning_rate": 4.53865820268349e-06, |
| "loss": 0.101, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.061381074168798, |
| "grad_norm": 2.269864643199699, |
| "learning_rate": 4.518660757024774e-06, |
| "loss": 0.0712, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.0639386189258313, |
| "grad_norm": 1.980016755904863, |
| "learning_rate": 4.498671079861686e-06, |
| "loss": 0.0925, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.0664961636828645, |
| "grad_norm": 1.8104325090018185, |
| "learning_rate": 4.478689493814341e-06, |
| "loss": 0.0938, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.0690537084398977, |
| "grad_norm": 2.122828708966008, |
| "learning_rate": 4.4587163213722595e-06, |
| "loss": 0.098, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.0716112531969308, |
| "grad_norm": 1.7514752364051867, |
| "learning_rate": 4.438751884889179e-06, |
| "loss": 0.0807, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.0741687979539642, |
| "grad_norm": 1.704079437046455, |
| "learning_rate": 4.41879650657784e-06, |
| "loss": 0.063, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.0767263427109974, |
| "grad_norm": 1.8407856408139254, |
| "learning_rate": 4.398850508504795e-06, |
| "loss": 0.0635, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.0792838874680306, |
| "grad_norm": 1.3786332639992318, |
| "learning_rate": 4.3789142125852015e-06, |
| "loss": 0.0627, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.081841432225064, |
| "grad_norm": 1.9012435964224539, |
| "learning_rate": 4.358987940577631e-06, |
| "loss": 0.0842, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.0843989769820972, |
| "grad_norm": 1.8321649236974789, |
| "learning_rate": 4.339072014078879e-06, |
| "loss": 0.0716, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.0869565217391304, |
| "grad_norm": 2.4106609123907665, |
| "learning_rate": 4.319166754518768e-06, |
| "loss": 0.0713, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0895140664961638, |
| "grad_norm": 2.439541112929831, |
| "learning_rate": 4.299272483154963e-06, |
| "loss": 0.0908, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.092071611253197, |
| "grad_norm": 2.2334712493856554, |
| "learning_rate": 4.27938952106779e-06, |
| "loss": 0.0882, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.0946291560102301, |
| "grad_norm": 1.2884249231332807, |
| "learning_rate": 4.259518189155049e-06, |
| "loss": 0.0575, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.0971867007672633, |
| "grad_norm": 1.9023989436254016, |
| "learning_rate": 4.2396588081268355e-06, |
| "loss": 0.0661, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.0997442455242967, |
| "grad_norm": 2.3807159026641345, |
| "learning_rate": 4.219811698500365e-06, |
| "loss": 0.1393, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.10230179028133, |
| "grad_norm": 1.511245707372203, |
| "learning_rate": 4.199977180594807e-06, |
| "loss": 0.0557, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.104859335038363, |
| "grad_norm": 2.213364286263777, |
| "learning_rate": 4.1801555745261025e-06, |
| "loss": 0.0703, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.1074168797953965, |
| "grad_norm": 1.5318293474866007, |
| "learning_rate": 4.160347200201804e-06, |
| "loss": 0.0897, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.1099744245524297, |
| "grad_norm": 1.6444289494653057, |
| "learning_rate": 4.140552377315918e-06, |
| "loss": 0.0839, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.1125319693094629, |
| "grad_norm": 1.8090163189909276, |
| "learning_rate": 4.120771425343733e-06, |
| "loss": 0.0597, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.1150895140664963, |
| "grad_norm": 1.7790317490199443, |
| "learning_rate": 4.101004663536675e-06, |
| "loss": 0.0776, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.1176470588235294, |
| "grad_norm": 1.597885560258774, |
| "learning_rate": 4.081252410917148e-06, |
| "loss": 0.0779, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.1202046035805626, |
| "grad_norm": 1.8656620983935601, |
| "learning_rate": 4.061514986273391e-06, |
| "loss": 0.0744, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.1227621483375958, |
| "grad_norm": 2.3111425176888902, |
| "learning_rate": 4.041792708154321e-06, |
| "loss": 0.1111, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.1253196930946292, |
| "grad_norm": 1.3808204885398507, |
| "learning_rate": 4.022085894864408e-06, |
| "loss": 0.0607, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.1278772378516624, |
| "grad_norm": 1.540584019757327, |
| "learning_rate": 4.0023948644585294e-06, |
| "loss": 0.0685, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.1304347826086956, |
| "grad_norm": 2.5606144468666137, |
| "learning_rate": 3.982719934736832e-06, |
| "loss": 0.0948, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.132992327365729, |
| "grad_norm": 1.4721842128661413, |
| "learning_rate": 3.963061423239612e-06, |
| "loss": 0.0813, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.1355498721227621, |
| "grad_norm": 2.0215762888887565, |
| "learning_rate": 3.943419647242189e-06, |
| "loss": 0.0865, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.1381074168797953, |
| "grad_norm": 1.698982836870589, |
| "learning_rate": 3.923794923749775e-06, |
| "loss": 0.0886, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.1406649616368287, |
| "grad_norm": 1.4422178530936527, |
| "learning_rate": 3.904187569492373e-06, |
| "loss": 0.061, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.143222506393862, |
| "grad_norm": 1.6568446058876403, |
| "learning_rate": 3.884597900919656e-06, |
| "loss": 0.072, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.145780051150895, |
| "grad_norm": 1.6692733646389666, |
| "learning_rate": 3.865026234195863e-06, |
| "loss": 0.064, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.1483375959079285, |
| "grad_norm": 1.9178605584493544, |
| "learning_rate": 3.8454728851946885e-06, |
| "loss": 0.0797, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.1508951406649617, |
| "grad_norm": 1.624572665749194, |
| "learning_rate": 3.8259381694942e-06, |
| "loss": 0.0769, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1534526854219949, |
| "grad_norm": 1.4633601037928454, |
| "learning_rate": 3.806422402371733e-06, |
| "loss": 0.0596, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.156010230179028, |
| "grad_norm": 1.6961811237423687, |
| "learning_rate": 3.786925898798801e-06, |
| "loss": 0.0779, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.1585677749360614, |
| "grad_norm": 1.3744500648620386, |
| "learning_rate": 3.767448973436021e-06, |
| "loss": 0.0666, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.1611253196930946, |
| "grad_norm": 2.179426339865248, |
| "learning_rate": 3.7479919406280334e-06, |
| "loss": 0.0841, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.1636828644501278, |
| "grad_norm": 1.816045503492125, |
| "learning_rate": 3.728555114398419e-06, |
| "loss": 0.0715, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.1662404092071612, |
| "grad_norm": 1.9881894419562507, |
| "learning_rate": 3.709138808444641e-06, |
| "loss": 0.0799, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.1687979539641944, |
| "grad_norm": 1.49466287653342, |
| "learning_rate": 3.689743336132982e-06, |
| "loss": 0.0516, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.1713554987212276, |
| "grad_norm": 1.3842976468768091, |
| "learning_rate": 3.6703690104934806e-06, |
| "loss": 0.0655, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.1739130434782608, |
| "grad_norm": 1.754609078905008, |
| "learning_rate": 3.6510161442148783e-06, |
| "loss": 0.0685, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.1764705882352942, |
| "grad_norm": 1.4943900089831539, |
| "learning_rate": 3.6316850496395863e-06, |
| "loss": 0.0726, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.1790281329923273, |
| "grad_norm": 1.5559428054805928, |
| "learning_rate": 3.6123760387586265e-06, |
| "loss": 0.0627, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.1815856777493605, |
| "grad_norm": 1.6718429584640253, |
| "learning_rate": 3.5930894232066072e-06, |
| "loss": 0.0601, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.184143222506394, |
| "grad_norm": 1.8714176452111198, |
| "learning_rate": 3.5738255142566912e-06, |
| "loss": 0.092, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.186700767263427, |
| "grad_norm": 1.6615141671047535, |
| "learning_rate": 3.5545846228155743e-06, |
| "loss": 0.0608, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.1892583120204603, |
| "grad_norm": 1.5566192447860165, |
| "learning_rate": 3.5353670594184623e-06, |
| "loss": 0.0578, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.1918158567774937, |
| "grad_norm": 2.0925827246711686, |
| "learning_rate": 3.516173134224059e-06, |
| "loss": 0.0864, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.1943734015345269, |
| "grad_norm": 1.5682869130335337, |
| "learning_rate": 3.4970031570095707e-06, |
| "loss": 0.0563, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.19693094629156, |
| "grad_norm": 1.57609443913993, |
| "learning_rate": 3.477857437165694e-06, |
| "loss": 0.0711, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.1994884910485935, |
| "grad_norm": 1.7895753498943323, |
| "learning_rate": 3.458736283691626e-06, |
| "loss": 0.0712, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.2020460358056266, |
| "grad_norm": 1.6643459994448309, |
| "learning_rate": 3.4396400051900846e-06, |
| "loss": 0.059, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.2046035805626598, |
| "grad_norm": 1.3441611085327887, |
| "learning_rate": 3.4205689098623195e-06, |
| "loss": 0.0488, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.207161125319693, |
| "grad_norm": 1.1211357993124325, |
| "learning_rate": 3.401523305503139e-06, |
| "loss": 0.0371, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.2097186700767264, |
| "grad_norm": 2.0660355539669393, |
| "learning_rate": 3.3825034994959445e-06, |
| "loss": 0.0947, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.2122762148337596, |
| "grad_norm": 1.608787114297835, |
| "learning_rate": 3.3635097988077724e-06, |
| "loss": 0.074, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.2148337595907928, |
| "grad_norm": 1.7486815265862035, |
| "learning_rate": 3.3445425099843343e-06, |
| "loss": 0.0742, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.2173913043478262, |
| "grad_norm": 1.5604908568521394, |
| "learning_rate": 3.3256019391450696e-06, |
| "loss": 0.0514, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.2199488491048593, |
| "grad_norm": 2.2122019533355983, |
| "learning_rate": 3.3066883919782116e-06, |
| "loss": 0.1137, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.2225063938618925, |
| "grad_norm": 1.8080383259958446, |
| "learning_rate": 3.287802173735848e-06, |
| "loss": 0.0709, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.2250639386189257, |
| "grad_norm": 1.6502758829199498, |
| "learning_rate": 3.268943589228992e-06, |
| "loss": 0.0648, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.227621483375959, |
| "grad_norm": 2.365736744384701, |
| "learning_rate": 3.250112942822673e-06, |
| "loss": 0.0759, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.2301790281329923, |
| "grad_norm": 2.097762143569881, |
| "learning_rate": 3.231310538431015e-06, |
| "loss": 0.0966, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.2327365728900257, |
| "grad_norm": 2.112263985795022, |
| "learning_rate": 3.212536679512332e-06, |
| "loss": 0.0801, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.2352941176470589, |
| "grad_norm": 1.6316550742720446, |
| "learning_rate": 3.1937916690642356e-06, |
| "loss": 0.0585, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.237851662404092, |
| "grad_norm": 2.051437980934914, |
| "learning_rate": 3.1750758096187446e-06, |
| "loss": 0.0954, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.2404092071611252, |
| "grad_norm": 1.6248985041395814, |
| "learning_rate": 3.1563894032373977e-06, |
| "loss": 0.0705, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.2429667519181586, |
| "grad_norm": 1.8808984876425323, |
| "learning_rate": 3.137732751506376e-06, |
| "loss": 0.0699, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.2455242966751918, |
| "grad_norm": 1.6657687470775868, |
| "learning_rate": 3.1191061555316503e-06, |
| "loss": 0.0725, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.248081841432225, |
| "grad_norm": 1.6204503721283818, |
| "learning_rate": 3.1005099159341044e-06, |
| "loss": 0.0705, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.2506393861892584, |
| "grad_norm": 1.7946799973401157, |
| "learning_rate": 3.08194433284469e-06, |
| "loss": 0.0939, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.2531969309462916, |
| "grad_norm": 1.5436173873028685, |
| "learning_rate": 3.0634097058995877e-06, |
| "loss": 0.0642, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.2557544757033248, |
| "grad_norm": 2.5025377910535602, |
| "learning_rate": 3.0449063342353635e-06, |
| "loss": 0.089, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.258312020460358, |
| "grad_norm": 1.846350151404866, |
| "learning_rate": 3.0264345164841426e-06, |
| "loss": 0.0797, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.2608695652173914, |
| "grad_norm": 1.4014902867742467, |
| "learning_rate": 3.007994550768793e-06, |
| "loss": 0.0529, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.2634271099744245, |
| "grad_norm": 1.749293050973978, |
| "learning_rate": 2.989586734698113e-06, |
| "loss": 0.066, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.265984654731458, |
| "grad_norm": 1.677143321270154, |
| "learning_rate": 2.971211365362028e-06, |
| "loss": 0.0717, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.2685421994884911, |
| "grad_norm": 1.4865811657524735, |
| "learning_rate": 2.9528687393267865e-06, |
| "loss": 0.0746, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.2710997442455243, |
| "grad_norm": 1.581847266608915, |
| "learning_rate": 2.934559152630192e-06, |
| "loss": 0.0732, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.2736572890025575, |
| "grad_norm": 1.7806387171278764, |
| "learning_rate": 2.9162829007768103e-06, |
| "loss": 0.0919, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.2762148337595907, |
| "grad_norm": 1.5745970726671847, |
| "learning_rate": 2.898040278733203e-06, |
| "loss": 0.0719, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.278772378516624, |
| "grad_norm": 1.6462836730413777, |
| "learning_rate": 2.879831580923176e-06, |
| "loss": 0.0652, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.278772378516624, |
| "eval_loss": 0.21273775398731232, |
| "eval_runtime": 4.5782, |
| "eval_samples_per_second": 6.99, |
| "eval_steps_per_second": 1.747, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2813299232736572, |
| "grad_norm": 1.943689809670517, |
| "learning_rate": 2.8616571012230134e-06, |
| "loss": 0.0896, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.2838874680306906, |
| "grad_norm": 1.7108775240717138, |
| "learning_rate": 2.843517132956742e-06, |
| "loss": 0.0829, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.2864450127877238, |
| "grad_norm": 1.5745418008984486, |
| "learning_rate": 2.8254119688914017e-06, |
| "loss": 0.0761, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.289002557544757, |
| "grad_norm": 1.5315465963046555, |
| "learning_rate": 2.8073419012323154e-06, |
| "loss": 0.0798, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.2915601023017902, |
| "grad_norm": 1.5085834635172504, |
| "learning_rate": 2.789307221618369e-06, |
| "loss": 0.0696, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.2941176470588236, |
| "grad_norm": 2.0317808899575933, |
| "learning_rate": 2.771308221117309e-06, |
| "loss": 0.1059, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.2966751918158568, |
| "grad_norm": 1.4613005775169157, |
| "learning_rate": 2.7533451902210512e-06, |
| "loss": 0.0603, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.29923273657289, |
| "grad_norm": 1.8605111151110105, |
| "learning_rate": 2.7354184188409773e-06, |
| "loss": 0.0888, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.3017902813299234, |
| "grad_norm": 1.617241995750445, |
| "learning_rate": 2.71752819630327e-06, |
| "loss": 0.0737, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.3043478260869565, |
| "grad_norm": 1.8548161748439362, |
| "learning_rate": 2.6996748113442397e-06, |
| "loss": 0.0663, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.3069053708439897, |
| "grad_norm": 1.8033275920362444, |
| "learning_rate": 2.6818585521056573e-06, |
| "loss": 0.0704, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.309462915601023, |
| "grad_norm": 1.8578383533525582, |
| "learning_rate": 2.66407970613011e-06, |
| "loss": 0.0754, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.3120204603580563, |
| "grad_norm": 1.6455284536390185, |
| "learning_rate": 2.646338560356363e-06, |
| "loss": 0.0545, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.3145780051150895, |
| "grad_norm": 1.6433378276314705, |
| "learning_rate": 2.6286354011147252e-06, |
| "loss": 0.063, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.317135549872123, |
| "grad_norm": 2.305871703667012, |
| "learning_rate": 2.6109705141224255e-06, |
| "loss": 0.1028, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.319693094629156, |
| "grad_norm": 1.5367647731836724, |
| "learning_rate": 2.593344184479003e-06, |
| "loss": 0.066, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.3222506393861893, |
| "grad_norm": 1.6097087447086145, |
| "learning_rate": 2.575756696661713e-06, |
| "loss": 0.0582, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.3248081841432224, |
| "grad_norm": 1.214868677490465, |
| "learning_rate": 2.5582083345209217e-06, |
| "loss": 0.057, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.3273657289002558, |
| "grad_norm": 1.7532789333330783, |
| "learning_rate": 2.540699381275539e-06, |
| "loss": 0.0784, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.329923273657289, |
| "grad_norm": 1.535396007238416, |
| "learning_rate": 2.5232301195084395e-06, |
| "loss": 0.061, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.3324808184143222, |
| "grad_norm": 1.961040557523754, |
| "learning_rate": 2.5058008311619035e-06, |
| "loss": 0.0774, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.3350383631713556, |
| "grad_norm": 1.6548886187642289, |
| "learning_rate": 2.488411797533064e-06, |
| "loss": 0.0497, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.3375959079283888, |
| "grad_norm": 1.8616891226892944, |
| "learning_rate": 2.4710632992693737e-06, |
| "loss": 0.0664, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.340153452685422, |
| "grad_norm": 1.522818582149643, |
| "learning_rate": 2.4537556163640726e-06, |
| "loss": 0.0642, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.3427109974424551, |
| "grad_norm": 1.9564770020434565, |
| "learning_rate": 2.4364890281516633e-06, |
| "loss": 0.0815, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.3452685421994885, |
| "grad_norm": 1.1715877168427582, |
| "learning_rate": 2.4192638133034074e-06, |
| "loss": 0.0435, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.3478260869565217, |
| "grad_norm": 1.3873917483817966, |
| "learning_rate": 2.4020802498228333e-06, |
| "loss": 0.0573, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.350383631713555, |
| "grad_norm": 1.0503749788006405, |
| "learning_rate": 2.384938615041238e-06, |
| "loss": 0.0497, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.3529411764705883, |
| "grad_norm": 1.7078053655214958, |
| "learning_rate": 2.3678391856132203e-06, |
| "loss": 0.0658, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.3554987212276215, |
| "grad_norm": 1.233203753875264, |
| "learning_rate": 2.350782237512215e-06, |
| "loss": 0.0471, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.3580562659846547, |
| "grad_norm": 1.5492283920850878, |
| "learning_rate": 2.3337680460260314e-06, |
| "loss": 0.0606, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.3606138107416879, |
| "grad_norm": 2.42405809855206, |
| "learning_rate": 2.316796885752415e-06, |
| "loss": 0.108, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.3631713554987213, |
| "grad_norm": 2.0245811204612516, |
| "learning_rate": 2.299869030594622e-06, |
| "loss": 0.1002, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.3657289002557544, |
| "grad_norm": 1.0435760500681248, |
| "learning_rate": 2.2829847537569904e-06, |
| "loss": 0.0351, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.3682864450127878, |
| "grad_norm": 2.2546431033682985, |
| "learning_rate": 2.266144327740531e-06, |
| "loss": 0.0926, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.370843989769821, |
| "grad_norm": 2.155804482532426, |
| "learning_rate": 2.2493480243385298e-06, |
| "loss": 0.0832, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.3734015345268542, |
| "grad_norm": 1.7587243734253648, |
| "learning_rate": 2.2325961146321683e-06, |
| "loss": 0.0744, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.3759590792838874, |
| "grad_norm": 1.9603939445396372, |
| "learning_rate": 2.2158888689861434e-06, |
| "loss": 0.0742, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.3785166240409208, |
| "grad_norm": 1.473096805809632, |
| "learning_rate": 2.1992265570442974e-06, |
| "loss": 0.0706, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.381074168797954, |
| "grad_norm": 1.8442896619919424, |
| "learning_rate": 2.182609447725279e-06, |
| "loss": 0.0885, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.3836317135549872, |
| "grad_norm": 1.8191363677733596, |
| "learning_rate": 2.1660378092181935e-06, |
| "loss": 0.074, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.3861892583120206, |
| "grad_norm": 1.9074363059029638, |
| "learning_rate": 2.149511908978275e-06, |
| "loss": 0.0746, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.3887468030690537, |
| "grad_norm": 1.910812757836058, |
| "learning_rate": 2.1330320137225773e-06, |
| "loss": 0.0956, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.391304347826087, |
| "grad_norm": 1.8198118504923657, |
| "learning_rate": 2.1165983894256647e-06, |
| "loss": 0.091, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.39386189258312, |
| "grad_norm": 1.771534594356425, |
| "learning_rate": 2.100211301315315e-06, |
| "loss": 0.0624, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.3964194373401535, |
| "grad_norm": 1.5960742782667032, |
| "learning_rate": 2.0838710138682412e-06, |
| "loss": 0.0649, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.3989769820971867, |
| "grad_norm": 1.4352565996218818, |
| "learning_rate": 2.0675777908058307e-06, |
| "loss": 0.0651, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.40153452685422, |
| "grad_norm": 1.8453025336140658, |
| "learning_rate": 2.051331895089882e-06, |
| "loss": 0.0856, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.4040920716112533, |
| "grad_norm": 1.7879301257977294, |
| "learning_rate": 2.035133588918356e-06, |
| "loss": 0.0564, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.4066496163682864, |
| "grad_norm": 1.055074940010279, |
| "learning_rate": 2.0189831337211573e-06, |
| "loss": 0.0349, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.4092071611253196, |
| "grad_norm": 1.7438163522982681, |
| "learning_rate": 2.0028807901559027e-06, |
| "loss": 0.0776, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.4117647058823528, |
| "grad_norm": 1.5139546840336329, |
| "learning_rate": 1.9868268181037186e-06, |
| "loss": 0.0754, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.4143222506393862, |
| "grad_norm": 1.79660968193301, |
| "learning_rate": 1.970821476665051e-06, |
| "loss": 0.0693, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.4168797953964194, |
| "grad_norm": 1.396783128713414, |
| "learning_rate": 1.9548650241554812e-06, |
| "loss": 0.0559, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.4194373401534528, |
| "grad_norm": 1.6817944787104224, |
| "learning_rate": 1.9389577181015496e-06, |
| "loss": 0.0532, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.421994884910486, |
| "grad_norm": 1.3959759855353526, |
| "learning_rate": 1.923099815236608e-06, |
| "loss": 0.053, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.4245524296675192, |
| "grad_norm": 1.266582979888415, |
| "learning_rate": 1.9072915714966761e-06, |
| "loss": 0.048, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.4271099744245523, |
| "grad_norm": 1.8765331165368473, |
| "learning_rate": 1.8915332420163074e-06, |
| "loss": 0.0633, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.4296675191815857, |
| "grad_norm": 1.2977035796306693, |
| "learning_rate": 1.8758250811244682e-06, |
| "loss": 0.0509, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.432225063938619, |
| "grad_norm": 1.4167349873272226, |
| "learning_rate": 1.8601673423404449e-06, |
| "loss": 0.0645, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.434782608695652, |
| "grad_norm": 1.8235368661900502, |
| "learning_rate": 1.8445602783697375e-06, |
| "loss": 0.0668, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.4373401534526855, |
| "grad_norm": 1.5509277107583779, |
| "learning_rate": 1.8290041410999893e-06, |
| "loss": 0.077, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.4398976982097187, |
| "grad_norm": 1.5375878611055105, |
| "learning_rate": 1.8134991815969238e-06, |
| "loss": 0.0673, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.4424552429667519, |
| "grad_norm": 1.9051540404989096, |
| "learning_rate": 1.798045650100289e-06, |
| "loss": 0.0709, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.445012787723785, |
| "grad_norm": 2.141394685521614, |
| "learning_rate": 1.782643796019814e-06, |
| "loss": 0.072, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.4475703324808185, |
| "grad_norm": 1.7999776365108109, |
| "learning_rate": 1.7672938679311957e-06, |
| "loss": 0.0764, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.4501278772378516, |
| "grad_norm": 1.8863953126395598, |
| "learning_rate": 1.7519961135720737e-06, |
| "loss": 0.0754, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.452685421994885, |
| "grad_norm": 1.7560519814030673, |
| "learning_rate": 1.736750779838044e-06, |
| "loss": 0.0712, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.4552429667519182, |
| "grad_norm": 1.5334124499018023, |
| "learning_rate": 1.7215581127786624e-06, |
| "loss": 0.0639, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.4578005115089514, |
| "grad_norm": 1.364571623755127, |
| "learning_rate": 1.7064183575934856e-06, |
| "loss": 0.0598, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.4603580562659846, |
| "grad_norm": 1.358357570743382, |
| "learning_rate": 1.6913317586281048e-06, |
| "loss": 0.0587, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.4629156010230178, |
| "grad_norm": 2.2006102667445413, |
| "learning_rate": 1.676298559370202e-06, |
| "loss": 0.098, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.4654731457800512, |
| "grad_norm": 1.885231575108773, |
| "learning_rate": 1.6613190024456293e-06, |
| "loss": 0.0756, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.4680306905370843, |
| "grad_norm": 1.8937798233386698, |
| "learning_rate": 1.6463933296144863e-06, |
| "loss": 0.0645, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.4705882352941178, |
| "grad_norm": 2.079267842097328, |
| "learning_rate": 1.6315217817672142e-06, |
| "loss": 0.0794, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.473145780051151, |
| "grad_norm": 2.0372550296739087, |
| "learning_rate": 1.6167045989207185e-06, |
| "loss": 0.0877, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.4757033248081841, |
| "grad_norm": 1.6766306067310754, |
| "learning_rate": 1.6019420202144853e-06, |
| "loss": 0.0403, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.4782608695652173, |
| "grad_norm": 1.6282511096736187, |
| "learning_rate": 1.5872342839067305e-06, |
| "loss": 0.0533, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.4808184143222507, |
| "grad_norm": 1.3859280744992961, |
| "learning_rate": 1.5725816273705453e-06, |
| "loss": 0.0578, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.4833759590792839, |
| "grad_norm": 1.1143022475391577, |
| "learning_rate": 1.5579842870900746e-06, |
| "loss": 0.0376, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.485933503836317, |
| "grad_norm": 1.529711981151928, |
| "learning_rate": 1.5434424986566938e-06, |
| "loss": 0.0953, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.4884910485933505, |
| "grad_norm": 1.27023062886774, |
| "learning_rate": 1.5289564967652033e-06, |
| "loss": 0.0456, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.4910485933503836, |
| "grad_norm": 1.6016588716041684, |
| "learning_rate": 1.5145265152100574e-06, |
| "loss": 0.0695, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.4936061381074168, |
| "grad_norm": 1.7027552802851214, |
| "learning_rate": 1.5001527868815702e-06, |
| "loss": 0.075, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.49616368286445, |
| "grad_norm": 2.087105079353017, |
| "learning_rate": 1.4858355437621663e-06, |
| "loss": 0.0843, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.4987212276214834, |
| "grad_norm": 1.5515159664184583, |
| "learning_rate": 1.4715750169226417e-06, |
| "loss": 0.0746, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.5012787723785166, |
| "grad_norm": 1.6199284724546499, |
| "learning_rate": 1.457371436518424e-06, |
| "loss": 0.06, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.50383631713555, |
| "grad_norm": 1.5277517404992678, |
| "learning_rate": 1.4432250317858675e-06, |
| "loss": 0.0729, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.5063938618925832, |
| "grad_norm": 2.3550500012049733, |
| "learning_rate": 1.4291360310385455e-06, |
| "loss": 0.0633, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.5089514066496164, |
| "grad_norm": 1.7295044529207286, |
| "learning_rate": 1.4151046616635727e-06, |
| "loss": 0.0671, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.5115089514066495, |
| "grad_norm": 1.894949015332874, |
| "learning_rate": 1.4011311501179287e-06, |
| "loss": 0.0804, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.5140664961636827, |
| "grad_norm": 2.121021767643575, |
| "learning_rate": 1.3872157219248045e-06, |
| "loss": 0.0717, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.5166240409207161, |
| "grad_norm": 1.6430284408399045, |
| "learning_rate": 1.373358601669973e-06, |
| "loss": 0.053, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.5191815856777495, |
| "grad_norm": 1.526803613192238, |
| "learning_rate": 1.3595600129981469e-06, |
| "loss": 0.0638, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.5217391304347827, |
| "grad_norm": 1.0212528309323259, |
| "learning_rate": 1.3458201786093795e-06, |
| "loss": 0.0416, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.5242966751918159, |
| "grad_norm": 1.9219415286349144, |
| "learning_rate": 1.3321393202554739e-06, |
| "loss": 0.0834, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.526854219948849, |
| "grad_norm": 2.080278099746331, |
| "learning_rate": 1.3185176587363919e-06, |
| "loss": 0.0882, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.5294117647058822, |
| "grad_norm": 1.7816231414266257, |
| "learning_rate": 1.3049554138967052e-06, |
| "loss": 0.0582, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.5319693094629157, |
| "grad_norm": 1.889256583497343, |
| "learning_rate": 1.2914528046220332e-06, |
| "loss": 0.0695, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.5345268542199488, |
| "grad_norm": 1.1550477281276221, |
| "learning_rate": 1.278010048835523e-06, |
| "loss": 0.0333, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5370843989769822, |
| "grad_norm": 1.4939337260656969, |
| "learning_rate": 1.2646273634943195e-06, |
| "loss": 0.0725, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.5396419437340154, |
| "grad_norm": 1.4169987914491964, |
| "learning_rate": 1.2513049645860759e-06, |
| "loss": 0.0412, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.5421994884910486, |
| "grad_norm": 1.7381273646988493, |
| "learning_rate": 1.2380430671254618e-06, |
| "loss": 0.0604, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.5447570332480818, |
| "grad_norm": 1.606050132274542, |
| "learning_rate": 1.224841885150691e-06, |
| "loss": 0.0654, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.547314578005115, |
| "grad_norm": 1.9353777885376064, |
| "learning_rate": 1.2117016317200702e-06, |
| "loss": 0.0999, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.5498721227621484, |
| "grad_norm": 1.4204697537593365, |
| "learning_rate": 1.1986225189085627e-06, |
| "loss": 0.0503, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.5524296675191815, |
| "grad_norm": 1.7779994619812158, |
| "learning_rate": 1.185604757804359e-06, |
| "loss": 0.0648, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.554987212276215, |
| "grad_norm": 1.8729532913873508, |
| "learning_rate": 1.172648558505477e-06, |
| "loss": 0.0621, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.5575447570332481, |
| "grad_norm": 1.5939978689407244, |
| "learning_rate": 1.1597541301163655e-06, |
| "loss": 0.0616, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.5601023017902813, |
| "grad_norm": 1.6805775276124237, |
| "learning_rate": 1.1469216807445348e-06, |
| "loss": 0.0584, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.5626598465473145, |
| "grad_norm": 1.8194848984443412, |
| "learning_rate": 1.1341514174971907e-06, |
| "loss": 0.0632, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.5652173913043477, |
| "grad_norm": 1.6220328579883132, |
| "learning_rate": 1.1214435464779006e-06, |
| "loss": 0.0706, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.567774936061381, |
| "grad_norm": 1.9488536146970024, |
| "learning_rate": 1.1087982727832613e-06, |
| "loss": 0.1119, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.5703324808184145, |
| "grad_norm": 2.2215510632895907, |
| "learning_rate": 1.0962158004995893e-06, |
| "loss": 0.097, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.5728900255754477, |
| "grad_norm": 1.3539194267902872, |
| "learning_rate": 1.083696332699628e-06, |
| "loss": 0.0474, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.5754475703324808, |
| "grad_norm": 1.0447972180103597, |
| "learning_rate": 1.0712400714392723e-06, |
| "loss": 0.0349, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.578005115089514, |
| "grad_norm": 1.7820783836872298, |
| "learning_rate": 1.058847217754303e-06, |
| "loss": 0.0574, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.5805626598465472, |
| "grad_norm": 1.2726641899295212, |
| "learning_rate": 1.0465179716571467e-06, |
| "loss": 0.0391, |
| "step": 618 |
| }, |
| { |
| "epoch": 1.5831202046035806, |
| "grad_norm": 2.1713194520674297, |
| "learning_rate": 1.034252532133646e-06, |
| "loss": 0.0564, |
| "step": 619 |
| }, |
| { |
| "epoch": 1.5856777493606138, |
| "grad_norm": 1.3422351008724465, |
| "learning_rate": 1.0220510971398473e-06, |
| "loss": 0.0582, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.5882352941176472, |
| "grad_norm": 1.9326774960679987, |
| "learning_rate": 1.0099138635988026e-06, |
| "loss": 0.0679, |
| "step": 621 |
| }, |
| { |
| "epoch": 1.5907928388746804, |
| "grad_norm": 1.4277862321292778, |
| "learning_rate": 9.978410273974015e-07, |
| "loss": 0.0578, |
| "step": 622 |
| }, |
| { |
| "epoch": 1.5933503836317136, |
| "grad_norm": 1.84261949025278, |
| "learning_rate": 9.858327833832004e-07, |
| "loss": 0.0638, |
| "step": 623 |
| }, |
| { |
| "epoch": 1.5959079283887467, |
| "grad_norm": 1.5750246594688215, |
| "learning_rate": 9.738893253612808e-07, |
| "loss": 0.0646, |
| "step": 624 |
| }, |
| { |
| "epoch": 1.59846547314578, |
| "grad_norm": 1.7918813630008548, |
| "learning_rate": 9.620108460911181e-07, |
| "loss": 0.0575, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.6010230179028133, |
| "grad_norm": 1.582528596444316, |
| "learning_rate": 9.50197537283481e-07, |
| "loss": 0.0643, |
| "step": 626 |
| }, |
| { |
| "epoch": 1.6035805626598465, |
| "grad_norm": 1.858978097610581, |
| "learning_rate": 9.384495895973227e-07, |
| "loss": 0.0775, |
| "step": 627 |
| }, |
| { |
| "epoch": 1.60613810741688, |
| "grad_norm": 1.9376171131571154, |
| "learning_rate": 9.267671926367166e-07, |
| "loss": 0.0682, |
| "step": 628 |
| }, |
| { |
| "epoch": 1.608695652173913, |
| "grad_norm": 1.6529180966681518, |
| "learning_rate": 9.151505349477901e-07, |
| "loss": 0.0749, |
| "step": 629 |
| }, |
| { |
| "epoch": 1.6112531969309463, |
| "grad_norm": 1.545032973496891, |
| "learning_rate": 9.035998040156801e-07, |
| "loss": 0.0653, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.6138107416879794, |
| "grad_norm": 1.4873821155805664, |
| "learning_rate": 8.921151862615091e-07, |
| "loss": 0.0522, |
| "step": 631 |
| }, |
| { |
| "epoch": 1.6163682864450126, |
| "grad_norm": 1.9828594885163102, |
| "learning_rate": 8.806968670393801e-07, |
| "loss": 0.0881, |
| "step": 632 |
| }, |
| { |
| "epoch": 1.618925831202046, |
| "grad_norm": 1.5428881405958734, |
| "learning_rate": 8.693450306333818e-07, |
| "loss": 0.0596, |
| "step": 633 |
| }, |
| { |
| "epoch": 1.6214833759590794, |
| "grad_norm": 1.9526142345186388, |
| "learning_rate": 8.580598602546109e-07, |
| "loss": 0.0713, |
| "step": 634 |
| }, |
| { |
| "epoch": 1.6240409207161126, |
| "grad_norm": 1.2159178790910155, |
| "learning_rate": 8.4684153803822e-07, |
| "loss": 0.0437, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.6265984654731458, |
| "grad_norm": 1.5525955368175755, |
| "learning_rate": 8.356902450404792e-07, |
| "loss": 0.0553, |
| "step": 636 |
| }, |
| { |
| "epoch": 1.629156010230179, |
| "grad_norm": 1.4526232993831398, |
| "learning_rate": 8.246061612358475e-07, |
| "loss": 0.0607, |
| "step": 637 |
| }, |
| { |
| "epoch": 1.6317135549872122, |
| "grad_norm": 1.623054202144306, |
| "learning_rate": 8.135894655140758e-07, |
| "loss": 0.0614, |
| "step": 638 |
| }, |
| { |
| "epoch": 1.6342710997442456, |
| "grad_norm": 2.020789659063834, |
| "learning_rate": 8.026403356773161e-07, |
| "loss": 0.0662, |
| "step": 639 |
| }, |
| { |
| "epoch": 1.6368286445012787, |
| "grad_norm": 2.0131654709879756, |
| "learning_rate": 7.91758948437249e-07, |
| "loss": 0.0641, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.6393861892583121, |
| "grad_norm": 1.5241049366793433, |
| "learning_rate": 7.809454794122346e-07, |
| "loss": 0.0676, |
| "step": 641 |
| }, |
| { |
| "epoch": 1.6419437340153453, |
| "grad_norm": 1.5072894320854298, |
| "learning_rate": 7.702001031244816e-07, |
| "loss": 0.0613, |
| "step": 642 |
| }, |
| { |
| "epoch": 1.6445012787723785, |
| "grad_norm": 1.3149510012931023, |
| "learning_rate": 7.595229929972253e-07, |
| "loss": 0.066, |
| "step": 643 |
| }, |
| { |
| "epoch": 1.6470588235294117, |
| "grad_norm": 1.354258203231293, |
| "learning_rate": 7.489143213519301e-07, |
| "loss": 0.0409, |
| "step": 644 |
| }, |
| { |
| "epoch": 1.6496163682864449, |
| "grad_norm": 1.394303666622696, |
| "learning_rate": 7.383742594055077e-07, |
| "loss": 0.0616, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.6521739130434783, |
| "grad_norm": 1.9374972519378002, |
| "learning_rate": 7.279029772675572e-07, |
| "loss": 0.082, |
| "step": 646 |
| }, |
| { |
| "epoch": 1.6547314578005117, |
| "grad_norm": 1.6243040582047277, |
| "learning_rate": 7.17500643937617e-07, |
| "loss": 0.0498, |
| "step": 647 |
| }, |
| { |
| "epoch": 1.6572890025575449, |
| "grad_norm": 1.6392303232363044, |
| "learning_rate": 7.071674273024353e-07, |
| "loss": 0.0634, |
| "step": 648 |
| }, |
| { |
| "epoch": 1.659846547314578, |
| "grad_norm": 2.139421270837991, |
| "learning_rate": 6.969034941332664e-07, |
| "loss": 0.1057, |
| "step": 649 |
| }, |
| { |
| "epoch": 1.6624040920716112, |
| "grad_norm": 1.3581405578159422, |
| "learning_rate": 6.86709010083172e-07, |
| "loss": 0.0392, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.6649616368286444, |
| "grad_norm": 1.675037519291502, |
| "learning_rate": 6.765841396843514e-07, |
| "loss": 0.0526, |
| "step": 651 |
| }, |
| { |
| "epoch": 1.6675191815856778, |
| "grad_norm": 2.0592421439000135, |
| "learning_rate": 6.665290463454882e-07, |
| "loss": 0.0733, |
| "step": 652 |
| }, |
| { |
| "epoch": 1.670076726342711, |
| "grad_norm": 2.3725190689778834, |
| "learning_rate": 6.565438923491102e-07, |
| "loss": 0.085, |
| "step": 653 |
| }, |
| { |
| "epoch": 1.6726342710997444, |
| "grad_norm": 1.5665070937066485, |
| "learning_rate": 6.466288388489689e-07, |
| "loss": 0.0741, |
| "step": 654 |
| }, |
| { |
| "epoch": 1.6751918158567776, |
| "grad_norm": 1.5211024774946007, |
| "learning_rate": 6.367840458674401e-07, |
| "loss": 0.0662, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.6777493606138107, |
| "grad_norm": 1.4224852306974154, |
| "learning_rate": 6.270096722929442e-07, |
| "loss": 0.0465, |
| "step": 656 |
| }, |
| { |
| "epoch": 1.680306905370844, |
| "grad_norm": 1.6096574471315697, |
| "learning_rate": 6.173058758773775e-07, |
| "loss": 0.0635, |
| "step": 657 |
| }, |
| { |
| "epoch": 1.682864450127877, |
| "grad_norm": 1.7328118872108653, |
| "learning_rate": 6.076728132335669e-07, |
| "loss": 0.0536, |
| "step": 658 |
| }, |
| { |
| "epoch": 1.6854219948849105, |
| "grad_norm": 2.0081885289199004, |
| "learning_rate": 5.981106398327463e-07, |
| "loss": 0.0762, |
| "step": 659 |
| }, |
| { |
| "epoch": 1.6879795396419437, |
| "grad_norm": 1.7928585732400393, |
| "learning_rate": 5.886195100020408e-07, |
| "loss": 0.0775, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.690537084398977, |
| "grad_norm": 1.7653788413306644, |
| "learning_rate": 5.7919957692198e-07, |
| "loss": 0.0553, |
| "step": 661 |
| }, |
| { |
| "epoch": 1.6930946291560103, |
| "grad_norm": 1.4481488430178222, |
| "learning_rate": 5.698509926240275e-07, |
| "loss": 0.0647, |
| "step": 662 |
| }, |
| { |
| "epoch": 1.6956521739130435, |
| "grad_norm": 1.7839607218624998, |
| "learning_rate": 5.60573907988124e-07, |
| "loss": 0.0784, |
| "step": 663 |
| }, |
| { |
| "epoch": 1.6982097186700766, |
| "grad_norm": 1.3127529412098409, |
| "learning_rate": 5.513684727402529e-07, |
| "loss": 0.0444, |
| "step": 664 |
| }, |
| { |
| "epoch": 1.7007672634271098, |
| "grad_norm": 1.3869543812328162, |
| "learning_rate": 5.422348354500217e-07, |
| "loss": 0.0516, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.7033248081841432, |
| "grad_norm": 1.6005455194774372, |
| "learning_rate": 5.331731435282705e-07, |
| "loss": 0.0488, |
| "step": 666 |
| }, |
| { |
| "epoch": 1.7058823529411766, |
| "grad_norm": 1.8517881383130126, |
| "learning_rate": 5.241835432246888e-07, |
| "loss": 0.0851, |
| "step": 667 |
| }, |
| { |
| "epoch": 1.7084398976982098, |
| "grad_norm": 1.4850618619837235, |
| "learning_rate": 5.152661796254505e-07, |
| "loss": 0.0447, |
| "step": 668 |
| }, |
| { |
| "epoch": 1.710997442455243, |
| "grad_norm": 2.1553630250764884, |
| "learning_rate": 5.064211966508837e-07, |
| "loss": 0.0595, |
| "step": 669 |
| }, |
| { |
| "epoch": 1.7135549872122762, |
| "grad_norm": 1.5588219606375748, |
| "learning_rate": 4.976487370531352e-07, |
| "loss": 0.0621, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.7161125319693094, |
| "grad_norm": 2.089144802560461, |
| "learning_rate": 4.88948942413876e-07, |
| "loss": 0.0858, |
| "step": 671 |
| }, |
| { |
| "epoch": 1.7186700767263428, |
| "grad_norm": 1.7907366729470824, |
| "learning_rate": 4.803219531420128e-07, |
| "loss": 0.0585, |
| "step": 672 |
| }, |
| { |
| "epoch": 1.721227621483376, |
| "grad_norm": 1.6065952810269382, |
| "learning_rate": 4.717679084714222e-07, |
| "loss": 0.0469, |
| "step": 673 |
| }, |
| { |
| "epoch": 1.7237851662404093, |
| "grad_norm": 1.391081341252637, |
| "learning_rate": 4.6328694645870254e-07, |
| "loss": 0.0621, |
| "step": 674 |
| }, |
| { |
| "epoch": 1.7263427109974425, |
| "grad_norm": 1.7591705981665215, |
| "learning_rate": 4.5487920398094465e-07, |
| "loss": 0.0688, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.7289002557544757, |
| "grad_norm": 1.3688117618719597, |
| "learning_rate": 4.46544816733529e-07, |
| "loss": 0.0516, |
| "step": 676 |
| }, |
| { |
| "epoch": 1.7314578005115089, |
| "grad_norm": 1.5369125245891289, |
| "learning_rate": 4.382839192279303e-07, |
| "loss": 0.068, |
| "step": 677 |
| }, |
| { |
| "epoch": 1.734015345268542, |
| "grad_norm": 1.4485510331681306, |
| "learning_rate": 4.3009664478954384e-07, |
| "loss": 0.0438, |
| "step": 678 |
| }, |
| { |
| "epoch": 1.7365728900255755, |
| "grad_norm": 1.9885788988160304, |
| "learning_rate": 4.219831255555423e-07, |
| "loss": 0.0673, |
| "step": 679 |
| }, |
| { |
| "epoch": 1.7391304347826086, |
| "grad_norm": 1.3795869342736704, |
| "learning_rate": 4.139434924727359e-07, |
| "loss": 0.0442, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.741687979539642, |
| "grad_norm": 1.8321446740987333, |
| "learning_rate": 4.059778752954607e-07, |
| "loss": 0.064, |
| "step": 681 |
| }, |
| { |
| "epoch": 1.7442455242966752, |
| "grad_norm": 1.6108426141916128, |
| "learning_rate": 3.9808640258348686e-07, |
| "loss": 0.0561, |
| "step": 682 |
| }, |
| { |
| "epoch": 1.7468030690537084, |
| "grad_norm": 1.5359072620188055, |
| "learning_rate": 3.9026920169994374e-07, |
| "loss": 0.0675, |
| "step": 683 |
| }, |
| { |
| "epoch": 1.7493606138107416, |
| "grad_norm": 1.7008119887557362, |
| "learning_rate": 3.825263988092587e-07, |
| "loss": 0.0671, |
| "step": 684 |
| }, |
| { |
| "epoch": 1.7519181585677748, |
| "grad_norm": 1.471165980148968, |
| "learning_rate": 3.7485811887512714e-07, |
| "loss": 0.0607, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.7544757033248082, |
| "grad_norm": 1.4713436991190307, |
| "learning_rate": 3.672644856584928e-07, |
| "loss": 0.0593, |
| "step": 686 |
| }, |
| { |
| "epoch": 1.7570332480818416, |
| "grad_norm": 1.8321987965717021, |
| "learning_rate": 3.597456217155526e-07, |
| "loss": 0.0503, |
| "step": 687 |
| }, |
| { |
| "epoch": 1.7595907928388748, |
| "grad_norm": 1.4654951355310737, |
| "learning_rate": 3.523016483957742e-07, |
| "loss": 0.0661, |
| "step": 688 |
| }, |
| { |
| "epoch": 1.762148337595908, |
| "grad_norm": 1.6318489446677316, |
| "learning_rate": 3.4493268583994434e-07, |
| "loss": 0.0595, |
| "step": 689 |
| }, |
| { |
| "epoch": 1.7647058823529411, |
| "grad_norm": 1.2394295437052634, |
| "learning_rate": 3.3763885297822153e-07, |
| "loss": 0.0282, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.7672634271099743, |
| "grad_norm": 1.7143312871829341, |
| "learning_rate": 3.3042026752822254e-07, |
| "loss": 0.0728, |
| "step": 691 |
| }, |
| { |
| "epoch": 1.7698209718670077, |
| "grad_norm": 1.8174618944743401, |
| "learning_rate": 3.2327704599312283e-07, |
| "loss": 0.0644, |
| "step": 692 |
| }, |
| { |
| "epoch": 1.772378516624041, |
| "grad_norm": 1.6070849800659126, |
| "learning_rate": 3.16209303659773e-07, |
| "loss": 0.0597, |
| "step": 693 |
| }, |
| { |
| "epoch": 1.7749360613810743, |
| "grad_norm": 1.9286508631962667, |
| "learning_rate": 3.0921715459683753e-07, |
| "loss": 0.0502, |
| "step": 694 |
| }, |
| { |
| "epoch": 1.7774936061381075, |
| "grad_norm": 1.5349318887338477, |
| "learning_rate": 3.0230071165295804e-07, |
| "loss": 0.0456, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.7800511508951407, |
| "grad_norm": 1.4901091372165034, |
| "learning_rate": 2.95460086454929e-07, |
| "loss": 0.0528, |
| "step": 696 |
| }, |
| { |
| "epoch": 1.7826086956521738, |
| "grad_norm": 1.918618234904437, |
| "learning_rate": 2.88695389405898e-07, |
| "loss": 0.0784, |
| "step": 697 |
| }, |
| { |
| "epoch": 1.785166240409207, |
| "grad_norm": 1.1882113006459234, |
| "learning_rate": 2.820067296835799e-07, |
| "loss": 0.0445, |
| "step": 698 |
| }, |
| { |
| "epoch": 1.7877237851662404, |
| "grad_norm": 1.9596226833895882, |
| "learning_rate": 2.753942152385014e-07, |
| "loss": 0.0688, |
| "step": 699 |
| }, |
| { |
| "epoch": 1.7902813299232738, |
| "grad_norm": 1.4807925998658464, |
| "learning_rate": 2.688579527922514e-07, |
| "loss": 0.0402, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.792838874680307, |
| "grad_norm": 1.5494471130294227, |
| "learning_rate": 2.6239804783576294e-07, |
| "loss": 0.0543, |
| "step": 701 |
| }, |
| { |
| "epoch": 1.7953964194373402, |
| "grad_norm": 1.710830595848272, |
| "learning_rate": 2.560146046276135e-07, |
| "loss": 0.078, |
| "step": 702 |
| }, |
| { |
| "epoch": 1.7979539641943734, |
| "grad_norm": 1.6103330225310404, |
| "learning_rate": 2.4970772619233475e-07, |
| "loss": 0.0609, |
| "step": 703 |
| }, |
| { |
| "epoch": 1.8005115089514065, |
| "grad_norm": 2.1596511568230294, |
| "learning_rate": 2.4347751431875453e-07, |
| "loss": 0.0849, |
| "step": 704 |
| }, |
| { |
| "epoch": 1.80306905370844, |
| "grad_norm": 1.6435842864771086, |
| "learning_rate": 2.373240695583534e-07, |
| "loss": 0.0789, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.8056265984654731, |
| "grad_norm": 1.5677270813395774, |
| "learning_rate": 2.3124749122364286e-07, |
| "loss": 0.0664, |
| "step": 706 |
| }, |
| { |
| "epoch": 1.8081841432225065, |
| "grad_norm": 1.710908609250228, |
| "learning_rate": 2.2524787738656073e-07, |
| "loss": 0.0554, |
| "step": 707 |
| }, |
| { |
| "epoch": 1.8107416879795397, |
| "grad_norm": 1.6048364103546928, |
| "learning_rate": 2.1932532487688784e-07, |
| "loss": 0.0532, |
| "step": 708 |
| }, |
| { |
| "epoch": 1.813299232736573, |
| "grad_norm": 1.4644455792556497, |
| "learning_rate": 2.1347992928068884e-07, |
| "loss": 0.0603, |
| "step": 709 |
| }, |
| { |
| "epoch": 1.815856777493606, |
| "grad_norm": 1.0091446695350044, |
| "learning_rate": 2.0771178493876387e-07, |
| "loss": 0.0313, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.8184143222506393, |
| "grad_norm": 1.987324391957735, |
| "learning_rate": 2.0202098494513157e-07, |
| "loss": 0.0747, |
| "step": 711 |
| }, |
| { |
| "epoch": 1.8209718670076727, |
| "grad_norm": 1.5012232578184002, |
| "learning_rate": 1.964076211455246e-07, |
| "loss": 0.0491, |
| "step": 712 |
| }, |
| { |
| "epoch": 1.8235294117647058, |
| "grad_norm": 1.6839252722578555, |
| "learning_rate": 1.908717841359048e-07, |
| "loss": 0.065, |
| "step": 713 |
| }, |
| { |
| "epoch": 1.8260869565217392, |
| "grad_norm": 1.764351815514701, |
| "learning_rate": 1.8541356326100436e-07, |
| "loss": 0.049, |
| "step": 714 |
| }, |
| { |
| "epoch": 1.8286445012787724, |
| "grad_norm": 1.9346780735975497, |
| "learning_rate": 1.800330466128808e-07, |
| "loss": 0.0635, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.8312020460358056, |
| "grad_norm": 1.1857733970794198, |
| "learning_rate": 1.7473032102949983e-07, |
| "loss": 0.0427, |
| "step": 716 |
| }, |
| { |
| "epoch": 1.8337595907928388, |
| "grad_norm": 2.126736518256365, |
| "learning_rate": 1.695054720933309e-07, |
| "loss": 0.0884, |
| "step": 717 |
| }, |
| { |
| "epoch": 1.836317135549872, |
| "grad_norm": 1.5012079730873682, |
| "learning_rate": 1.6435858412996275e-07, |
| "loss": 0.0525, |
| "step": 718 |
| }, |
| { |
| "epoch": 1.8388746803069054, |
| "grad_norm": 1.9920935892905687, |
| "learning_rate": 1.5928974020674947e-07, |
| "loss": 0.0633, |
| "step": 719 |
| }, |
| { |
| "epoch": 1.8414322250639388, |
| "grad_norm": 1.4935126462129222, |
| "learning_rate": 1.542990221314644e-07, |
| "loss": 0.0416, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.843989769820972, |
| "grad_norm": 1.5667962682302419, |
| "learning_rate": 1.4938651045098174e-07, |
| "loss": 0.0401, |
| "step": 721 |
| }, |
| { |
| "epoch": 1.8465473145780051, |
| "grad_norm": 2.3165346207133957, |
| "learning_rate": 1.445522844499775e-07, |
| "loss": 0.0555, |
| "step": 722 |
| }, |
| { |
| "epoch": 1.8491048593350383, |
| "grad_norm": 1.9816583062122102, |
| "learning_rate": 1.3979642214964728e-07, |
| "loss": 0.0648, |
| "step": 723 |
| }, |
| { |
| "epoch": 1.8516624040920715, |
| "grad_norm": 2.4975848569979355, |
| "learning_rate": 1.3511900030644954e-07, |
| "loss": 0.0852, |
| "step": 724 |
| }, |
| { |
| "epoch": 1.854219948849105, |
| "grad_norm": 2.2176696568855374, |
| "learning_rate": 1.3052009441086533e-07, |
| "loss": 0.0691, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.856777493606138, |
| "grad_norm": 1.9148609494936542, |
| "learning_rate": 1.2599977868618052e-07, |
| "loss": 0.0518, |
| "step": 726 |
| }, |
| { |
| "epoch": 1.8593350383631715, |
| "grad_norm": 2.056805786998454, |
| "learning_rate": 1.215581260872889e-07, |
| "loss": 0.0703, |
| "step": 727 |
| }, |
| { |
| "epoch": 1.8618925831202047, |
| "grad_norm": 1.156766803459366, |
| "learning_rate": 1.1719520829951203e-07, |
| "loss": 0.0393, |
| "step": 728 |
| }, |
| { |
| "epoch": 1.8644501278772379, |
| "grad_norm": 1.1820320080282383, |
| "learning_rate": 1.1291109573744574e-07, |
| "loss": 0.0336, |
| "step": 729 |
| }, |
| { |
| "epoch": 1.867007672634271, |
| "grad_norm": 1.7806788218166225, |
| "learning_rate": 1.087058575438199e-07, |
| "loss": 0.0737, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.8695652173913042, |
| "grad_norm": 1.5667661177865002, |
| "learning_rate": 1.0457956158838545e-07, |
| "loss": 0.0625, |
| "step": 731 |
| }, |
| { |
| "epoch": 1.8721227621483376, |
| "grad_norm": 1.186958803626775, |
| "learning_rate": 1.0053227446681912e-07, |
| "loss": 0.0496, |
| "step": 732 |
| }, |
| { |
| "epoch": 1.8746803069053708, |
| "grad_norm": 1.8521104628392622, |
| "learning_rate": 9.656406149964548e-08, |
| "loss": 0.0756, |
| "step": 733 |
| }, |
| { |
| "epoch": 1.8772378516624042, |
| "grad_norm": 1.5291622162529863, |
| "learning_rate": 9.267498673118547e-08, |
| "loss": 0.0609, |
| "step": 734 |
| }, |
| { |
| "epoch": 1.8797953964194374, |
| "grad_norm": 1.5425572563924586, |
| "learning_rate": 8.886511292852395e-08, |
| "loss": 0.0629, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.8823529411764706, |
| "grad_norm": 1.823405533973878, |
| "learning_rate": 8.513450158049109e-08, |
| "loss": 0.0627, |
| "step": 736 |
| }, |
| { |
| "epoch": 1.8849104859335037, |
| "grad_norm": 1.7161639545427452, |
| "learning_rate": 8.148321289667749e-08, |
| "loss": 0.0572, |
| "step": 737 |
| }, |
| { |
| "epoch": 1.887468030690537, |
| "grad_norm": 1.2571507117703948, |
| "learning_rate": 7.791130580645623e-08, |
| "loss": 0.0485, |
| "step": 738 |
| }, |
| { |
| "epoch": 1.8900255754475703, |
| "grad_norm": 1.575736071352483, |
| "learning_rate": 7.441883795803462e-08, |
| "loss": 0.0572, |
| "step": 739 |
| }, |
| { |
| "epoch": 1.8925831202046037, |
| "grad_norm": 1.187272056902486, |
| "learning_rate": 7.100586571752444e-08, |
| "loss": 0.0392, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.895140664961637, |
| "grad_norm": 1.2982831651687614, |
| "learning_rate": 6.767244416802988e-08, |
| "loss": 0.0482, |
| "step": 741 |
| }, |
| { |
| "epoch": 1.89769820971867, |
| "grad_norm": 1.5142498294577111, |
| "learning_rate": 6.441862710876102e-08, |
| "loss": 0.0589, |
| "step": 742 |
| }, |
| { |
| "epoch": 1.9002557544757033, |
| "grad_norm": 1.5225022048090735, |
| "learning_rate": 6.124446705416343e-08, |
| "loss": 0.0385, |
| "step": 743 |
| }, |
| { |
| "epoch": 1.9028132992327365, |
| "grad_norm": 1.485678760379258, |
| "learning_rate": 5.815001523307162e-08, |
| "loss": 0.0559, |
| "step": 744 |
| }, |
| { |
| "epoch": 1.9053708439897699, |
| "grad_norm": 1.5322533890589751, |
| "learning_rate": 5.513532158788193e-08, |
| "loss": 0.0627, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.907928388746803, |
| "grad_norm": 1.9793699147122528, |
| "learning_rate": 5.220043477374759e-08, |
| "loss": 0.0895, |
| "step": 746 |
| }, |
| { |
| "epoch": 1.9104859335038364, |
| "grad_norm": 1.8767345948228846, |
| "learning_rate": 4.934540215779271e-08, |
| "loss": 0.062, |
| "step": 747 |
| }, |
| { |
| "epoch": 1.9130434782608696, |
| "grad_norm": 1.8530710891566307, |
| "learning_rate": 4.657026981834623e-08, |
| "loss": 0.0725, |
| "step": 748 |
| }, |
| { |
| "epoch": 1.9156010230179028, |
| "grad_norm": 1.5959856775583183, |
| "learning_rate": 4.3875082544201364e-08, |
| "loss": 0.066, |
| "step": 749 |
| }, |
| { |
| "epoch": 1.918158567774936, |
| "grad_norm": 1.6751349325104308, |
| "learning_rate": 4.125988383388957e-08, |
| "loss": 0.0601, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.9207161125319692, |
| "grad_norm": 1.4734314475739716, |
| "learning_rate": 3.87247158949805e-08, |
| "loss": 0.0542, |
| "step": 751 |
| }, |
| { |
| "epoch": 1.9232736572890026, |
| "grad_norm": 1.5924404071141742, |
| "learning_rate": 3.626961964340203e-08, |
| "loss": 0.0724, |
| "step": 752 |
| }, |
| { |
| "epoch": 1.9258312020460358, |
| "grad_norm": 1.7826522089646637, |
| "learning_rate": 3.389463470277576e-08, |
| "loss": 0.0768, |
| "step": 753 |
| }, |
| { |
| "epoch": 1.9283887468030692, |
| "grad_norm": 1.6983017020209106, |
| "learning_rate": 3.159979940378088e-08, |
| "loss": 0.0592, |
| "step": 754 |
| }, |
| { |
| "epoch": 1.9309462915601023, |
| "grad_norm": 2.3762549911017707, |
| "learning_rate": 2.938515078353521e-08, |
| "loss": 0.0738, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.9335038363171355, |
| "grad_norm": 1.5837970119495974, |
| "learning_rate": 2.725072458499567e-08, |
| "loss": 0.0721, |
| "step": 756 |
| }, |
| { |
| "epoch": 1.9360613810741687, |
| "grad_norm": 1.8572244239313698, |
| "learning_rate": 2.519655525638376e-08, |
| "loss": 0.0892, |
| "step": 757 |
| }, |
| { |
| "epoch": 1.938618925831202, |
| "grad_norm": 1.9352353669922568, |
| "learning_rate": 2.3222675950627106e-08, |
| "loss": 0.0569, |
| "step": 758 |
| }, |
| { |
| "epoch": 1.9411764705882353, |
| "grad_norm": 1.3105846383254622, |
| "learning_rate": 2.1329118524827662e-08, |
| "loss": 0.0489, |
| "step": 759 |
| }, |
| { |
| "epoch": 1.9437340153452687, |
| "grad_norm": 1.6708577410266003, |
| "learning_rate": 1.9515913539743247e-08, |
| "loss": 0.063, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.9462915601023019, |
| "grad_norm": 1.4618266441401186, |
| "learning_rate": 1.7783090259297918e-08, |
| "loss": 0.0609, |
| "step": 761 |
| }, |
| { |
| "epoch": 1.948849104859335, |
| "grad_norm": 1.4615574349262082, |
| "learning_rate": 1.613067665010959e-08, |
| "loss": 0.0449, |
| "step": 762 |
| }, |
| { |
| "epoch": 1.9514066496163682, |
| "grad_norm": 2.07290740502851, |
| "learning_rate": 1.4558699381034825e-08, |
| "loss": 0.0781, |
| "step": 763 |
| }, |
| { |
| "epoch": 1.9539641943734014, |
| "grad_norm": 1.6805936964351897, |
| "learning_rate": 1.3067183822742525e-08, |
| "loss": 0.0529, |
| "step": 764 |
| }, |
| { |
| "epoch": 1.9565217391304348, |
| "grad_norm": 1.7159228063505934, |
| "learning_rate": 1.1656154047303691e-08, |
| "loss": 0.0497, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.959079283887468, |
| "grad_norm": 2.8268253984668634, |
| "learning_rate": 1.0325632827801745e-08, |
| "loss": 0.0748, |
| "step": 766 |
| }, |
| { |
| "epoch": 1.9616368286445014, |
| "grad_norm": 1.9193840541256346, |
| "learning_rate": 9.075641637964483e-09, |
| "loss": 0.0739, |
| "step": 767 |
| }, |
| { |
| "epoch": 1.9641943734015346, |
| "grad_norm": 1.7559700086749113, |
| "learning_rate": 7.906200651819907e-09, |
| "loss": 0.0459, |
| "step": 768 |
| }, |
| { |
| "epoch": 1.9667519181585678, |
| "grad_norm": 1.5806477845906548, |
| "learning_rate": 6.817328743368712e-09, |
| "loss": 0.0535, |
| "step": 769 |
| }, |
| { |
| "epoch": 1.969309462915601, |
| "grad_norm": 1.9343097264191595, |
| "learning_rate": 5.809043486279531e-09, |
| "loss": 0.0894, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.9718670076726341, |
| "grad_norm": 1.5764067486137285, |
| "learning_rate": 4.881361153606934e-09, |
| "loss": 0.0498, |
| "step": 771 |
| }, |
| { |
| "epoch": 1.9744245524296675, |
| "grad_norm": 1.4007499728745463, |
| "learning_rate": 4.034296717527752e-09, |
| "loss": 0.0572, |
| "step": 772 |
| }, |
| { |
| "epoch": 1.976982097186701, |
| "grad_norm": 1.615233023165604, |
| "learning_rate": 3.2678638490996064e-09, |
| "loss": 0.0536, |
| "step": 773 |
| }, |
| { |
| "epoch": 1.979539641943734, |
| "grad_norm": 1.782038931727096, |
| "learning_rate": 2.5820749180388573e-09, |
| "loss": 0.0712, |
| "step": 774 |
| }, |
| { |
| "epoch": 1.9820971867007673, |
| "grad_norm": 1.6623499709903002, |
| "learning_rate": 1.976940992523546e-09, |
| "loss": 0.0584, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.9846547314578005, |
| "grad_norm": 2.255461841110182, |
| "learning_rate": 1.4524718390140913e-09, |
| "loss": 0.096, |
| "step": 776 |
| }, |
| { |
| "epoch": 1.9872122762148337, |
| "grad_norm": 1.5422911151085525, |
| "learning_rate": 1.0086759220934162e-09, |
| "loss": 0.0624, |
| "step": 777 |
| }, |
| { |
| "epoch": 1.989769820971867, |
| "grad_norm": 1.5624451885477848, |
| "learning_rate": 6.455604043331676e-10, |
| "loss": 0.0611, |
| "step": 778 |
| }, |
| { |
| "epoch": 1.9923273657289002, |
| "grad_norm": 1.1618378160708118, |
| "learning_rate": 3.631311461765874e-10, |
| "loss": 0.0338, |
| "step": 779 |
| }, |
| { |
| "epoch": 1.9948849104859336, |
| "grad_norm": 2.0229168127939583, |
| "learning_rate": 1.6139270584358823e-10, |
| "loss": 0.0759, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.9974424552429668, |
| "grad_norm": 1.6493382864517687, |
| "learning_rate": 4.034833925969928e-11, |
| "loss": 0.0716, |
| "step": 781 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.867208361253687, |
| "learning_rate": 0.0, |
| "loss": 0.0635, |
| "step": 782 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 782, |
| "total_flos": 4414597447680.0, |
| "train_loss": 0.1475916886583085, |
| "train_runtime": 1693.0919, |
| "train_samples_per_second": 3.695, |
| "train_steps_per_second": 0.462 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 782, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 50000000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4414597447680.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |