| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.2526315789473683, |
| "eval_steps": 88, |
| "global_step": 550, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.002275960170697013, |
| "grad_norm": 13.768637657165527, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.6233, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.002275960170697013, |
| "eval_loss": 0.6125021576881409, |
| "eval_runtime": 152.4238, |
| "eval_samples_per_second": 3.28, |
| "eval_steps_per_second": 0.82, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.004551920341394026, |
| "grad_norm": 15.011153221130371, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.635, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0068278805120910386, |
| "grad_norm": 16.715984344482422, |
| "learning_rate": 3.0000000000000005e-06, |
| "loss": 0.6105, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.009103840682788052, |
| "grad_norm": 19.56490135192871, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.642, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.011379800853485065, |
| "grad_norm": 13.951680183410645, |
| "learning_rate": 5e-06, |
| "loss": 0.5943, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.013655761024182077, |
| "grad_norm": 13.452520370483398, |
| "learning_rate": 6.000000000000001e-06, |
| "loss": 0.6028, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.015931721194879088, |
| "grad_norm": 1.6321176290512085, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 0.6105, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.018207681365576104, |
| "grad_norm": 8.300277709960938, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.5929, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.020483641536273117, |
| "grad_norm": 17.030431747436523, |
| "learning_rate": 9e-06, |
| "loss": 0.6348, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.02275960170697013, |
| "grad_norm": 1.8888440132141113, |
| "learning_rate": 1e-05, |
| "loss": 0.6508, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02503556187766714, |
| "grad_norm": 4.198676109313965, |
| "learning_rate": 9.999077384384731e-06, |
| "loss": 0.6275, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.027311522048364154, |
| "grad_norm": 11.794512748718262, |
| "learning_rate": 9.998152851535443e-06, |
| "loss": 0.5683, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.029587482219061167, |
| "grad_norm": 15.177773475646973, |
| "learning_rate": 9.99722639546978e-06, |
| "loss": 0.5764, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.031863442389758176, |
| "grad_norm": 9.44954776763916, |
| "learning_rate": 9.996298010180474e-06, |
| "loss": 0.6252, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.034139402560455195, |
| "grad_norm": 12.481877326965332, |
| "learning_rate": 9.995367689635207e-06, |
| "loss": 0.6145, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03641536273115221, |
| "grad_norm": 13.162659645080566, |
| "learning_rate": 9.994435427776491e-06, |
| "loss": 0.6246, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.03869132290184922, |
| "grad_norm": 2.3987083435058594, |
| "learning_rate": 9.993501218521528e-06, |
| "loss": 0.6159, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.04096728307254623, |
| "grad_norm": 13.608319282531738, |
| "learning_rate": 9.992565055762082e-06, |
| "loss": 0.622, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.043243243243243246, |
| "grad_norm": 3.531569242477417, |
| "learning_rate": 9.991626933364347e-06, |
| "loss": 0.5668, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.04551920341394026, |
| "grad_norm": 4.718797206878662, |
| "learning_rate": 9.990686845168801e-06, |
| "loss": 0.5602, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04779516358463727, |
| "grad_norm": 12.422525405883789, |
| "learning_rate": 9.989744784990097e-06, |
| "loss": 0.5826, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.05007112375533428, |
| "grad_norm": 7.886673450469971, |
| "learning_rate": 9.988800746616893e-06, |
| "loss": 0.5745, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.052347083926031296, |
| "grad_norm": 3.4852566719055176, |
| "learning_rate": 9.98785472381175e-06, |
| "loss": 0.5714, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.05462304409672831, |
| "grad_norm": 1.8027795553207397, |
| "learning_rate": 9.986906710310966e-06, |
| "loss": 0.5541, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.05689900426742532, |
| "grad_norm": 1.900911569595337, |
| "learning_rate": 9.985956699824462e-06, |
| "loss": 0.5634, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.059174964438122334, |
| "grad_norm": 4.22526741027832, |
| "learning_rate": 9.985004686035616e-06, |
| "loss": 0.5851, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.061450924608819346, |
| "grad_norm": 0.5803316831588745, |
| "learning_rate": 9.98405066260115e-06, |
| "loss": 0.5484, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.06372688477951635, |
| "grad_norm": 6.2574849128723145, |
| "learning_rate": 9.983094623150975e-06, |
| "loss": 0.5741, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.06600284495021337, |
| "grad_norm": 1.168273687362671, |
| "learning_rate": 9.98213656128805e-06, |
| "loss": 0.5554, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.06827880512091039, |
| "grad_norm": 9.939175605773926, |
| "learning_rate": 9.981176470588237e-06, |
| "loss": 0.5299, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0705547652916074, |
| "grad_norm": 1.4646342992782593, |
| "learning_rate": 9.980214344600165e-06, |
| "loss": 0.5762, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.07283072546230442, |
| "grad_norm": 1.7419993877410889, |
| "learning_rate": 9.979250176845085e-06, |
| "loss": 0.5367, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.07510668563300142, |
| "grad_norm": 2.708136558532715, |
| "learning_rate": 9.978283960816712e-06, |
| "loss": 0.5534, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.07738264580369844, |
| "grad_norm": 0.8647623062133789, |
| "learning_rate": 9.977315689981097e-06, |
| "loss": 0.5096, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.07965860597439545, |
| "grad_norm": 0.7143182754516602, |
| "learning_rate": 9.976345357776464e-06, |
| "loss": 0.5822, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.08193456614509247, |
| "grad_norm": 5.553163528442383, |
| "learning_rate": 9.975372957613073e-06, |
| "loss": 0.5044, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.08421052631578947, |
| "grad_norm": 1.1372196674346924, |
| "learning_rate": 9.97439848287306e-06, |
| "loss": 0.561, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.08648648648648649, |
| "grad_norm": 0.3290291130542755, |
| "learning_rate": 9.9734219269103e-06, |
| "loss": 0.5496, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0887624466571835, |
| "grad_norm": 0.7019220590591431, |
| "learning_rate": 9.972443283050244e-06, |
| "loss": 0.5228, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.09103840682788052, |
| "grad_norm": 2.112306594848633, |
| "learning_rate": 9.971462544589776e-06, |
| "loss": 0.5505, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.09331436699857752, |
| "grad_norm": 0.37200048565864563, |
| "learning_rate": 9.970479704797048e-06, |
| "loss": 0.5425, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.09559032716927454, |
| "grad_norm": 1.8724614381790161, |
| "learning_rate": 9.969494756911346e-06, |
| "loss": 0.5259, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.09786628733997155, |
| "grad_norm": 0.9688167572021484, |
| "learning_rate": 9.968507694142911e-06, |
| "loss": 0.5097, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.10014224751066857, |
| "grad_norm": 1.415741205215454, |
| "learning_rate": 9.967518509672797e-06, |
| "loss": 0.508, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.10241820768136557, |
| "grad_norm": 1.167455792427063, |
| "learning_rate": 9.966527196652723e-06, |
| "loss": 0.5521, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.10469416785206259, |
| "grad_norm": 1.2373249530792236, |
| "learning_rate": 9.965533748204883e-06, |
| "loss": 0.5584, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.1069701280227596, |
| "grad_norm": 2.306133985519409, |
| "learning_rate": 9.96453815742183e-06, |
| "loss": 0.521, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.10924608819345662, |
| "grad_norm": 0.6688553094863892, |
| "learning_rate": 9.963540417366276e-06, |
| "loss": 0.5565, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.11152204836415362, |
| "grad_norm": 1.8641799688339233, |
| "learning_rate": 9.962540521070958e-06, |
| "loss": 0.531, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.11379800853485064, |
| "grad_norm": 0.9236503839492798, |
| "learning_rate": 9.961538461538463e-06, |
| "loss": 0.4942, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.11607396870554765, |
| "grad_norm": 0.6534273624420166, |
| "learning_rate": 9.960534231741067e-06, |
| "loss": 0.5041, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.11834992887624467, |
| "grad_norm": 0.8832672834396362, |
| "learning_rate": 9.959527824620575e-06, |
| "loss": 0.4941, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.12062588904694167, |
| "grad_norm": 0.5689805746078491, |
| "learning_rate": 9.95851923308815e-06, |
| "loss": 0.5048, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.12290184921763869, |
| "grad_norm": 2.131309986114502, |
| "learning_rate": 9.957508450024145e-06, |
| "loss": 0.499, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.1251778093883357, |
| "grad_norm": 0.9756704568862915, |
| "learning_rate": 9.956495468277947e-06, |
| "loss": 0.5084, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1274537695590327, |
| "grad_norm": 1.8052393198013306, |
| "learning_rate": 9.955480280667798e-06, |
| "loss": 0.5328, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.12972972972972974, |
| "grad_norm": 1.034996509552002, |
| "learning_rate": 9.954462879980623e-06, |
| "loss": 0.5255, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.13200568990042674, |
| "grad_norm": 1.0904641151428223, |
| "learning_rate": 9.953443258971873e-06, |
| "loss": 0.5035, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.13428165007112375, |
| "grad_norm": 0.8282355666160583, |
| "learning_rate": 9.952421410365336e-06, |
| "loss": 0.506, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.13655761024182078, |
| "grad_norm": 0.2674669325351715, |
| "learning_rate": 9.951397326852978e-06, |
| "loss": 0.5003, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1388335704125178, |
| "grad_norm": 0.8057956099510193, |
| "learning_rate": 9.950371001094759e-06, |
| "loss": 0.5004, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.1411095305832148, |
| "grad_norm": 0.55033278465271, |
| "learning_rate": 9.949342425718462e-06, |
| "loss": 0.5063, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.1433854907539118, |
| "grad_norm": 0.40570515394210815, |
| "learning_rate": 9.94831159331952e-06, |
| "loss": 0.5085, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.14566145092460883, |
| "grad_norm": 1.0614453554153442, |
| "learning_rate": 9.947278496460825e-06, |
| "loss": 0.4805, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.14793741109530584, |
| "grad_norm": 0.31362247467041016, |
| "learning_rate": 9.946243127672572e-06, |
| "loss": 0.5105, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.15021337126600284, |
| "grad_norm": 0.775253415107727, |
| "learning_rate": 9.945205479452056e-06, |
| "loss": 0.481, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.15248933143669985, |
| "grad_norm": 0.7623254656791687, |
| "learning_rate": 9.944165544263502e-06, |
| "loss": 0.5027, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.15476529160739688, |
| "grad_norm": 0.376059889793396, |
| "learning_rate": 9.94312331453788e-06, |
| "loss": 0.5162, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.1570412517780939, |
| "grad_norm": 0.7439245581626892, |
| "learning_rate": 9.942078782672722e-06, |
| "loss": 0.519, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.1593172119487909, |
| "grad_norm": 1.2145835161209106, |
| "learning_rate": 9.941031941031942e-06, |
| "loss": 0.5068, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1615931721194879, |
| "grad_norm": 0.5037180185317993, |
| "learning_rate": 9.93998278194564e-06, |
| "loss": 0.5057, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.16386913229018493, |
| "grad_norm": 0.7445140480995178, |
| "learning_rate": 9.938931297709925e-06, |
| "loss": 0.5182, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.16614509246088194, |
| "grad_norm": 1.309226632118225, |
| "learning_rate": 9.937877480586715e-06, |
| "loss": 0.5338, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.16842105263157894, |
| "grad_norm": 0.5644197463989258, |
| "learning_rate": 9.936821322803555e-06, |
| "loss": 0.4862, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.17069701280227595, |
| "grad_norm": 0.7700789570808411, |
| "learning_rate": 9.935762816553429e-06, |
| "loss": 0.527, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.17297297297297298, |
| "grad_norm": 0.374563068151474, |
| "learning_rate": 9.93470195399456e-06, |
| "loss": 0.5481, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.17524893314367, |
| "grad_norm": 0.5368718504905701, |
| "learning_rate": 9.933638727250218e-06, |
| "loss": 0.5311, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.177524893314367, |
| "grad_norm": 2.0363986492156982, |
| "learning_rate": 9.93257312840853e-06, |
| "loss": 0.4818, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.179800853485064, |
| "grad_norm": 0.6026269197463989, |
| "learning_rate": 9.931505149522274e-06, |
| "loss": 0.5074, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.18207681365576103, |
| "grad_norm": 1.5094815492630005, |
| "learning_rate": 9.930434782608697e-06, |
| "loss": 0.5266, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.18435277382645804, |
| "grad_norm": 0.7816861867904663, |
| "learning_rate": 9.929362019649299e-06, |
| "loss": 0.5715, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.18662873399715504, |
| "grad_norm": 0.6554467678070068, |
| "learning_rate": 9.928286852589644e-06, |
| "loss": 0.513, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.18890469416785205, |
| "grad_norm": 0.365382581949234, |
| "learning_rate": 9.927209273339152e-06, |
| "loss": 0.5209, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.19118065433854908, |
| "grad_norm": 1.5383721590042114, |
| "learning_rate": 9.926129273770902e-06, |
| "loss": 0.4893, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.1934566145092461, |
| "grad_norm": 2.3573522567749023, |
| "learning_rate": 9.925046845721427e-06, |
| "loss": 0.5048, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.1957325746799431, |
| "grad_norm": 0.44573724269866943, |
| "learning_rate": 9.923961980990498e-06, |
| "loss": 0.4882, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.1980085348506401, |
| "grad_norm": 0.35019180178642273, |
| "learning_rate": 9.92287467134093e-06, |
| "loss": 0.5381, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.20028449502133713, |
| "grad_norm": 0.5049303770065308, |
| "learning_rate": 9.92178490849837e-06, |
| "loss": 0.503, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.20028449502133713, |
| "eval_loss": 0.5059446692466736, |
| "eval_runtime": 153.0766, |
| "eval_samples_per_second": 3.266, |
| "eval_steps_per_second": 0.817, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.20256045519203414, |
| "grad_norm": 0.4021438956260681, |
| "learning_rate": 9.920692684151088e-06, |
| "loss": 0.4953, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.20483641536273114, |
| "grad_norm": 0.7450408935546875, |
| "learning_rate": 9.91959798994975e-06, |
| "loss": 0.5125, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.20711237553342818, |
| "grad_norm": 0.416536420583725, |
| "learning_rate": 9.918500817507232e-06, |
| "loss": 0.4953, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.20938833570412518, |
| "grad_norm": 0.6918525695800781, |
| "learning_rate": 9.91740115839839e-06, |
| "loss": 0.5076, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.2116642958748222, |
| "grad_norm": 1.5085270404815674, |
| "learning_rate": 9.916299004159839e-06, |
| "loss": 0.516, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.2139402560455192, |
| "grad_norm": 0.3807312548160553, |
| "learning_rate": 9.915194346289754e-06, |
| "loss": 0.5073, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.21621621621621623, |
| "grad_norm": 0.5535611510276794, |
| "learning_rate": 9.914087176247632e-06, |
| "loss": 0.5781, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.21849217638691323, |
| "grad_norm": 1.2543669939041138, |
| "learning_rate": 9.912977485454088e-06, |
| "loss": 0.5442, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.22076813655761024, |
| "grad_norm": 0.45313432812690735, |
| "learning_rate": 9.911865265290619e-06, |
| "loss": 0.5187, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.22304409672830725, |
| "grad_norm": 0.4666624963283539, |
| "learning_rate": 9.910750507099394e-06, |
| "loss": 0.5296, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.22532005689900428, |
| "grad_norm": 0.38917532563209534, |
| "learning_rate": 9.90963320218302e-06, |
| "loss": 0.5246, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.22759601706970128, |
| "grad_norm": 1.5423916578292847, |
| "learning_rate": 9.908513341804322e-06, |
| "loss": 0.5175, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2298719772403983, |
| "grad_norm": 0.2859554588794708, |
| "learning_rate": 9.90739091718611e-06, |
| "loss": 0.5152, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.2321479374110953, |
| "grad_norm": 0.47693172097206116, |
| "learning_rate": 9.906265919510954e-06, |
| "loss": 0.489, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.23442389758179233, |
| "grad_norm": 0.2613968849182129, |
| "learning_rate": 9.90513833992095e-06, |
| "loss": 0.5094, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.23669985775248933, |
| "grad_norm": 0.4281187653541565, |
| "learning_rate": 9.904008169517489e-06, |
| "loss": 0.4941, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.23897581792318634, |
| "grad_norm": 0.48537370562553406, |
| "learning_rate": 9.902875399361023e-06, |
| "loss": 0.4996, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.24125177809388335, |
| "grad_norm": 0.7052855491638184, |
| "learning_rate": 9.90174002047083e-06, |
| "loss": 0.4988, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.24352773826458038, |
| "grad_norm": 1.2201290130615234, |
| "learning_rate": 9.900602023824775e-06, |
| "loss": 0.5419, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.24580369843527738, |
| "grad_norm": 0.2858637273311615, |
| "learning_rate": 9.89946140035907e-06, |
| "loss": 0.5326, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.2480796586059744, |
| "grad_norm": 0.3222598433494568, |
| "learning_rate": 9.898318140968033e-06, |
| "loss": 0.5041, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.2503556187766714, |
| "grad_norm": 0.4859575927257538, |
| "learning_rate": 9.897172236503858e-06, |
| "loss": 0.542, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.25263157894736843, |
| "grad_norm": 0.2807503938674927, |
| "learning_rate": 9.89602367777635e-06, |
| "loss": 0.4883, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.2549075391180654, |
| "grad_norm": 0.8720782995223999, |
| "learning_rate": 9.894872455552694e-06, |
| "loss": 0.5337, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.25718349928876244, |
| "grad_norm": 1.119335651397705, |
| "learning_rate": 9.893718560557205e-06, |
| "loss": 0.495, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.2594594594594595, |
| "grad_norm": 0.5547726154327393, |
| "learning_rate": 9.892561983471075e-06, |
| "loss": 0.4838, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.26173541963015645, |
| "grad_norm": 0.6144586801528931, |
| "learning_rate": 9.891402714932129e-06, |
| "loss": 0.5241, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2640113798008535, |
| "grad_norm": 0.5636857748031616, |
| "learning_rate": 9.890240745534561e-06, |
| "loss": 0.4913, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.2662873399715505, |
| "grad_norm": 0.29893824458122253, |
| "learning_rate": 9.889076065828691e-06, |
| "loss": 0.5242, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.2685633001422475, |
| "grad_norm": 0.6469008922576904, |
| "learning_rate": 9.887908666320707e-06, |
| "loss": 0.5167, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.27083926031294453, |
| "grad_norm": 0.40241923928260803, |
| "learning_rate": 9.8867385374724e-06, |
| "loss": 0.5227, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.27311522048364156, |
| "grad_norm": 0.3385710120201111, |
| "learning_rate": 9.885565669700911e-06, |
| "loss": 0.4929, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.27539118065433854, |
| "grad_norm": 0.28914356231689453, |
| "learning_rate": 9.884390053378467e-06, |
| "loss": 0.5345, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.2776671408250356, |
| "grad_norm": 0.32132378220558167, |
| "learning_rate": 9.883211678832118e-06, |
| "loss": 0.5019, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.27994310099573255, |
| "grad_norm": 0.5113037824630737, |
| "learning_rate": 9.88203053634347e-06, |
| "loss": 0.5445, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.2822190611664296, |
| "grad_norm": 0.6347448825836182, |
| "learning_rate": 9.880846616148421e-06, |
| "loss": 0.5212, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.2844950213371266, |
| "grad_norm": 0.9155266880989075, |
| "learning_rate": 9.879659908436889e-06, |
| "loss": 0.514, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2867709815078236, |
| "grad_norm": 1.8125205039978027, |
| "learning_rate": 9.878470403352541e-06, |
| "loss": 0.5174, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.28904694167852063, |
| "grad_norm": 0.9333555102348328, |
| "learning_rate": 9.877278090992529e-06, |
| "loss": 0.5025, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.29132290184921766, |
| "grad_norm": 0.24629177153110504, |
| "learning_rate": 9.876082961407194e-06, |
| "loss": 0.4798, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.29359886201991464, |
| "grad_norm": 0.6128231287002563, |
| "learning_rate": 9.874885004599817e-06, |
| "loss": 0.4844, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.2958748221906117, |
| "grad_norm": 0.5367395281791687, |
| "learning_rate": 9.873684210526317e-06, |
| "loss": 0.4997, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.29815078236130865, |
| "grad_norm": 0.41183608770370483, |
| "learning_rate": 9.872480569094982e-06, |
| "loss": 0.5128, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.3004267425320057, |
| "grad_norm": 0.5677710175514221, |
| "learning_rate": 9.871274070166184e-06, |
| "loss": 0.5103, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.3027027027027027, |
| "grad_norm": 0.9031636118888855, |
| "learning_rate": 9.870064703552094e-06, |
| "loss": 0.5087, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.3049786628733997, |
| "grad_norm": 0.3080916106700897, |
| "learning_rate": 9.868852459016395e-06, |
| "loss": 0.5155, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.30725462304409673, |
| "grad_norm": 0.4483453631401062, |
| "learning_rate": 9.867637326273991e-06, |
| "loss": 0.5308, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.30953058321479376, |
| "grad_norm": 0.22236040234565735, |
| "learning_rate": 9.866419294990726e-06, |
| "loss": 0.5352, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.31180654338549074, |
| "grad_norm": 1.0273557901382446, |
| "learning_rate": 9.865198354783072e-06, |
| "loss": 0.4747, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.3140825035561878, |
| "grad_norm": 0.7094826102256775, |
| "learning_rate": 9.863974495217856e-06, |
| "loss": 0.5033, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.31635846372688475, |
| "grad_norm": 0.5095308423042297, |
| "learning_rate": 9.862747705811943e-06, |
| "loss": 0.4751, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.3186344238975818, |
| "grad_norm": 0.33470073342323303, |
| "learning_rate": 9.861517976031957e-06, |
| "loss": 0.5321, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3209103840682788, |
| "grad_norm": 0.7143406271934509, |
| "learning_rate": 9.860285295293963e-06, |
| "loss": 0.488, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.3231863442389758, |
| "grad_norm": 139.33291625976562, |
| "learning_rate": 9.859049652963161e-06, |
| "loss": 0.4704, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.32546230440967283, |
| "grad_norm": 0.26354745030403137, |
| "learning_rate": 9.857811038353604e-06, |
| "loss": 0.5144, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.32773826458036986, |
| "grad_norm": 0.6331800222396851, |
| "learning_rate": 9.856569440727856e-06, |
| "loss": 0.5087, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.33001422475106684, |
| "grad_norm": 0.9244054555892944, |
| "learning_rate": 9.855324849296718e-06, |
| "loss": 0.5058, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3322901849217639, |
| "grad_norm": 0.6245642304420471, |
| "learning_rate": 9.854077253218885e-06, |
| "loss": 0.5242, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.3345661450924609, |
| "grad_norm": 0.8577654957771301, |
| "learning_rate": 9.852826641600645e-06, |
| "loss": 0.5137, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.3368421052631579, |
| "grad_norm": 0.31745514273643494, |
| "learning_rate": 9.851573003495565e-06, |
| "loss": 0.5121, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.3391180654338549, |
| "grad_norm": 0.5666072964668274, |
| "learning_rate": 9.850316327904162e-06, |
| "loss": 0.5225, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.3413940256045519, |
| "grad_norm": 0.2888381779193878, |
| "learning_rate": 9.849056603773586e-06, |
| "loss": 0.5223, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.34366998577524893, |
| "grad_norm": 0.5915186405181885, |
| "learning_rate": 9.847793819997302e-06, |
| "loss": 0.4806, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.34594594594594597, |
| "grad_norm": 0.7029264569282532, |
| "learning_rate": 9.846527965414753e-06, |
| "loss": 0.48, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.34822190611664294, |
| "grad_norm": 0.3648228347301483, |
| "learning_rate": 9.845259028811038e-06, |
| "loss": 0.5169, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.35049786628734, |
| "grad_norm": 1.2137341499328613, |
| "learning_rate": 9.843986998916577e-06, |
| "loss": 0.5193, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.352773826458037, |
| "grad_norm": 0.5907750129699707, |
| "learning_rate": 9.842711864406782e-06, |
| "loss": 0.5163, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.355049786628734, |
| "grad_norm": 0.3823224902153015, |
| "learning_rate": 9.841433613901712e-06, |
| "loss": 0.5104, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.357325746799431, |
| "grad_norm": 0.5324851870536804, |
| "learning_rate": 9.840152235965747e-06, |
| "loss": 0.5045, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.359601706970128, |
| "grad_norm": 3.104480743408203, |
| "learning_rate": 9.838867719107241e-06, |
| "loss": 0.5095, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.36187766714082503, |
| "grad_norm": 0.3194819986820221, |
| "learning_rate": 9.837580051778173e-06, |
| "loss": 0.4913, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.36415362731152207, |
| "grad_norm": 0.8034701943397522, |
| "learning_rate": 9.836289222373807e-06, |
| "loss": 0.4841, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.36642958748221904, |
| "grad_norm": 0.6767010688781738, |
| "learning_rate": 9.834995219232347e-06, |
| "loss": 0.4973, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.3687055476529161, |
| "grad_norm": 0.57259202003479, |
| "learning_rate": 9.833698030634574e-06, |
| "loss": 0.5208, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.3709815078236131, |
| "grad_norm": 2.552525520324707, |
| "learning_rate": 9.832397644803507e-06, |
| "loss": 0.4789, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.3732574679943101, |
| "grad_norm": 0.3038395941257477, |
| "learning_rate": 9.831094049904034e-06, |
| "loss": 0.5255, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.3755334281650071, |
| "grad_norm": 0.4763418436050415, |
| "learning_rate": 9.829787234042554e-06, |
| "loss": 0.5409, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.3778093883357041, |
| "grad_norm": 0.943462073802948, |
| "learning_rate": 9.828477185266632e-06, |
| "loss": 0.5115, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.38008534850640113, |
| "grad_norm": 0.3877524137496948, |
| "learning_rate": 9.827163891564609e-06, |
| "loss": 0.5204, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.38236130867709817, |
| "grad_norm": 1.1156424283981323, |
| "learning_rate": 9.825847340865254e-06, |
| "loss": 0.4881, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.38463726884779514, |
| "grad_norm": 0.5312694907188416, |
| "learning_rate": 9.824527521037385e-06, |
| "loss": 0.4566, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.3869132290184922, |
| "grad_norm": 0.3443738520145416, |
| "learning_rate": 9.823204419889503e-06, |
| "loss": 0.5117, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3891891891891892, |
| "grad_norm": 0.3024270832538605, |
| "learning_rate": 9.821878025169411e-06, |
| "loss": 0.5112, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.3914651493598862, |
| "grad_norm": 0.36176398396492004, |
| "learning_rate": 9.820548324563835e-06, |
| "loss": 0.5191, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.3937411095305832, |
| "grad_norm": 0.7123134136199951, |
| "learning_rate": 9.819215305698047e-06, |
| "loss": 0.4887, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.3960170697012802, |
| "grad_norm": 0.4487740695476532, |
| "learning_rate": 9.817878956135481e-06, |
| "loss": 0.5046, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.39829302987197723, |
| "grad_norm": 0.3469913601875305, |
| "learning_rate": 9.816539263377348e-06, |
| "loss": 0.5059, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.40056899004267427, |
| "grad_norm": 0.5372359156608582, |
| "learning_rate": 9.815196214862233e-06, |
| "loss": 0.4995, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.40056899004267427, |
| "eval_loss": 0.5013659000396729, |
| "eval_runtime": 152.7596, |
| "eval_samples_per_second": 3.273, |
| "eval_steps_per_second": 0.818, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.40284495021337124, |
| "grad_norm": 0.9638350009918213, |
| "learning_rate": 9.813849797965724e-06, |
| "loss": 0.5275, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.4051209103840683, |
| "grad_norm": 16.786623001098633, |
| "learning_rate": 9.8125e-06, |
| "loss": 0.5134, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.4073968705547653, |
| "grad_norm": 0.3690100908279419, |
| "learning_rate": 9.811146808213438e-06, |
| "loss": 0.5018, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.4096728307254623, |
| "grad_norm": 0.5200425982475281, |
| "learning_rate": 9.809790209790211e-06, |
| "loss": 0.4971, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4119487908961593, |
| "grad_norm": 0.5405089259147644, |
| "learning_rate": 9.808430191849882e-06, |
| "loss": 0.501, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.41422475106685636, |
| "grad_norm": 0.2915521264076233, |
| "learning_rate": 9.807066741447e-06, |
| "loss": 0.5194, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.41650071123755333, |
| "grad_norm": 0.6254512667655945, |
| "learning_rate": 9.805699845570686e-06, |
| "loss": 0.5021, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.41877667140825037, |
| "grad_norm": 0.6108738780021667, |
| "learning_rate": 9.804329491144224e-06, |
| "loss": 0.4807, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.42105263157894735, |
| "grad_norm": 0.451253741979599, |
| "learning_rate": 9.802955665024631e-06, |
| "loss": 0.4899, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4233285917496444, |
| "grad_norm": 0.5697038769721985, |
| "learning_rate": 9.801578354002258e-06, |
| "loss": 0.5312, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.4256045519203414, |
| "grad_norm": 0.4919067621231079, |
| "learning_rate": 9.800197544800338e-06, |
| "loss": 0.4737, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.4278805120910384, |
| "grad_norm": 0.2489043027162552, |
| "learning_rate": 9.7988132240746e-06, |
| "loss": 0.5331, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.4301564722617354, |
| "grad_norm": 0.3385188579559326, |
| "learning_rate": 9.79742537841279e-06, |
| "loss": 0.4948, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.43243243243243246, |
| "grad_norm": 0.8381849527359009, |
| "learning_rate": 9.79603399433428e-06, |
| "loss": 0.4929, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.43470839260312943, |
| "grad_norm": 0.625830888748169, |
| "learning_rate": 9.794639058289605e-06, |
| "loss": 0.4906, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.43698435277382647, |
| "grad_norm": 0.28946149349212646, |
| "learning_rate": 9.793240556660042e-06, |
| "loss": 0.5238, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.43926031294452345, |
| "grad_norm": 0.4598257839679718, |
| "learning_rate": 9.791838475757147e-06, |
| "loss": 0.4935, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.4415362731152205, |
| "grad_norm": 0.6791567802429199, |
| "learning_rate": 9.790432801822324e-06, |
| "loss": 0.4764, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.4438122332859175, |
| "grad_norm": 0.5078298449516296, |
| "learning_rate": 9.789023521026373e-06, |
| "loss": 0.5108, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4460881934566145, |
| "grad_norm": 0.6540849208831787, |
| "learning_rate": 9.787610619469028e-06, |
| "loss": 0.5228, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.4483641536273115, |
| "grad_norm": 0.31652045249938965, |
| "learning_rate": 9.786194083178507e-06, |
| "loss": 0.5159, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.45064011379800856, |
| "grad_norm": 0.7523598074913025, |
| "learning_rate": 9.78477389811105e-06, |
| "loss": 0.5, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.45291607396870553, |
| "grad_norm": 0.901971697807312, |
| "learning_rate": 9.783350050150453e-06, |
| "loss": 0.4988, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.45519203413940257, |
| "grad_norm": 0.271677166223526, |
| "learning_rate": 9.781922525107605e-06, |
| "loss": 0.5364, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.45746799431009955, |
| "grad_norm": 0.6052701473236084, |
| "learning_rate": 9.780491308720012e-06, |
| "loss": 0.48, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.4597439544807966, |
| "grad_norm": 0.6410478949546814, |
| "learning_rate": 9.779056386651325e-06, |
| "loss": 0.5074, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.4620199146514936, |
| "grad_norm": 0.4447648525238037, |
| "learning_rate": 9.777617744490855e-06, |
| "loss": 0.5016, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.4642958748221906, |
| "grad_norm": 0.5336310267448425, |
| "learning_rate": 9.776175367753103e-06, |
| "loss": 0.5156, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.4665718349928876, |
| "grad_norm": 0.37467631697654724, |
| "learning_rate": 9.774729241877257e-06, |
| "loss": 0.5054, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.46884779516358466, |
| "grad_norm": 0.6465873122215271, |
| "learning_rate": 9.77327935222672e-06, |
| "loss": 0.5257, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.47112375533428164, |
| "grad_norm": 0.78228360414505, |
| "learning_rate": 9.771825684088608e-06, |
| "loss": 0.4856, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.47339971550497867, |
| "grad_norm": 0.5945775508880615, |
| "learning_rate": 9.77036822267324e-06, |
| "loss": 0.5155, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.4756756756756757, |
| "grad_norm": 0.7464637756347656, |
| "learning_rate": 9.768906953113662e-06, |
| "loss": 0.4792, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.4779516358463727, |
| "grad_norm": 0.4406684935092926, |
| "learning_rate": 9.767441860465117e-06, |
| "loss": 0.4761, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.4802275960170697, |
| "grad_norm": 0.3112039268016815, |
| "learning_rate": 9.765972929704557e-06, |
| "loss": 0.5142, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.4825035561877667, |
| "grad_norm": 0.7738893032073975, |
| "learning_rate": 9.764500145730109e-06, |
| "loss": 0.4874, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.4847795163584637, |
| "grad_norm": 0.6051543354988098, |
| "learning_rate": 9.763023493360573e-06, |
| "loss": 0.4699, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.48705547652916076, |
| "grad_norm": 0.3619549870491028, |
| "learning_rate": 9.761542957334893e-06, |
| "loss": 0.5096, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.48933143669985774, |
| "grad_norm": 0.23817306756973267, |
| "learning_rate": 9.760058522311634e-06, |
| "loss": 0.5148, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.49160739687055477, |
| "grad_norm": 0.714975118637085, |
| "learning_rate": 9.758570172868447e-06, |
| "loss": 0.5141, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.4938833570412518, |
| "grad_norm": 1.043235182762146, |
| "learning_rate": 9.757077893501543e-06, |
| "loss": 0.5073, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.4961593172119488, |
| "grad_norm": 0.6188557744026184, |
| "learning_rate": 9.755581668625147e-06, |
| "loss": 0.5243, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.4984352773826458, |
| "grad_norm": 0.8673965930938721, |
| "learning_rate": 9.754081482570967e-06, |
| "loss": 0.5065, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.5007112375533428, |
| "grad_norm": 0.5338215231895447, |
| "learning_rate": 9.75257731958763e-06, |
| "loss": 0.5002, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5029871977240399, |
| "grad_norm": 0.40506306290626526, |
| "learning_rate": 9.751069163840144e-06, |
| "loss": 0.479, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.5052631578947369, |
| "grad_norm": 0.34282737970352173, |
| "learning_rate": 9.749556999409333e-06, |
| "loss": 0.5168, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.5075391180654338, |
| "grad_norm": 0.5617157220840454, |
| "learning_rate": 9.748040810291292e-06, |
| "loss": 0.4767, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.5098150782361308, |
| "grad_norm": 1.5562076568603516, |
| "learning_rate": 9.746520580396803e-06, |
| "loss": 0.4985, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.5120910384068279, |
| "grad_norm": 1.0099520683288574, |
| "learning_rate": 9.744996293550779e-06, |
| "loss": 0.4886, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5143669985775249, |
| "grad_norm": 0.421151340007782, |
| "learning_rate": 9.743467933491686e-06, |
| "loss": 0.4765, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.5166429587482219, |
| "grad_norm": 0.4360538423061371, |
| "learning_rate": 9.741935483870967e-06, |
| "loss": 0.5122, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.518918918918919, |
| "grad_norm": 0.41732823848724365, |
| "learning_rate": 9.740398928252457e-06, |
| "loss": 0.5237, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.5211948790896159, |
| "grad_norm": 0.34465718269348145, |
| "learning_rate": 9.738858250111791e-06, |
| "loss": 0.4984, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.5234708392603129, |
| "grad_norm": 0.46500489115715027, |
| "learning_rate": 9.737313432835823e-06, |
| "loss": 0.4946, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.52574679943101, |
| "grad_norm": 1.010578989982605, |
| "learning_rate": 9.735764459722017e-06, |
| "loss": 0.5233, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.528022759601707, |
| "grad_norm": 334.45111083984375, |
| "learning_rate": 9.734211313977852e-06, |
| "loss": 0.5287, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.530298719772404, |
| "grad_norm": 0.3792816698551178, |
| "learning_rate": 9.732653978720217e-06, |
| "loss": 0.4932, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.532574679943101, |
| "grad_norm": 0.23106412589550018, |
| "learning_rate": 9.731092436974791e-06, |
| "loss": 0.4945, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.534850640113798, |
| "grad_norm": 1.2293035984039307, |
| "learning_rate": 9.729526671675433e-06, |
| "loss": 0.4715, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.537126600284495, |
| "grad_norm": 0.6926982402801514, |
| "learning_rate": 9.727956665663558e-06, |
| "loss": 0.518, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.5394025604551921, |
| "grad_norm": 0.4290272295475006, |
| "learning_rate": 9.726382401687511e-06, |
| "loss": 0.4721, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.5416785206258891, |
| "grad_norm": 0.3275747001171112, |
| "learning_rate": 9.724803862401932e-06, |
| "loss": 0.4889, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.543954480796586, |
| "grad_norm": 0.2576768398284912, |
| "learning_rate": 9.723221030367125e-06, |
| "loss": 0.5028, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.5462304409672831, |
| "grad_norm": 0.2659994661808014, |
| "learning_rate": 9.721633888048413e-06, |
| "loss": 0.5081, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5485064011379801, |
| "grad_norm": 0.2829638123512268, |
| "learning_rate": 9.720042417815484e-06, |
| "loss": 0.4911, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.5507823613086771, |
| "grad_norm": 0.6670094132423401, |
| "learning_rate": 9.718446601941749e-06, |
| "loss": 0.4972, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.5530583214793741, |
| "grad_norm": 0.42905673384666443, |
| "learning_rate": 9.716846422603678e-06, |
| "loss": 0.5438, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.5553342816500711, |
| "grad_norm": 0.45875513553619385, |
| "learning_rate": 9.715241861880134e-06, |
| "loss": 0.5042, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.5576102418207681, |
| "grad_norm": 0.3114938735961914, |
| "learning_rate": 9.713632901751715e-06, |
| "loss": 0.484, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5598862019914651, |
| "grad_norm": 0.7309391498565674, |
| "learning_rate": 9.712019524100062e-06, |
| "loss": 0.4985, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.5621621621621622, |
| "grad_norm": 0.27174195647239685, |
| "learning_rate": 9.710401710707196e-06, |
| "loss": 0.5134, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.5644381223328592, |
| "grad_norm": 0.5611736178398132, |
| "learning_rate": 9.708779443254818e-06, |
| "loss": 0.5066, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.5667140825035561, |
| "grad_norm": 0.4500977396965027, |
| "learning_rate": 9.707152703323634e-06, |
| "loss": 0.5163, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.5689900426742532, |
| "grad_norm": 0.6716147661209106, |
| "learning_rate": 9.705521472392638e-06, |
| "loss": 0.508, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5712660028449502, |
| "grad_norm": 0.4123668372631073, |
| "learning_rate": 9.70388573183843e-06, |
| "loss": 0.5153, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.5735419630156472, |
| "grad_norm": 0.36963313817977905, |
| "learning_rate": 9.702245462934483e-06, |
| "loss": 0.5061, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.5758179231863443, |
| "grad_norm": 0.2945839762687683, |
| "learning_rate": 9.700600646850456e-06, |
| "loss": 0.5253, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.5780938833570413, |
| "grad_norm": 0.7066704630851746, |
| "learning_rate": 9.69895126465145e-06, |
| "loss": 0.5081, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.5803698435277382, |
| "grad_norm": 0.31411829590797424, |
| "learning_rate": 9.697297297297299e-06, |
| "loss": 0.5079, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.5826458036984353, |
| "grad_norm": 0.607753336429596, |
| "learning_rate": 9.695638725641821e-06, |
| "loss": 0.5061, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.5849217638691323, |
| "grad_norm": 0.2587028443813324, |
| "learning_rate": 9.69397553043209e-06, |
| "loss": 0.4954, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.5871977240398293, |
| "grad_norm": 0.6263294816017151, |
| "learning_rate": 9.692307692307695e-06, |
| "loss": 0.5525, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.5894736842105263, |
| "grad_norm": 0.3169964849948883, |
| "learning_rate": 9.690635191799971e-06, |
| "loss": 0.5183, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.5917496443812233, |
| "grad_norm": 0.6329559087753296, |
| "learning_rate": 9.688958009331262e-06, |
| "loss": 0.4901, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5940256045519203, |
| "grad_norm": 0.34771379828453064, |
| "learning_rate": 9.687276125214142e-06, |
| "loss": 0.4956, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.5963015647226173, |
| "grad_norm": 0.5889184474945068, |
| "learning_rate": 9.685589519650655e-06, |
| "loss": 0.5509, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.5985775248933144, |
| "grad_norm": 0.26520290970802307, |
| "learning_rate": 9.683898172731534e-06, |
| "loss": 0.4983, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.6008534850640114, |
| "grad_norm": 0.4058733582496643, |
| "learning_rate": 9.682202064435408e-06, |
| "loss": 0.4959, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.6008534850640114, |
| "eval_loss": 0.49948057532310486, |
| "eval_runtime": 152.8126, |
| "eval_samples_per_second": 3.272, |
| "eval_steps_per_second": 0.818, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.6031294452347084, |
| "grad_norm": 0.37097811698913574, |
| "learning_rate": 9.680501174628037e-06, |
| "loss": 0.518, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.6054054054054054, |
| "grad_norm": 0.4192047715187073, |
| "learning_rate": 9.678795483061483e-06, |
| "loss": 0.5092, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.6076813655761024, |
| "grad_norm": 0.5240965485572815, |
| "learning_rate": 9.677084969373333e-06, |
| "loss": 0.5122, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.6099573257467994, |
| "grad_norm": 0.3636751174926758, |
| "learning_rate": 9.675369613085877e-06, |
| "loss": 0.503, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.6122332859174965, |
| "grad_norm": 0.6384652256965637, |
| "learning_rate": 9.673649393605293e-06, |
| "loss": 0.4904, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.6145092460881935, |
| "grad_norm": 0.2554349899291992, |
| "learning_rate": 9.67192429022082e-06, |
| "loss": 0.5118, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6167852062588904, |
| "grad_norm": 0.4561486840248108, |
| "learning_rate": 9.670194282103933e-06, |
| "loss": 0.5055, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.6190611664295875, |
| "grad_norm": 0.25888338685035706, |
| "learning_rate": 9.6684593483075e-06, |
| "loss": 0.4948, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.6213371266002845, |
| "grad_norm": 0.7615610361099243, |
| "learning_rate": 9.666719467764931e-06, |
| "loss": 0.5219, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.6236130867709815, |
| "grad_norm": 0.8310565948486328, |
| "learning_rate": 9.66497461928934e-06, |
| "loss": 0.4964, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.6258890469416786, |
| "grad_norm": 0.9119910001754761, |
| "learning_rate": 9.663224781572677e-06, |
| "loss": 0.5017, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.6281650071123756, |
| "grad_norm": 0.41001102328300476, |
| "learning_rate": 9.661469933184857e-06, |
| "loss": 0.5216, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.6304409672830725, |
| "grad_norm": 0.9293547868728638, |
| "learning_rate": 9.659710052572886e-06, |
| "loss": 0.5417, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.6327169274537695, |
| "grad_norm": 0.2771151065826416, |
| "learning_rate": 9.65794511805999e-06, |
| "loss": 0.5236, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.6349928876244666, |
| "grad_norm": 0.6640573143959045, |
| "learning_rate": 9.656175107844705e-06, |
| "loss": 0.5151, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.6372688477951636, |
| "grad_norm": 0.2822358310222626, |
| "learning_rate": 9.654400000000002e-06, |
| "loss": 0.5025, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6395448079658606, |
| "grad_norm": 0.45471474528312683, |
| "learning_rate": 9.652619772472362e-06, |
| "loss": 0.5364, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.6418207681365576, |
| "grad_norm": 0.6550914645195007, |
| "learning_rate": 9.650834403080876e-06, |
| "loss": 0.4794, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.6440967283072546, |
| "grad_norm": 0.4125944674015045, |
| "learning_rate": 9.649043869516313e-06, |
| "loss": 0.4805, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.6463726884779516, |
| "grad_norm": 0.4010949730873108, |
| "learning_rate": 9.6472481493402e-06, |
| "loss": 0.4967, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.6486486486486487, |
| "grad_norm": 0.3277333378791809, |
| "learning_rate": 9.645447219983885e-06, |
| "loss": 0.5088, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6509246088193457, |
| "grad_norm": 0.36314937472343445, |
| "learning_rate": 9.643641058747582e-06, |
| "loss": 0.4965, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.6532005689900426, |
| "grad_norm": 0.6457196474075317, |
| "learning_rate": 9.641829642799418e-06, |
| "loss": 0.5169, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.6554765291607397, |
| "grad_norm": 0.33768969774246216, |
| "learning_rate": 9.64001294917449e-06, |
| "loss": 0.4931, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.6577524893314367, |
| "grad_norm": 0.4165545701980591, |
| "learning_rate": 9.638190954773872e-06, |
| "loss": 0.4747, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.6600284495021337, |
| "grad_norm": 0.34501275420188904, |
| "learning_rate": 9.636363636363638e-06, |
| "loss": 0.5317, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6623044096728308, |
| "grad_norm": 1.3481067419052124, |
| "learning_rate": 9.634530970573893e-06, |
| "loss": 0.5159, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.6645803698435278, |
| "grad_norm": 0.5084125399589539, |
| "learning_rate": 9.632692933897753e-06, |
| "loss": 0.5209, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.6668563300142247, |
| "grad_norm": 0.6355491876602173, |
| "learning_rate": 9.630849502690364e-06, |
| "loss": 0.4935, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.6691322901849218, |
| "grad_norm": 0.28465208411216736, |
| "learning_rate": 9.629000653167865e-06, |
| "loss": 0.5164, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.6714082503556188, |
| "grad_norm": 0.2439097911119461, |
| "learning_rate": 9.627146361406379e-06, |
| "loss": 0.5239, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.6736842105263158, |
| "grad_norm": 0.6047483682632446, |
| "learning_rate": 9.625286603340978e-06, |
| "loss": 0.4787, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.6759601706970128, |
| "grad_norm": 1.615376591682434, |
| "learning_rate": 9.62342135476464e-06, |
| "loss": 0.4881, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.6782361308677098, |
| "grad_norm": 0.2726902365684509, |
| "learning_rate": 9.621550591327202e-06, |
| "loss": 0.491, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.6805120910384068, |
| "grad_norm": 0.47915151715278625, |
| "learning_rate": 9.6196742885343e-06, |
| "loss": 0.4945, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.6827880512091038, |
| "grad_norm": 1.6578127145767212, |
| "learning_rate": 9.617792421746294e-06, |
| "loss": 0.5115, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6850640113798009, |
| "grad_norm": 0.33565983176231384, |
| "learning_rate": 9.6159049661772e-06, |
| "loss": 0.5, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.6873399715504979, |
| "grad_norm": 0.9541482925415039, |
| "learning_rate": 9.61401189689359e-06, |
| "loss": 0.5161, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.6896159317211948, |
| "grad_norm": 0.3061958849430084, |
| "learning_rate": 9.612113188813506e-06, |
| "loss": 0.4994, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.6918918918918919, |
| "grad_norm": 0.4669720232486725, |
| "learning_rate": 9.610208816705338e-06, |
| "loss": 0.4806, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.6941678520625889, |
| "grad_norm": 0.5754396319389343, |
| "learning_rate": 9.608298755186722e-06, |
| "loss": 0.4944, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.6964438122332859, |
| "grad_norm": 0.28227463364601135, |
| "learning_rate": 9.606382978723405e-06, |
| "loss": 0.4856, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.698719772403983, |
| "grad_norm": 0.7921618223190308, |
| "learning_rate": 9.604461461628104e-06, |
| "loss": 0.4944, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.70099573257468, |
| "grad_norm": 0.3263791799545288, |
| "learning_rate": 9.602534178059356e-06, |
| "loss": 0.4855, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.7032716927453769, |
| "grad_norm": 0.5060755610466003, |
| "learning_rate": 9.60060110202037e-06, |
| "loss": 0.5183, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.705547652916074, |
| "grad_norm": 0.5023918747901917, |
| "learning_rate": 9.598662207357861e-06, |
| "loss": 0.495, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.707823613086771, |
| "grad_norm": 1.5953493118286133, |
| "learning_rate": 9.596717467760846e-06, |
| "loss": 0.4982, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.710099573257468, |
| "grad_norm": 0.5165386199951172, |
| "learning_rate": 9.59476685675948e-06, |
| "loss": 0.4849, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.712375533428165, |
| "grad_norm": 0.5814241766929626, |
| "learning_rate": 9.592810347723838e-06, |
| "loss": 0.5162, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.714651493598862, |
| "grad_norm": 0.8432161808013916, |
| "learning_rate": 9.59084791386272e-06, |
| "loss": 0.5477, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.716927453769559, |
| "grad_norm": 0.22218360006809235, |
| "learning_rate": 9.58887952822241e-06, |
| "loss": 0.4965, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.719203413940256, |
| "grad_norm": 0.5952489376068115, |
| "learning_rate": 9.586905163685456e-06, |
| "loss": 0.4746, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.7214793741109531, |
| "grad_norm": 0.6398565173149109, |
| "learning_rate": 9.584924792969412e-06, |
| "loss": 0.5047, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.7237553342816501, |
| "grad_norm": 0.8832747936248779, |
| "learning_rate": 9.582938388625594e-06, |
| "loss": 0.5256, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.726031294452347, |
| "grad_norm": 0.4012426435947418, |
| "learning_rate": 9.580945923037805e-06, |
| "loss": 0.4712, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.7283072546230441, |
| "grad_norm": 1.300533413887024, |
| "learning_rate": 9.578947368421054e-06, |
| "loss": 0.5119, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7305832147937411, |
| "grad_norm": 0.2682804465293884, |
| "learning_rate": 9.576942696820271e-06, |
| "loss": 0.4571, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.7328591749644381, |
| "grad_norm": 0.8085079193115234, |
| "learning_rate": 9.574931880108993e-06, |
| "loss": 0.4847, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.7351351351351352, |
| "grad_norm": 0.4819945693016052, |
| "learning_rate": 9.572914889988063e-06, |
| "loss": 0.4987, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.7374110953058322, |
| "grad_norm": 0.40524768829345703, |
| "learning_rate": 9.570891697984285e-06, |
| "loss": 0.5204, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.7396870554765291, |
| "grad_norm": 0.2749171853065491, |
| "learning_rate": 9.568862275449102e-06, |
| "loss": 0.5035, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.7419630156472262, |
| "grad_norm": 0.7831512093544006, |
| "learning_rate": 9.566826593557232e-06, |
| "loss": 0.5235, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.7442389758179232, |
| "grad_norm": 0.9686456918716431, |
| "learning_rate": 9.564784623305303e-06, |
| "loss": 0.5206, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.7465149359886202, |
| "grad_norm": 0.517684280872345, |
| "learning_rate": 9.562736335510485e-06, |
| "loss": 0.5109, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.7487908961593173, |
| "grad_norm": 0.6049047708511353, |
| "learning_rate": 9.560681700809092e-06, |
| "loss": 0.5077, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.7510668563300142, |
| "grad_norm": 0.7499035000801086, |
| "learning_rate": 9.558620689655174e-06, |
| "loss": 0.5291, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7533428165007112, |
| "grad_norm": 0.7392110824584961, |
| "learning_rate": 9.556553272319118e-06, |
| "loss": 0.4963, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.7556187766714082, |
| "grad_norm": 0.7733691930770874, |
| "learning_rate": 9.5544794188862e-06, |
| "loss": 0.4758, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.7578947368421053, |
| "grad_norm": 0.28524699807167053, |
| "learning_rate": 9.552399099255154e-06, |
| "loss": 0.5136, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.7601706970128023, |
| "grad_norm": 0.29482683539390564, |
| "learning_rate": 9.550312283136714e-06, |
| "loss": 0.5285, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.7624466571834992, |
| "grad_norm": 0.6638491749763489, |
| "learning_rate": 9.548218940052129e-06, |
| "loss": 0.4822, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.7647226173541963, |
| "grad_norm": 0.38611069321632385, |
| "learning_rate": 9.54611903933171e-06, |
| "loss": 0.4557, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.7669985775248933, |
| "grad_norm": 0.33370041847229004, |
| "learning_rate": 9.5440125501133e-06, |
| "loss": 0.5066, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.7692745376955903, |
| "grad_norm": 3.7957630157470703, |
| "learning_rate": 9.541899441340783e-06, |
| "loss": 0.4926, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.7715504978662874, |
| "grad_norm": 0.856934666633606, |
| "learning_rate": 9.539779681762547e-06, |
| "loss": 0.5026, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.7738264580369844, |
| "grad_norm": 0.44400107860565186, |
| "learning_rate": 9.537653239929948e-06, |
| "loss": 0.5118, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7761024182076813, |
| "grad_norm": 0.2703458368778229, |
| "learning_rate": 9.535520084195758e-06, |
| "loss": 0.4849, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.7783783783783784, |
| "grad_norm": 0.913375198841095, |
| "learning_rate": 9.533380182712582e-06, |
| "loss": 0.5025, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.7806543385490754, |
| "grad_norm": 0.28155040740966797, |
| "learning_rate": 9.531233503431288e-06, |
| "loss": 0.5364, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.7829302987197724, |
| "grad_norm": 1.5114960670471191, |
| "learning_rate": 9.5290800140994e-06, |
| "loss": 0.5126, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.7852062588904695, |
| "grad_norm": 0.7077274918556213, |
| "learning_rate": 9.526919682259488e-06, |
| "loss": 0.5075, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.7874822190611664, |
| "grad_norm": 0.33342838287353516, |
| "learning_rate": 9.524752475247528e-06, |
| "loss": 0.5347, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.7897581792318634, |
| "grad_norm": 0.28337982296943665, |
| "learning_rate": 9.522578360191253e-06, |
| "loss": 0.4905, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.7920341394025604, |
| "grad_norm": 0.5180854201316833, |
| "learning_rate": 9.520397304008515e-06, |
| "loss": 0.5119, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.7943100995732575, |
| "grad_norm": 0.38203689455986023, |
| "learning_rate": 9.518209273405579e-06, |
| "loss": 0.5105, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.7965860597439545, |
| "grad_norm": 0.3038167357444763, |
| "learning_rate": 9.516014234875446e-06, |
| "loss": 0.5079, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7988620199146514, |
| "grad_norm": 2.371117353439331, |
| "learning_rate": 9.513812154696134e-06, |
| "loss": 0.4699, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.8011379800853485, |
| "grad_norm": 0.2708576023578644, |
| "learning_rate": 9.511602998928954e-06, |
| "loss": 0.4622, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.8011379800853485, |
| "eval_loss": 0.49841922521591187, |
| "eval_runtime": 152.7813, |
| "eval_samples_per_second": 3.273, |
| "eval_steps_per_second": 0.818, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.8034139402560455, |
| "grad_norm": 1.0353049039840698, |
| "learning_rate": 9.509386733416772e-06, |
| "loss": 0.4996, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.8056899004267425, |
| "grad_norm": 1.001231074333191, |
| "learning_rate": 9.507163323782235e-06, |
| "loss": 0.5168, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.8079658605974396, |
| "grad_norm": 0.7230463027954102, |
| "learning_rate": 9.504932735426011e-06, |
| "loss": 0.5141, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.8102418207681366, |
| "grad_norm": 0.4161858856678009, |
| "learning_rate": 9.502694933524974e-06, |
| "loss": 0.49, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.8125177809388335, |
| "grad_norm": 0.22768855094909668, |
| "learning_rate": 9.500449883030413e-06, |
| "loss": 0.4949, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.8147937411095306, |
| "grad_norm": 0.28206667304039, |
| "learning_rate": 9.498197548666186e-06, |
| "loss": 0.4976, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.8170697012802276, |
| "grad_norm": 0.4161596894264221, |
| "learning_rate": 9.495937894926883e-06, |
| "loss": 0.4616, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.8193456614509246, |
| "grad_norm": 0.3083823323249817, |
| "learning_rate": 9.493670886075952e-06, |
| "loss": 0.5063, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8216216216216217, |
| "grad_norm": 0.27027612924575806, |
| "learning_rate": 9.491396486143816e-06, |
| "loss": 0.5188, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.8238975817923186, |
| "grad_norm": 0.2469726949930191, |
| "learning_rate": 9.48911465892598e-06, |
| "loss": 0.4866, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.8261735419630156, |
| "grad_norm": 0.8711740374565125, |
| "learning_rate": 9.486825367981103e-06, |
| "loss": 0.4987, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.8284495021337127, |
| "grad_norm": 0.3169425129890442, |
| "learning_rate": 9.484528576629051e-06, |
| "loss": 0.4681, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.8307254623044097, |
| "grad_norm": 0.5043453574180603, |
| "learning_rate": 9.482224247948954e-06, |
| "loss": 0.4931, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.8330014224751067, |
| "grad_norm": 0.5922644138336182, |
| "learning_rate": 9.47991234477721e-06, |
| "loss": 0.4958, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.8352773826458036, |
| "grad_norm": 0.5115863680839539, |
| "learning_rate": 9.477592829705506e-06, |
| "loss": 0.4781, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.8375533428165007, |
| "grad_norm": 0.36144527792930603, |
| "learning_rate": 9.475265665078786e-06, |
| "loss": 0.4835, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.8398293029871977, |
| "grad_norm": 0.31710296869277954, |
| "learning_rate": 9.472930812993211e-06, |
| "loss": 0.456, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 0.3835445046424866, |
| "learning_rate": 9.470588235294119e-06, |
| "loss": 0.5027, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8443812233285918, |
| "grad_norm": 0.3568181097507477, |
| "learning_rate": 9.468237893573928e-06, |
| "loss": 0.5016, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.8466571834992888, |
| "grad_norm": 0.3614184260368347, |
| "learning_rate": 9.465879749170048e-06, |
| "loss": 0.5197, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.8489331436699857, |
| "grad_norm": 0.7706065773963928, |
| "learning_rate": 9.463513763162758e-06, |
| "loss": 0.5188, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.8512091038406828, |
| "grad_norm": 0.23581719398498535, |
| "learning_rate": 9.46113989637306e-06, |
| "loss": 0.476, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.8534850640113798, |
| "grad_norm": 0.29790443181991577, |
| "learning_rate": 9.45875810936052e-06, |
| "loss": 0.5012, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.8557610241820768, |
| "grad_norm": 0.651655375957489, |
| "learning_rate": 9.456368362421094e-06, |
| "loss": 0.5034, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.8580369843527739, |
| "grad_norm": 0.5256913304328918, |
| "learning_rate": 9.4539706155849e-06, |
| "loss": 0.4821, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.8603129445234708, |
| "grad_norm": 0.9587541222572327, |
| "learning_rate": 9.45156482861401e-06, |
| "loss": 0.5496, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.8625889046941678, |
| "grad_norm": 0.7224308252334595, |
| "learning_rate": 9.44915096100019e-06, |
| "loss": 0.5253, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.8648648648648649, |
| "grad_norm": 0.4760981798171997, |
| "learning_rate": 9.446728971962618e-06, |
| "loss": 0.5171, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8671408250355619, |
| "grad_norm": 0.3734980523586273, |
| "learning_rate": 9.444298820445613e-06, |
| "loss": 0.5173, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.8694167852062589, |
| "grad_norm": 0.23941481113433838, |
| "learning_rate": 9.441860465116282e-06, |
| "loss": 0.4919, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.871692745376956, |
| "grad_norm": 0.7737832069396973, |
| "learning_rate": 9.439413864362203e-06, |
| "loss": 0.4913, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.8739687055476529, |
| "grad_norm": 2.308584213256836, |
| "learning_rate": 9.436958976289048e-06, |
| "loss": 0.4884, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.8762446657183499, |
| "grad_norm": 0.2623022794723511, |
| "learning_rate": 9.434495758718193e-06, |
| "loss": 0.4841, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.8785206258890469, |
| "grad_norm": 0.4232858717441559, |
| "learning_rate": 9.432024169184294e-06, |
| "loss": 0.5502, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.880796586059744, |
| "grad_norm": 0.3215095102787018, |
| "learning_rate": 9.429544164932855e-06, |
| "loss": 0.5164, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.883072546230441, |
| "grad_norm": 0.2719275653362274, |
| "learning_rate": 9.427055702917774e-06, |
| "loss": 0.5136, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.8853485064011379, |
| "grad_norm": 0.4348866641521454, |
| "learning_rate": 9.424558739798823e-06, |
| "loss": 0.4968, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.887624466571835, |
| "grad_norm": 0.23700681328773499, |
| "learning_rate": 9.422053231939165e-06, |
| "loss": 0.5154, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.889900426742532, |
| "grad_norm": 0.6731361746788025, |
| "learning_rate": 9.419539135402782e-06, |
| "loss": 0.5035, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.892176386913229, |
| "grad_norm": 0.37062525749206543, |
| "learning_rate": 9.41701640595193e-06, |
| "loss": 0.4937, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.8944523470839261, |
| "grad_norm": 0.3574432134628296, |
| "learning_rate": 9.414484999044527e-06, |
| "loss": 0.4918, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.896728307254623, |
| "grad_norm": 0.6685652732849121, |
| "learning_rate": 9.411944869831548e-06, |
| "loss": 0.471, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.89900426742532, |
| "grad_norm": 0.31523770093917847, |
| "learning_rate": 9.409395973154366e-06, |
| "loss": 0.5188, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.9012802275960171, |
| "grad_norm": 0.7761457562446594, |
| "learning_rate": 9.406838263542067e-06, |
| "loss": 0.5241, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.9035561877667141, |
| "grad_norm": 0.6903082728385925, |
| "learning_rate": 9.404271695208776e-06, |
| "loss": 0.4898, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.9058321479374111, |
| "grad_norm": 0.46466270089149475, |
| "learning_rate": 9.401696222050887e-06, |
| "loss": 0.5046, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.9081081081081082, |
| "grad_norm": 0.6711629033088684, |
| "learning_rate": 9.399111797644333e-06, |
| "loss": 0.4839, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.9103840682788051, |
| "grad_norm": 4.9447221755981445, |
| "learning_rate": 9.396518375241779e-06, |
| "loss": 0.5149, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9126600284495021, |
| "grad_norm": 0.29814621806144714, |
| "learning_rate": 9.393915907769813e-06, |
| "loss": 0.4683, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.9149359886201991, |
| "grad_norm": 0.4514840245246887, |
| "learning_rate": 9.39130434782609e-06, |
| "loss": 0.5132, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.9172119487908962, |
| "grad_norm": 0.35264846682548523, |
| "learning_rate": 9.388683647676456e-06, |
| "loss": 0.4812, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.9194879089615932, |
| "grad_norm": 0.5081570744514465, |
| "learning_rate": 9.386053759252044e-06, |
| "loss": 0.4965, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.9217638691322901, |
| "grad_norm": 0.4127364158630371, |
| "learning_rate": 9.383414634146341e-06, |
| "loss": 0.5217, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.9240398293029872, |
| "grad_norm": 0.8967196941375732, |
| "learning_rate": 9.380766223612199e-06, |
| "loss": 0.4968, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.9263157894736842, |
| "grad_norm": 0.5040143728256226, |
| "learning_rate": 9.378108478558842e-06, |
| "loss": 0.5163, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.9285917496443812, |
| "grad_norm": 0.5223595499992371, |
| "learning_rate": 9.375441349548844e-06, |
| "loss": 0.476, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.9308677098150783, |
| "grad_norm": 0.8500305414199829, |
| "learning_rate": 9.37276478679505e-06, |
| "loss": 0.4958, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.9331436699857752, |
| "grad_norm": 0.35249829292297363, |
| "learning_rate": 9.37007874015748e-06, |
| "loss": 0.4703, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9354196301564722, |
| "grad_norm": 0.3008180558681488, |
| "learning_rate": 9.367383159140211e-06, |
| "loss": 0.4894, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.9376955903271693, |
| "grad_norm": 0.3532486855983734, |
| "learning_rate": 9.36467799288819e-06, |
| "loss": 0.4808, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.9399715504978663, |
| "grad_norm": 0.4361303746700287, |
| "learning_rate": 9.361963190184048e-06, |
| "loss": 0.4882, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.9422475106685633, |
| "grad_norm": 0.3767416477203369, |
| "learning_rate": 9.359238699444885e-06, |
| "loss": 0.5159, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.9445234708392604, |
| "grad_norm": 0.29309436678886414, |
| "learning_rate": 9.356504468718968e-06, |
| "loss": 0.5194, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.9467994310099573, |
| "grad_norm": 0.3375232517719269, |
| "learning_rate": 9.35376044568245e-06, |
| "loss": 0.4884, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.9490753911806543, |
| "grad_norm": 0.32881152629852295, |
| "learning_rate": 9.351006577636037e-06, |
| "loss": 0.4985, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.9513513513513514, |
| "grad_norm": 0.4487358033657074, |
| "learning_rate": 9.348242811501597e-06, |
| "loss": 0.4995, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.9536273115220484, |
| "grad_norm": 0.2813623249530792, |
| "learning_rate": 9.345469093818764e-06, |
| "loss": 0.4929, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.9559032716927454, |
| "grad_norm": 0.3134421408176422, |
| "learning_rate": 9.342685370741485e-06, |
| "loss": 0.5081, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9581792318634423, |
| "grad_norm": 0.498050719499588, |
| "learning_rate": 9.339891588034533e-06, |
| "loss": 0.5307, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.9604551920341394, |
| "grad_norm": 0.5057858228683472, |
| "learning_rate": 9.337087691069992e-06, |
| "loss": 0.4764, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.9627311522048364, |
| "grad_norm": 0.40491783618927, |
| "learning_rate": 9.334273624823695e-06, |
| "loss": 0.4782, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.9650071123755334, |
| "grad_norm": 0.22541995346546173, |
| "learning_rate": 9.33144933387162e-06, |
| "loss": 0.4866, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.9672830725462305, |
| "grad_norm": 0.27406251430511475, |
| "learning_rate": 9.32861476238625e-06, |
| "loss": 0.5138, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.9695590327169274, |
| "grad_norm": 0.6074895262718201, |
| "learning_rate": 9.325769854132903e-06, |
| "loss": 0.5242, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.9718349928876244, |
| "grad_norm": 0.31788596510887146, |
| "learning_rate": 9.322914552466005e-06, |
| "loss": 0.5021, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.9741109530583215, |
| "grad_norm": 0.31486812233924866, |
| "learning_rate": 9.320048800325337e-06, |
| "loss": 0.5006, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.9763869132290185, |
| "grad_norm": 0.24196626245975494, |
| "learning_rate": 9.317172540232226e-06, |
| "loss": 0.5073, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.9786628733997155, |
| "grad_norm": 0.44112899899482727, |
| "learning_rate": 9.314285714285714e-06, |
| "loss": 0.5039, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9809388335704126, |
| "grad_norm": 0.4517861008644104, |
| "learning_rate": 9.31138826415866e-06, |
| "loss": 0.5181, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.9832147937411095, |
| "grad_norm": 0.5457024574279785, |
| "learning_rate": 9.308480131093815e-06, |
| "loss": 0.462, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.9854907539118065, |
| "grad_norm": 0.5349276661872864, |
| "learning_rate": 9.305561255899857e-06, |
| "loss": 0.498, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.9877667140825036, |
| "grad_norm": 0.5990840196609497, |
| "learning_rate": 9.30263157894737e-06, |
| "loss": 0.5181, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.9900426742532006, |
| "grad_norm": 0.8661618232727051, |
| "learning_rate": 9.29969104016478e-06, |
| "loss": 0.5044, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.9923186344238976, |
| "grad_norm": 0.28357264399528503, |
| "learning_rate": 9.296739579034256e-06, |
| "loss": 0.5291, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.9945945945945946, |
| "grad_norm": 0.2452535480260849, |
| "learning_rate": 9.293777134587556e-06, |
| "loss": 0.5142, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.9968705547652916, |
| "grad_norm": 0.2766931653022766, |
| "learning_rate": 9.290803645401825e-06, |
| "loss": 0.5433, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.9991465149359886, |
| "grad_norm": 0.44178536534309387, |
| "learning_rate": 9.287819049595353e-06, |
| "loss": 0.5104, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.002275960170697, |
| "grad_norm": 1.2671332359313965, |
| "learning_rate": 9.284823284823286e-06, |
| "loss": 0.9629, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.002275960170697, |
| "eval_loss": 0.4974663555622101, |
| "eval_runtime": 152.4851, |
| "eval_samples_per_second": 3.279, |
| "eval_steps_per_second": 0.82, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.004551920341394, |
| "grad_norm": 0.21263453364372253, |
| "learning_rate": 9.281816288273278e-06, |
| "loss": 0.478, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.006827880512091, |
| "grad_norm": 0.2866232991218567, |
| "learning_rate": 9.278797996661104e-06, |
| "loss": 0.4783, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.0091038406827881, |
| "grad_norm": 0.2101231962442398, |
| "learning_rate": 9.275768346226217e-06, |
| "loss": 0.4887, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.0113798008534851, |
| "grad_norm": 0.6429772973060608, |
| "learning_rate": 9.272727272727273e-06, |
| "loss": 0.4815, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.013655761024182, |
| "grad_norm": 0.9100087285041809, |
| "learning_rate": 9.269674711437566e-06, |
| "loss": 0.4734, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.015931721194879, |
| "grad_norm": 0.5525131821632385, |
| "learning_rate": 9.266610597140455e-06, |
| "loss": 0.4962, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.018207681365576, |
| "grad_norm": 1.0211899280548096, |
| "learning_rate": 9.263534864124712e-06, |
| "loss": 0.5055, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.020483641536273, |
| "grad_norm": 0.3108387589454651, |
| "learning_rate": 9.260447446179823e-06, |
| "loss": 0.4936, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.0227596017069702, |
| "grad_norm": 0.2863866090774536, |
| "learning_rate": 9.257348276591246e-06, |
| "loss": 0.4965, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.0250355618776672, |
| "grad_norm": 0.2815602421760559, |
| "learning_rate": 9.254237288135594e-06, |
| "loss": 0.4887, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.0273115220483642, |
| "grad_norm": 0.24380503594875336, |
| "learning_rate": 9.251114413075781e-06, |
| "loss": 0.4643, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.0295874822190612, |
| "grad_norm": 0.43735039234161377, |
| "learning_rate": 9.247979583156105e-06, |
| "loss": 0.4799, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.0318634423897581, |
| "grad_norm": 0.26354771852493286, |
| "learning_rate": 9.244832729597275e-06, |
| "loss": 0.4949, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.0341394025604551, |
| "grad_norm": 0.350656658411026, |
| "learning_rate": 9.241673783091376e-06, |
| "loss": 0.4852, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.0364153627311523, |
| "grad_norm": 0.30760401487350464, |
| "learning_rate": 9.238502673796792e-06, |
| "loss": 0.5053, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.0386913229018493, |
| "grad_norm": 0.3699236512184143, |
| "learning_rate": 9.235319331333048e-06, |
| "loss": 0.5149, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.0409672830725463, |
| "grad_norm": 0.6595727205276489, |
| "learning_rate": 9.232123684775606e-06, |
| "loss": 0.5321, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.0432432432432432, |
| "grad_norm": 0.6040295958518982, |
| "learning_rate": 9.228915662650604e-06, |
| "loss": 0.4971, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.0455192034139402, |
| "grad_norm": 0.5132520794868469, |
| "learning_rate": 9.225695192929511e-06, |
| "loss": 0.486, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.0477951635846372, |
| "grad_norm": 0.2748469114303589, |
| "learning_rate": 9.222462203023758e-06, |
| "loss": 0.4973, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.0500711237553342, |
| "grad_norm": 0.2111174762248993, |
| "learning_rate": 9.21921661977927e-06, |
| "loss": 0.4808, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.0523470839260314, |
| "grad_norm": 0.3842904567718506, |
| "learning_rate": 9.215958369470948e-06, |
| "loss": 0.5554, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.0546230440967284, |
| "grad_norm": 0.3397091329097748, |
| "learning_rate": 9.21268737779709e-06, |
| "loss": 0.5307, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.0568990042674253, |
| "grad_norm": 0.5594832301139832, |
| "learning_rate": 9.20940356987375e-06, |
| "loss": 0.5063, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.0591749644381223, |
| "grad_norm": 0.9360854625701904, |
| "learning_rate": 9.206106870229008e-06, |
| "loss": 0.4856, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.0614509246088193, |
| "grad_norm": 0.25913193821907043, |
| "learning_rate": 9.202797202797205e-06, |
| "loss": 0.5181, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.0637268847795163, |
| "grad_norm": 0.4928555190563202, |
| "learning_rate": 9.199474490913072e-06, |
| "loss": 0.5018, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.0660028449502135, |
| "grad_norm": 0.26356229186058044, |
| "learning_rate": 9.196138657305838e-06, |
| "loss": 0.5101, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.0682788051209104, |
| "grad_norm": 0.5407450795173645, |
| "learning_rate": 9.192789624093207e-06, |
| "loss": 0.4899, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.0705547652916074, |
| "grad_norm": 0.719228208065033, |
| "learning_rate": 9.189427312775331e-06, |
| "loss": 0.5008, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0728307254623044, |
| "grad_norm": 0.6197307705879211, |
| "learning_rate": 9.18605164422865e-06, |
| "loss": 0.5396, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.0751066856330014, |
| "grad_norm": 0.7797653079032898, |
| "learning_rate": 9.182662538699692e-06, |
| "loss": 0.4907, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.0773826458036984, |
| "grad_norm": 0.27928516268730164, |
| "learning_rate": 9.179259915798804e-06, |
| "loss": 0.4898, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.0796586059743953, |
| "grad_norm": 0.37681275606155396, |
| "learning_rate": 9.175843694493784e-06, |
| "loss": 0.4764, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.0819345661450925, |
| "grad_norm": 0.6600402593612671, |
| "learning_rate": 9.17241379310345e-06, |
| "loss": 0.5419, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.0842105263157895, |
| "grad_norm": 0.3575684428215027, |
| "learning_rate": 9.168970129291129e-06, |
| "loss": 0.48, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.0864864864864865, |
| "grad_norm": 0.4149022102355957, |
| "learning_rate": 9.165512620058075e-06, |
| "loss": 0.4847, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.0887624466571835, |
| "grad_norm": 1.0566219091415405, |
| "learning_rate": 9.162041181736796e-06, |
| "loss": 0.5392, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.0910384068278804, |
| "grad_norm": 0.5956203937530518, |
| "learning_rate": 9.158555729984303e-06, |
| "loss": 0.486, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.0933143669985774, |
| "grad_norm": 0.7834517359733582, |
| "learning_rate": 9.155056179775284e-06, |
| "loss": 0.5104, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0955903271692746, |
| "grad_norm": 0.5677323937416077, |
| "learning_rate": 9.151542445395182e-06, |
| "loss": 0.4806, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.0978662873399716, |
| "grad_norm": 0.4465844929218292, |
| "learning_rate": 9.148014440433215e-06, |
| "loss": 0.4894, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.1001422475106686, |
| "grad_norm": 0.38444948196411133, |
| "learning_rate": 9.144472077775265e-06, |
| "loss": 0.4818, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.1024182076813656, |
| "grad_norm": 0.22575457394123077, |
| "learning_rate": 9.14091526959674e-06, |
| "loss": 0.521, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.1046941678520625, |
| "grad_norm": 0.41524603962898254, |
| "learning_rate": 9.13734392735528e-06, |
| "loss": 0.529, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.1069701280227595, |
| "grad_norm": 0.33040255308151245, |
| "learning_rate": 9.133757961783441e-06, |
| "loss": 0.4803, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.1092460881934567, |
| "grad_norm": 0.229473277926445, |
| "learning_rate": 9.13015728288124e-06, |
| "loss": 0.5139, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.1115220483641537, |
| "grad_norm": 0.22900845110416412, |
| "learning_rate": 9.126541799908635e-06, |
| "loss": 0.4961, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.1137980085348507, |
| "grad_norm": 0.6831381320953369, |
| "learning_rate": 9.122911421377889e-06, |
| "loss": 0.5108, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.1160739687055476, |
| "grad_norm": 0.4539848864078522, |
| "learning_rate": 9.119266055045872e-06, |
| "loss": 0.4664, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.1183499288762446, |
| "grad_norm": 0.48363742232322693, |
| "learning_rate": 9.115605607906229e-06, |
| "loss": 0.513, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.1206258890469416, |
| "grad_norm": 0.2667783796787262, |
| "learning_rate": 9.111929986181483e-06, |
| "loss": 0.4876, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.1229018492176386, |
| "grad_norm": 0.3707852065563202, |
| "learning_rate": 9.108239095315025e-06, |
| "loss": 0.4735, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.1251778093883358, |
| "grad_norm": 0.3922533392906189, |
| "learning_rate": 9.104532839962999e-06, |
| "loss": 0.4893, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.1274537695590328, |
| "grad_norm": 3.7399353981018066, |
| "learning_rate": 9.100811123986097e-06, |
| "loss": 0.4944, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.1297297297297297, |
| "grad_norm": 0.5482460260391235, |
| "learning_rate": 9.097073850441244e-06, |
| "loss": 0.4691, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.1320056899004267, |
| "grad_norm": 0.579155683517456, |
| "learning_rate": 9.093320921573193e-06, |
| "loss": 0.4981, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.1342816500711237, |
| "grad_norm": 0.4125681221485138, |
| "learning_rate": 9.089552238805971e-06, |
| "loss": 0.5099, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.136557610241821, |
| "grad_norm": 0.8303210139274597, |
| "learning_rate": 9.085767702734285e-06, |
| "loss": 0.5116, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.1388335704125179, |
| "grad_norm": 0.6363510489463806, |
| "learning_rate": 9.081967213114755e-06, |
| "loss": 0.5321, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1411095305832148, |
| "grad_norm": 0.27322062849998474, |
| "learning_rate": 9.078150668857077e-06, |
| "loss": 0.5268, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.1433854907539118, |
| "grad_norm": 0.4494977593421936, |
| "learning_rate": 9.074317968015053e-06, |
| "loss": 0.496, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.1456614509246088, |
| "grad_norm": 0.29483264684677124, |
| "learning_rate": 9.070469007777517e-06, |
| "loss": 0.5173, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.1479374110953058, |
| "grad_norm": 0.21898174285888672, |
| "learning_rate": 9.066603684459141e-06, |
| "loss": 0.479, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.1502133712660028, |
| "grad_norm": 0.660429060459137, |
| "learning_rate": 9.062721893491125e-06, |
| "loss": 0.4868, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.1524893314367, |
| "grad_norm": 0.46798914670944214, |
| "learning_rate": 9.058823529411767e-06, |
| "loss": 0.484, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.154765291607397, |
| "grad_norm": 0.37514424324035645, |
| "learning_rate": 9.054908485856906e-06, |
| "loss": 0.464, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.157041251778094, |
| "grad_norm": 0.5945050120353699, |
| "learning_rate": 9.050976655550263e-06, |
| "loss": 0.506, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.159317211948791, |
| "grad_norm": 0.4302297830581665, |
| "learning_rate": 9.047027930293625e-06, |
| "loss": 0.5332, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.1615931721194879, |
| "grad_norm": 0.32911941409111023, |
| "learning_rate": 9.04306220095694e-06, |
| "loss": 0.4776, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.1638691322901848, |
| "grad_norm": 0.26953014731407166, |
| "learning_rate": 9.039079357468235e-06, |
| "loss": 0.4865, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.1661450924608818, |
| "grad_norm": 0.3142431378364563, |
| "learning_rate": 9.035079288803461e-06, |
| "loss": 0.502, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.168421052631579, |
| "grad_norm": 0.5341249108314514, |
| "learning_rate": 9.031061882976163e-06, |
| "loss": 0.5064, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.170697012802276, |
| "grad_norm": 0.2455718070268631, |
| "learning_rate": 9.027027027027028e-06, |
| "loss": 0.4986, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.172972972972973, |
| "grad_norm": 0.6356760859489441, |
| "learning_rate": 9.022974607013302e-06, |
| "loss": 0.5292, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.17524893314367, |
| "grad_norm": 0.43525630235671997, |
| "learning_rate": 9.018904507998063e-06, |
| "loss": 0.4973, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.177524893314367, |
| "grad_norm": 0.44182905554771423, |
| "learning_rate": 9.01481661403935e-06, |
| "loss": 0.4396, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.179800853485064, |
| "grad_norm": 0.5089689493179321, |
| "learning_rate": 9.010710808179164e-06, |
| "loss": 0.4984, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.1820768136557611, |
| "grad_norm": 0.37655216455459595, |
| "learning_rate": 9.0065869724323e-06, |
| "loss": 0.4907, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.184352773826458, |
| "grad_norm": 1.3340935707092285, |
| "learning_rate": 9.002444987775062e-06, |
| "loss": 0.5169, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.186628733997155, |
| "grad_norm": 1.6214795112609863, |
| "learning_rate": 8.99828473413379e-06, |
| "loss": 0.5068, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.188904694167852, |
| "grad_norm": 0.8963609933853149, |
| "learning_rate": 8.99410609037328e-06, |
| "loss": 0.5011, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.191180654338549, |
| "grad_norm": 0.5894596576690674, |
| "learning_rate": 8.989908934285012e-06, |
| "loss": 0.4703, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.193456614509246, |
| "grad_norm": 0.5181412696838379, |
| "learning_rate": 8.985693142575235e-06, |
| "loss": 0.4666, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.1957325746799432, |
| "grad_norm": 0.6675935983657837, |
| "learning_rate": 8.981458590852907e-06, |
| "loss": 0.4947, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.1980085348506402, |
| "grad_norm": 0.6920701861381531, |
| "learning_rate": 8.977205153617443e-06, |
| "loss": 0.4607, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.2002844950213372, |
| "grad_norm": 0.3088913559913635, |
| "learning_rate": 8.97293270424634e-06, |
| "loss": 0.5491, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.2025604551920341, |
| "grad_norm": 0.27273494005203247, |
| "learning_rate": 8.96864111498258e-06, |
| "loss": 0.4774, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.2025604551920341, |
| "eval_loss": 0.4968067407608032, |
| "eval_runtime": 152.6862, |
| "eval_samples_per_second": 3.275, |
| "eval_steps_per_second": 0.819, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.2048364153627311, |
| "grad_norm": 0.4168463349342346, |
| "learning_rate": 8.964330256921927e-06, |
| "loss": 0.4631, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.207112375533428, |
| "grad_norm": 0.8588109612464905, |
| "learning_rate": 8.960000000000002e-06, |
| "loss": 0.5207, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.209388335704125, |
| "grad_norm": 0.5915808081626892, |
| "learning_rate": 8.955650212979206e-06, |
| "loss": 0.5108, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.2116642958748223, |
| "grad_norm": 0.47360390424728394, |
| "learning_rate": 8.951280763435461e-06, |
| "loss": 0.5034, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.2139402560455193, |
| "grad_norm": 0.39106452465057373, |
| "learning_rate": 8.946891517744779e-06, |
| "loss": 0.5013, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.2162162162162162, |
| "grad_norm": 0.9861881136894226, |
| "learning_rate": 8.942482341069628e-06, |
| "loss": 0.4678, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.2184921763869132, |
| "grad_norm": 0.4086526036262512, |
| "learning_rate": 8.938053097345134e-06, |
| "loss": 0.4737, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.2207681365576102, |
| "grad_norm": 0.7135105729103088, |
| "learning_rate": 8.93360364926508e-06, |
| "loss": 0.4806, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.2230440967283072, |
| "grad_norm": 0.46862491965293884, |
| "learning_rate": 8.929133858267717e-06, |
| "loss": 0.5002, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.2253200568990044, |
| "grad_norm": 0.41207781434059143, |
| "learning_rate": 8.924643584521386e-06, |
| "loss": 0.4633, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.2275960170697013, |
| "grad_norm": 0.5379470586776733, |
| "learning_rate": 8.920132686909926e-06, |
| "loss": 0.5418, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.2298719772403983, |
| "grad_norm": 0.6593378782272339, |
| "learning_rate": 8.915601023017903e-06, |
| "loss": 0.5227, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.2321479374110953, |
| "grad_norm": 0.5798733234405518, |
| "learning_rate": 8.911048449115613e-06, |
| "loss": 0.4734, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.2344238975817923, |
| "grad_norm": 0.3299521803855896, |
| "learning_rate": 8.906474820143887e-06, |
| "loss": 0.4953, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.2366998577524893, |
| "grad_norm": 0.7939162850379944, |
| "learning_rate": 8.901879989698689e-06, |
| "loss": 0.5127, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.2389758179231865, |
| "grad_norm": 0.4086650311946869, |
| "learning_rate": 8.897263810015489e-06, |
| "loss": 0.5214, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.2412517780938834, |
| "grad_norm": 0.589647650718689, |
| "learning_rate": 8.892626131953428e-06, |
| "loss": 0.5008, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.2435277382645804, |
| "grad_norm": 0.3771437704563141, |
| "learning_rate": 8.887966804979256e-06, |
| "loss": 0.5281, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.2458036984352774, |
| "grad_norm": 1.033705472946167, |
| "learning_rate": 8.883285677151027e-06, |
| "loss": 0.5148, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.2480796586059744, |
| "grad_norm": 0.5085345506668091, |
| "learning_rate": 8.878582595101617e-06, |
| "loss": 0.5196, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.2503556187766713, |
| "grad_norm": 0.2777687609195709, |
| "learning_rate": 8.873857404021936e-06, |
| "loss": 0.509, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.2526315789473683, |
| "grad_norm": 0.5570908784866333, |
| "learning_rate": 8.869109947643979e-06, |
| "loss": 0.4756, |
| "step": 550 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 878, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 22, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.5003226796869026e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |