diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4957 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999288610656613, + "eval_steps": 500, + "global_step": 7028, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001422778686775272, + "grad_norm": 227.20426891789228, + "learning_rate": 2.132701421800948e-07, + "loss": 14.582, + "step": 10 + }, + { + "epoch": 0.002845557373550544, + "grad_norm": 288.4752938926377, + "learning_rate": 4.502369668246446e-07, + "loss": 14.1726, + "step": 20 + }, + { + "epoch": 0.004268336060325817, + "grad_norm": 182.76059043427875, + "learning_rate": 6.872037914691944e-07, + "loss": 13.4078, + "step": 30 + }, + { + "epoch": 0.005691114747101088, + "grad_norm": 127.33519082588683, + "learning_rate": 9.241706161137441e-07, + "loss": 11.4628, + "step": 40 + }, + { + "epoch": 0.007113893433876361, + "grad_norm": 155.10626329925768, + "learning_rate": 1.161137440758294e-06, + "loss": 9.1475, + "step": 50 + }, + { + "epoch": 0.008536672120651633, + "grad_norm": 138.90229390796665, + "learning_rate": 1.3981042654028437e-06, + "loss": 7.7319, + "step": 60 + }, + { + "epoch": 0.009959450807426905, + "grad_norm": 109.7463772232744, + "learning_rate": 1.6350710900473934e-06, + "loss": 6.4941, + "step": 70 + }, + { + "epoch": 0.011382229494202176, + "grad_norm": 83.21567835995891, + "learning_rate": 1.8720379146919433e-06, + "loss": 6.1299, + "step": 80 + }, + { + "epoch": 0.01280500818097745, + "grad_norm": 276.7319598681411, + "learning_rate": 2.109004739336493e-06, + "loss": 5.8824, + "step": 90 + }, + { + "epoch": 0.014227786867752721, + "grad_norm": 150.10206828081996, + "learning_rate": 2.345971563981043e-06, + "loss": 5.4126, + "step": 100 + }, + { + "epoch": 0.015650565554527995, + "grad_norm": 85.51637256924744, + "learning_rate": 2.5829383886255925e-06, + "loss": 5.4679, + "step": 110 + }, + { + "epoch": 0.017073344241303266, + "grad_norm": 54.01631684738819, + "learning_rate": 2.8199052132701426e-06, + "loss": 5.2629, + "step": 120 + }, + { + "epoch": 0.018496122928078538, + "grad_norm": 48.046263311608506, + "learning_rate": 3.0568720379146923e-06, + "loss": 5.1994, + "step": 130 + }, + { + "epoch": 0.01991890161485381, + "grad_norm": 48.29009312747582, + "learning_rate": 3.293838862559242e-06, + "loss": 4.9431, + "step": 140 + }, + { + "epoch": 0.02134168030162908, + "grad_norm": 68.96565083843548, + "learning_rate": 3.5308056872037916e-06, + "loss": 4.9651, + "step": 150 + }, + { + "epoch": 0.022764458988404353, + "grad_norm": 57.886348280789576, + "learning_rate": 3.7677725118483417e-06, + "loss": 4.8191, + "step": 160 + }, + { + "epoch": 0.024187237675179624, + "grad_norm": 33.13905030540343, + "learning_rate": 4.004739336492891e-06, + "loss": 4.601, + "step": 170 + }, + { + "epoch": 0.0256100163619549, + "grad_norm": 36.383587300681455, + "learning_rate": 4.2417061611374415e-06, + "loss": 4.6011, + "step": 180 + }, + { + "epoch": 0.02703279504873017, + "grad_norm": 93.93751717462193, + "learning_rate": 4.478672985781991e-06, + "loss": 4.5838, + "step": 190 + }, + { + "epoch": 0.028455573735505443, + "grad_norm": 161.18260012854552, + "learning_rate": 4.715639810426541e-06, + "loss": 4.473, + "step": 200 + }, + { + "epoch": 0.029878352422280714, + "grad_norm": 48.03951820879001, + "learning_rate": 4.952606635071091e-06, + "loss": 4.328, + "step": 210 + }, + { + "epoch": 0.03130113110905599, + "grad_norm": 48.351614233644206, + "learning_rate": 4.999983009637205e-06, + "loss": 3.8117, + "step": 220 + }, + { + "epoch": 0.03272390979583126, + "grad_norm": 40.879446824882315, + "learning_rate": 4.999913986684146e-06, + "loss": 3.666, + "step": 230 + }, + { + "epoch": 0.03414668848260653, + "grad_norm": 71.68709145083035, + "learning_rate": 4.999791870707925e-06, + "loss": 3.3076, + "step": 240 + }, + { + "epoch": 0.0355694671693818, + "grad_norm": 41.1760727865846, + "learning_rate": 4.999616664302036e-06, + "loss": 3.0586, + "step": 250 + }, + { + "epoch": 0.036992245856157076, + "grad_norm": 35.99541853661167, + "learning_rate": 4.9993883711875e-06, + "loss": 2.6416, + "step": 260 + }, + { + "epoch": 0.038415024542932344, + "grad_norm": 30.668465660210362, + "learning_rate": 4.999106996212795e-06, + "loss": 2.4984, + "step": 270 + }, + { + "epoch": 0.03983780322970762, + "grad_norm": 36.932680181660984, + "learning_rate": 4.998772545353743e-06, + "loss": 2.3382, + "step": 280 + }, + { + "epoch": 0.041260581916482894, + "grad_norm": 61.90188051017363, + "learning_rate": 4.998385025713392e-06, + "loss": 2.0779, + "step": 290 + }, + { + "epoch": 0.04268336060325816, + "grad_norm": 69.48759279430097, + "learning_rate": 4.9979444455218615e-06, + "loss": 2.028, + "step": 300 + }, + { + "epoch": 0.04410613929003344, + "grad_norm": 24.129222083009303, + "learning_rate": 4.997450814136168e-06, + "loss": 2.0022, + "step": 310 + }, + { + "epoch": 0.045528917976808705, + "grad_norm": 57.107184624594545, + "learning_rate": 4.9969041420400235e-06, + "loss": 1.7308, + "step": 320 + }, + { + "epoch": 0.04695169666358398, + "grad_norm": 25.286429840125543, + "learning_rate": 4.996304440843618e-06, + "loss": 1.8255, + "step": 330 + }, + { + "epoch": 0.04837447535035925, + "grad_norm": 41.33604492294289, + "learning_rate": 4.9956517232833696e-06, + "loss": 1.6635, + "step": 340 + }, + { + "epoch": 0.049797254037134524, + "grad_norm": 25.057988737297727, + "learning_rate": 4.994946003221654e-06, + "loss": 1.6383, + "step": 350 + }, + { + "epoch": 0.0512200327239098, + "grad_norm": 29.141194168593408, + "learning_rate": 4.994187295646512e-06, + "loss": 1.6471, + "step": 360 + }, + { + "epoch": 0.05264281141068507, + "grad_norm": 32.28184288482766, + "learning_rate": 4.9933756166713295e-06, + "loss": 1.5427, + "step": 370 + }, + { + "epoch": 0.05406559009746034, + "grad_norm": 21.403858874487227, + "learning_rate": 4.992510983534496e-06, + "loss": 1.4988, + "step": 380 + }, + { + "epoch": 0.05548836878423561, + "grad_norm": 33.94744437090256, + "learning_rate": 4.9915934145990385e-06, + "loss": 1.4508, + "step": 390 + }, + { + "epoch": 0.056911147471010885, + "grad_norm": 19.822673933241806, + "learning_rate": 4.99062292935223e-06, + "loss": 1.4209, + "step": 400 + }, + { + "epoch": 0.05833392615778615, + "grad_norm": 41.542065289192145, + "learning_rate": 4.989599548405178e-06, + "loss": 1.5373, + "step": 410 + }, + { + "epoch": 0.05975670484456143, + "grad_norm": 46.3095442295298, + "learning_rate": 4.9885232934923865e-06, + "loss": 1.3934, + "step": 420 + }, + { + "epoch": 0.061179483531336704, + "grad_norm": 25.997158063369152, + "learning_rate": 4.987394187471292e-06, + "loss": 1.4154, + "step": 430 + }, + { + "epoch": 0.06260226221811198, + "grad_norm": 46.76059720724752, + "learning_rate": 4.98621225432178e-06, + "loss": 1.2577, + "step": 440 + }, + { + "epoch": 0.06402504090488724, + "grad_norm": 39.72838305337924, + "learning_rate": 4.984977519145677e-06, + "loss": 1.3735, + "step": 450 + }, + { + "epoch": 0.06544781959166251, + "grad_norm": 31.8000552139225, + "learning_rate": 4.983690008166213e-06, + "loss": 1.378, + "step": 460 + }, + { + "epoch": 0.06687059827843779, + "grad_norm": 54.99100856311778, + "learning_rate": 4.982349748727471e-06, + "loss": 1.2952, + "step": 470 + }, + { + "epoch": 0.06829337696521307, + "grad_norm": 34.7813184333919, + "learning_rate": 4.980956769293799e-06, + "loss": 1.3304, + "step": 480 + }, + { + "epoch": 0.06971615565198833, + "grad_norm": 56.07368019486356, + "learning_rate": 4.9795110994492104e-06, + "loss": 1.2965, + "step": 490 + }, + { + "epoch": 0.0711389343387636, + "grad_norm": 24.812325854128844, + "learning_rate": 4.978012769896754e-06, + "loss": 1.2318, + "step": 500 + }, + { + "epoch": 0.07256171302553888, + "grad_norm": 16.4693703560664, + "learning_rate": 4.976461812457866e-06, + "loss": 1.3849, + "step": 510 + }, + { + "epoch": 0.07398449171231415, + "grad_norm": 17.511580201601642, + "learning_rate": 4.974858260071684e-06, + "loss": 1.2936, + "step": 520 + }, + { + "epoch": 0.07540727039908943, + "grad_norm": 18.829140041990236, + "learning_rate": 4.973202146794358e-06, + "loss": 1.2266, + "step": 530 + }, + { + "epoch": 0.07683004908586469, + "grad_norm": 30.404720510571355, + "learning_rate": 4.971493507798324e-06, + "loss": 1.2773, + "step": 540 + }, + { + "epoch": 0.07825282777263996, + "grad_norm": 14.60687195849512, + "learning_rate": 4.9697323793715526e-06, + "loss": 1.25, + "step": 550 + }, + { + "epoch": 0.07967560645941524, + "grad_norm": 71.6497384118756, + "learning_rate": 4.967918798916785e-06, + "loss": 1.3495, + "step": 560 + }, + { + "epoch": 0.08109838514619051, + "grad_norm": 16.64365379567024, + "learning_rate": 4.966052804950733e-06, + "loss": 1.1047, + "step": 570 + }, + { + "epoch": 0.08252116383296579, + "grad_norm": 33.87717811951715, + "learning_rate": 4.964134437103265e-06, + "loss": 1.2592, + "step": 580 + }, + { + "epoch": 0.08394394251974105, + "grad_norm": 70.63821742601606, + "learning_rate": 4.962163736116562e-06, + "loss": 1.1634, + "step": 590 + }, + { + "epoch": 0.08536672120651632, + "grad_norm": 32.75072239557704, + "learning_rate": 4.960140743844252e-06, + "loss": 1.2711, + "step": 600 + }, + { + "epoch": 0.0867894998932916, + "grad_norm": 35.5844324891052, + "learning_rate": 4.958065503250526e-06, + "loss": 1.2501, + "step": 610 + }, + { + "epoch": 0.08821227858006687, + "grad_norm": 14.266674138271055, + "learning_rate": 4.955938058409217e-06, + "loss": 1.1745, + "step": 620 + }, + { + "epoch": 0.08963505726684214, + "grad_norm": 18.873108069596782, + "learning_rate": 4.953758454502872e-06, + "loss": 1.1528, + "step": 630 + }, + { + "epoch": 0.09105783595361741, + "grad_norm": 31.56236779309872, + "learning_rate": 4.951526737821787e-06, + "loss": 1.2088, + "step": 640 + }, + { + "epoch": 0.09248061464039269, + "grad_norm": 18.70989762385317, + "learning_rate": 4.949242955763028e-06, + "loss": 1.211, + "step": 650 + }, + { + "epoch": 0.09390339332716796, + "grad_norm": 39.25798153988374, + "learning_rate": 4.94690715682942e-06, + "loss": 1.1713, + "step": 660 + }, + { + "epoch": 0.09532617201394324, + "grad_norm": 22.778346344296285, + "learning_rate": 4.9445193906285206e-06, + "loss": 1.2458, + "step": 670 + }, + { + "epoch": 0.0967489507007185, + "grad_norm": 23.69275420448688, + "learning_rate": 4.942079707871566e-06, + "loss": 1.1972, + "step": 680 + }, + { + "epoch": 0.09817172938749377, + "grad_norm": 39.306258513979415, + "learning_rate": 4.939588160372392e-06, + "loss": 1.1222, + "step": 690 + }, + { + "epoch": 0.09959450807426905, + "grad_norm": 22.896068053456, + "learning_rate": 4.937044801046334e-06, + "loss": 1.1845, + "step": 700 + }, + { + "epoch": 0.10101728676104432, + "grad_norm": 60.82968098497559, + "learning_rate": 4.934449683909106e-06, + "loss": 1.2123, + "step": 710 + }, + { + "epoch": 0.1024400654478196, + "grad_norm": 56.56768516919535, + "learning_rate": 4.931802864075648e-06, + "loss": 0.9821, + "step": 720 + }, + { + "epoch": 0.10386284413459486, + "grad_norm": 24.624330732318725, + "learning_rate": 4.929104397758962e-06, + "loss": 1.2275, + "step": 730 + }, + { + "epoch": 0.10528562282137013, + "grad_norm": 38.57583464398156, + "learning_rate": 4.926354342268913e-06, + "loss": 1.0655, + "step": 740 + }, + { + "epoch": 0.10670840150814541, + "grad_norm": 22.07451047416335, + "learning_rate": 4.9235527560110156e-06, + "loss": 1.0444, + "step": 750 + }, + { + "epoch": 0.10813118019492068, + "grad_norm": 60.36637431473924, + "learning_rate": 4.9206996984851895e-06, + "loss": 1.1408, + "step": 760 + }, + { + "epoch": 0.10955395888169595, + "grad_norm": 20.94795499081589, + "learning_rate": 4.917795230284499e-06, + "loss": 1.0697, + "step": 770 + }, + { + "epoch": 0.11097673756847122, + "grad_norm": 15.489949414714857, + "learning_rate": 4.914839413093867e-06, + "loss": 1.2892, + "step": 780 + }, + { + "epoch": 0.1123995162552465, + "grad_norm": 40.226087592606554, + "learning_rate": 4.911832309688758e-06, + "loss": 0.9953, + "step": 790 + }, + { + "epoch": 0.11382229494202177, + "grad_norm": 14.411243294953653, + "learning_rate": 4.9087739839338586e-06, + "loss": 1.0995, + "step": 800 + }, + { + "epoch": 0.11524507362879705, + "grad_norm": 25.746283972263587, + "learning_rate": 4.905664500781704e-06, + "loss": 1.1964, + "step": 810 + }, + { + "epoch": 0.1166678523155723, + "grad_norm": 24.066909183738016, + "learning_rate": 4.902503926271311e-06, + "loss": 1.0434, + "step": 820 + }, + { + "epoch": 0.11809063100234758, + "grad_norm": 10.58109917774702, + "learning_rate": 4.899292327526771e-06, + "loss": 1.1068, + "step": 830 + }, + { + "epoch": 0.11951340968912286, + "grad_norm": 31.468690458560978, + "learning_rate": 4.896029772755827e-06, + "loss": 1.1184, + "step": 840 + }, + { + "epoch": 0.12093618837589813, + "grad_norm": 34.07373775220134, + "learning_rate": 4.892716331248421e-06, + "loss": 1.131, + "step": 850 + }, + { + "epoch": 0.12235896706267341, + "grad_norm": 32.13791878005405, + "learning_rate": 4.889352073375224e-06, + "loss": 1.0096, + "step": 860 + }, + { + "epoch": 0.12378174574944867, + "grad_norm": 25.032238348130715, + "learning_rate": 4.885937070586143e-06, + "loss": 1.0091, + "step": 870 + }, + { + "epoch": 0.12520452443622396, + "grad_norm": 43.796780744116205, + "learning_rate": 4.882471395408802e-06, + "loss": 1.1524, + "step": 880 + }, + { + "epoch": 0.12662730312299922, + "grad_norm": 39.73622388351608, + "learning_rate": 4.878955121447003e-06, + "loss": 1.0613, + "step": 890 + }, + { + "epoch": 0.12805008180977448, + "grad_norm": 23.415663074214173, + "learning_rate": 4.875388323379161e-06, + "loss": 1.0664, + "step": 900 + }, + { + "epoch": 0.12947286049654977, + "grad_norm": 26.895406711294633, + "learning_rate": 4.871771076956718e-06, + "loss": 1.1549, + "step": 910 + }, + { + "epoch": 0.13089563918332503, + "grad_norm": 21.255238865940857, + "learning_rate": 4.8681034590025385e-06, + "loss": 0.979, + "step": 920 + }, + { + "epoch": 0.13231841787010032, + "grad_norm": 18.490585034014213, + "learning_rate": 4.864385547409272e-06, + "loss": 1.2308, + "step": 930 + }, + { + "epoch": 0.13374119655687558, + "grad_norm": 31.35551833525057, + "learning_rate": 4.8606174211377e-06, + "loss": 1.125, + "step": 940 + }, + { + "epoch": 0.13516397524365084, + "grad_norm": 18.48243362822039, + "learning_rate": 4.8567991602150635e-06, + "loss": 1.0853, + "step": 950 + }, + { + "epoch": 0.13658675393042613, + "grad_norm": 20.45592551281928, + "learning_rate": 4.852930845733358e-06, + "loss": 1.1877, + "step": 960 + }, + { + "epoch": 0.1380095326172014, + "grad_norm": 13.073784179027522, + "learning_rate": 4.849012559847615e-06, + "loss": 1.0812, + "step": 970 + }, + { + "epoch": 0.13943231130397665, + "grad_norm": 58.464808300471525, + "learning_rate": 4.845044385774154e-06, + "loss": 1.0702, + "step": 980 + }, + { + "epoch": 0.14085508999075194, + "grad_norm": 32.284426339203456, + "learning_rate": 4.841026407788818e-06, + "loss": 1.0646, + "step": 990 + }, + { + "epoch": 0.1422778686775272, + "grad_norm": 23.861973812252366, + "learning_rate": 4.836958711225183e-06, + "loss": 1.1582, + "step": 1000 + }, + { + "epoch": 0.1437006473643025, + "grad_norm": 35.10042096090748, + "learning_rate": 4.832841382472744e-06, + "loss": 1.0257, + "step": 1010 + }, + { + "epoch": 0.14512342605107775, + "grad_norm": 14.61198523399814, + "learning_rate": 4.828674508975081e-06, + "loss": 0.9973, + "step": 1020 + }, + { + "epoch": 0.14654620473785301, + "grad_norm": 25.535678355280265, + "learning_rate": 4.8244581792280045e-06, + "loss": 1.1356, + "step": 1030 + }, + { + "epoch": 0.1479689834246283, + "grad_norm": 47.776324802406265, + "learning_rate": 4.820192482777672e-06, + "loss": 0.9028, + "step": 1040 + }, + { + "epoch": 0.14939176211140356, + "grad_norm": 14.799263331252178, + "learning_rate": 4.815877510218687e-06, + "loss": 1.1014, + "step": 1050 + }, + { + "epoch": 0.15081454079817885, + "grad_norm": 24.813065365774627, + "learning_rate": 4.811513353192181e-06, + "loss": 1.1839, + "step": 1060 + }, + { + "epoch": 0.15223731948495411, + "grad_norm": 25.60795480877662, + "learning_rate": 4.807100104383856e-06, + "loss": 0.9862, + "step": 1070 + }, + { + "epoch": 0.15366009817172938, + "grad_norm": 81.74250558980262, + "learning_rate": 4.802637857522026e-06, + "loss": 1.0582, + "step": 1080 + }, + { + "epoch": 0.15508287685850466, + "grad_norm": 32.87867343488564, + "learning_rate": 4.798126707375623e-06, + "loss": 1.1183, + "step": 1090 + }, + { + "epoch": 0.15650565554527993, + "grad_norm": 11.38171030977416, + "learning_rate": 4.793566749752183e-06, + "loss": 1.1165, + "step": 1100 + }, + { + "epoch": 0.15792843423205521, + "grad_norm": 21.175570579434233, + "learning_rate": 4.78895808149581e-06, + "loss": 1.1336, + "step": 1110 + }, + { + "epoch": 0.15935121291883048, + "grad_norm": 17.668418598817492, + "learning_rate": 4.784300800485126e-06, + "loss": 1.0142, + "step": 1120 + }, + { + "epoch": 0.16077399160560574, + "grad_norm": 29.88399695204879, + "learning_rate": 4.779595005631185e-06, + "loss": 1.1087, + "step": 1130 + }, + { + "epoch": 0.16219677029238103, + "grad_norm": 28.1710561261029, + "learning_rate": 4.7748407968753744e-06, + "loss": 1.0295, + "step": 1140 + }, + { + "epoch": 0.1636195489791563, + "grad_norm": 25.185712797257235, + "learning_rate": 4.770038275187295e-06, + "loss": 1.0201, + "step": 1150 + }, + { + "epoch": 0.16504232766593158, + "grad_norm": 59.25807393338673, + "learning_rate": 4.765187542562615e-06, + "loss": 1.0377, + "step": 1160 + }, + { + "epoch": 0.16646510635270684, + "grad_norm": 29.97235785066762, + "learning_rate": 4.760288702020901e-06, + "loss": 1.0311, + "step": 1170 + }, + { + "epoch": 0.1678878850394821, + "grad_norm": 37.35879781998382, + "learning_rate": 4.755341857603436e-06, + "loss": 1.0862, + "step": 1180 + }, + { + "epoch": 0.1693106637262574, + "grad_norm": 23.672541588017463, + "learning_rate": 4.750347114371004e-06, + "loss": 0.9988, + "step": 1190 + }, + { + "epoch": 0.17073344241303265, + "grad_norm": 19.550633147562124, + "learning_rate": 4.745304578401662e-06, + "loss": 0.9844, + "step": 1200 + }, + { + "epoch": 0.17215622109980794, + "grad_norm": 32.695401824329494, + "learning_rate": 4.740214356788486e-06, + "loss": 1.0509, + "step": 1210 + }, + { + "epoch": 0.1735789997865832, + "grad_norm": 9.646363052517763, + "learning_rate": 4.735076557637297e-06, + "loss": 1.0341, + "step": 1220 + }, + { + "epoch": 0.17500177847335846, + "grad_norm": 24.741345371927583, + "learning_rate": 4.729891290064365e-06, + "loss": 1.0208, + "step": 1230 + }, + { + "epoch": 0.17642455716013375, + "grad_norm": 45.651678686892964, + "learning_rate": 4.724658664194091e-06, + "loss": 1.0502, + "step": 1240 + }, + { + "epoch": 0.177847335846909, + "grad_norm": 19.24141408740206, + "learning_rate": 4.71937879115667e-06, + "loss": 0.9975, + "step": 1250 + }, + { + "epoch": 0.17927011453368427, + "grad_norm": 31.556248725504627, + "learning_rate": 4.714051783085728e-06, + "loss": 1.1013, + "step": 1260 + }, + { + "epoch": 0.18069289322045956, + "grad_norm": 20.17633982870767, + "learning_rate": 4.708677753115942e-06, + "loss": 0.9531, + "step": 1270 + }, + { + "epoch": 0.18211567190723482, + "grad_norm": 19.4051188478354, + "learning_rate": 4.703256815380639e-06, + "loss": 1.0627, + "step": 1280 + }, + { + "epoch": 0.1835384505940101, + "grad_norm": 31.784954367517898, + "learning_rate": 4.6977890850093684e-06, + "loss": 0.9364, + "step": 1290 + }, + { + "epoch": 0.18496122928078537, + "grad_norm": 59.59839123261177, + "learning_rate": 4.692274678125461e-06, + "loss": 0.9959, + "step": 1300 + }, + { + "epoch": 0.18638400796756063, + "grad_norm": 86.09375297334182, + "learning_rate": 4.6867137118435565e-06, + "loss": 1.0006, + "step": 1310 + }, + { + "epoch": 0.18780678665433592, + "grad_norm": 22.56759533641289, + "learning_rate": 4.6811063042671245e-06, + "loss": 0.917, + "step": 1320 + }, + { + "epoch": 0.18922956534111118, + "grad_norm": 42.93003422068975, + "learning_rate": 4.675452574485952e-06, + "loss": 0.9586, + "step": 1330 + }, + { + "epoch": 0.19065234402788647, + "grad_norm": 25.098646626316107, + "learning_rate": 4.669752642573609e-06, + "loss": 0.958, + "step": 1340 + }, + { + "epoch": 0.19207512271466173, + "grad_norm": 26.200356526413692, + "learning_rate": 4.664006629584909e-06, + "loss": 0.9578, + "step": 1350 + }, + { + "epoch": 0.193497901401437, + "grad_norm": 40.45559950946015, + "learning_rate": 4.658214657553333e-06, + "loss": 1.1494, + "step": 1360 + }, + { + "epoch": 0.19492068008821228, + "grad_norm": 11.30097276948413, + "learning_rate": 4.652376849488432e-06, + "loss": 1.0556, + "step": 1370 + }, + { + "epoch": 0.19634345877498754, + "grad_norm": 20.822794274521176, + "learning_rate": 4.646493329373226e-06, + "loss": 1.0884, + "step": 1380 + }, + { + "epoch": 0.19776623746176283, + "grad_norm": 29.222350209439792, + "learning_rate": 4.640564222161559e-06, + "loss": 0.9724, + "step": 1390 + }, + { + "epoch": 0.1991890161485381, + "grad_norm": 17.347939039481435, + "learning_rate": 4.634589653775456e-06, + "loss": 0.9809, + "step": 1400 + }, + { + "epoch": 0.20061179483531336, + "grad_norm": 35.203954530331465, + "learning_rate": 4.628569751102442e-06, + "loss": 0.9926, + "step": 1410 + }, + { + "epoch": 0.20203457352208865, + "grad_norm": 21.145870608823014, + "learning_rate": 4.622504641992847e-06, + "loss": 0.9443, + "step": 1420 + }, + { + "epoch": 0.2034573522088639, + "grad_norm": 109.94274559504879, + "learning_rate": 4.616394455257095e-06, + "loss": 1.0751, + "step": 1430 + }, + { + "epoch": 0.2048801308956392, + "grad_norm": 72.18769613220586, + "learning_rate": 4.610239320662965e-06, + "loss": 0.9313, + "step": 1440 + }, + { + "epoch": 0.20630290958241446, + "grad_norm": 13.0262713604652, + "learning_rate": 4.6040393689328365e-06, + "loss": 1.0373, + "step": 1450 + }, + { + "epoch": 0.20772568826918972, + "grad_norm": 24.568104013079804, + "learning_rate": 4.5977947317409115e-06, + "loss": 0.9896, + "step": 1460 + }, + { + "epoch": 0.209148466955965, + "grad_norm": 19.83377026210839, + "learning_rate": 4.591505541710419e-06, + "loss": 0.9114, + "step": 1470 + }, + { + "epoch": 0.21057124564274027, + "grad_norm": 15.803261030668528, + "learning_rate": 4.585171932410802e-06, + "loss": 1.0671, + "step": 1480 + }, + { + "epoch": 0.21199402432951556, + "grad_norm": 73.8034968144292, + "learning_rate": 4.57879403835487e-06, + "loss": 0.9116, + "step": 1490 + }, + { + "epoch": 0.21341680301629082, + "grad_norm": 50.65746396577896, + "learning_rate": 4.572371994995959e-06, + "loss": 0.9233, + "step": 1500 + }, + { + "epoch": 0.21483958170306608, + "grad_norm": 19.756346599520104, + "learning_rate": 4.565905938725036e-06, + "loss": 0.9569, + "step": 1510 + }, + { + "epoch": 0.21626236038984137, + "grad_norm": 9.29993489756742, + "learning_rate": 4.559396006867819e-06, + "loss": 0.8349, + "step": 1520 + }, + { + "epoch": 0.21768513907661663, + "grad_norm": 25.972739952389897, + "learning_rate": 4.552842337681848e-06, + "loss": 1.004, + "step": 1530 + }, + { + "epoch": 0.2191079177633919, + "grad_norm": 88.34667254185341, + "learning_rate": 4.546245070353559e-06, + "loss": 0.9903, + "step": 1540 + }, + { + "epoch": 0.22053069645016718, + "grad_norm": 31.07756438176318, + "learning_rate": 4.539604344995315e-06, + "loss": 1.0547, + "step": 1550 + }, + { + "epoch": 0.22195347513694244, + "grad_norm": 37.65219781115293, + "learning_rate": 4.532920302642444e-06, + "loss": 0.9818, + "step": 1560 + }, + { + "epoch": 0.22337625382371773, + "grad_norm": 11.736978172112924, + "learning_rate": 4.5261930852502374e-06, + "loss": 0.8711, + "step": 1570 + }, + { + "epoch": 0.224799032510493, + "grad_norm": 53.43167755603555, + "learning_rate": 4.519422835690934e-06, + "loss": 1.0607, + "step": 1580 + }, + { + "epoch": 0.22622181119726825, + "grad_norm": 33.172428433273375, + "learning_rate": 4.512609697750687e-06, + "loss": 0.8635, + "step": 1590 + }, + { + "epoch": 0.22764458988404354, + "grad_norm": 30.294310375393177, + "learning_rate": 4.505753816126511e-06, + "loss": 1.0003, + "step": 1600 + }, + { + "epoch": 0.2290673685708188, + "grad_norm": 16.381735397778595, + "learning_rate": 4.4988553364232065e-06, + "loss": 0.9944, + "step": 1610 + }, + { + "epoch": 0.2304901472575941, + "grad_norm": 25.441517619108396, + "learning_rate": 4.491914405150275e-06, + "loss": 0.8836, + "step": 1620 + }, + { + "epoch": 0.23191292594436935, + "grad_norm": 22.84586846575256, + "learning_rate": 4.4849311697187955e-06, + "loss": 0.9763, + "step": 1630 + }, + { + "epoch": 0.2333357046311446, + "grad_norm": 22.741071369838796, + "learning_rate": 4.477905778438303e-06, + "loss": 0.9, + "step": 1640 + }, + { + "epoch": 0.2347584833179199, + "grad_norm": 45.274883279817054, + "learning_rate": 4.470838380513637e-06, + "loss": 0.9619, + "step": 1650 + }, + { + "epoch": 0.23618126200469516, + "grad_norm": 28.411705472066803, + "learning_rate": 4.463729126041769e-06, + "loss": 0.9951, + "step": 1660 + }, + { + "epoch": 0.23760404069147045, + "grad_norm": 23.03515293998754, + "learning_rate": 4.45657816600862e-06, + "loss": 0.9833, + "step": 1670 + }, + { + "epoch": 0.23902681937824571, + "grad_norm": 36.26988848442576, + "learning_rate": 4.449385652285849e-06, + "loss": 0.9385, + "step": 1680 + }, + { + "epoch": 0.24044959806502098, + "grad_norm": 29.98622414763686, + "learning_rate": 4.44215173762763e-06, + "loss": 1.0314, + "step": 1690 + }, + { + "epoch": 0.24187237675179626, + "grad_norm": 25.99130726723674, + "learning_rate": 4.434876575667409e-06, + "loss": 1.0867, + "step": 1700 + }, + { + "epoch": 0.24329515543857153, + "grad_norm": 19.5639741652505, + "learning_rate": 4.42756032091464e-06, + "loss": 1.0046, + "step": 1710 + }, + { + "epoch": 0.24471793412534681, + "grad_norm": 34.129588654435274, + "learning_rate": 4.4202031287515e-06, + "loss": 0.9393, + "step": 1720 + }, + { + "epoch": 0.24614071281212208, + "grad_norm": 18.561263502065916, + "learning_rate": 4.412805155429596e-06, + "loss": 0.9873, + "step": 1730 + }, + { + "epoch": 0.24756349149889734, + "grad_norm": 37.94544849948392, + "learning_rate": 4.4053665580666406e-06, + "loss": 0.9624, + "step": 1740 + }, + { + "epoch": 0.24898627018567263, + "grad_norm": 84.98992365379094, + "learning_rate": 4.397887494643119e-06, + "loss": 0.9292, + "step": 1750 + }, + { + "epoch": 0.2504090488724479, + "grad_norm": 41.9440027405956, + "learning_rate": 4.390368123998928e-06, + "loss": 1.1042, + "step": 1760 + }, + { + "epoch": 0.2518318275592232, + "grad_norm": 18.271645141393737, + "learning_rate": 4.382808605830014e-06, + "loss": 0.8905, + "step": 1770 + }, + { + "epoch": 0.25325460624599844, + "grad_norm": 25.615500179028004, + "learning_rate": 4.375209100684968e-06, + "loss": 0.9344, + "step": 1780 + }, + { + "epoch": 0.2546773849327737, + "grad_norm": 34.000225650472906, + "learning_rate": 4.367569769961624e-06, + "loss": 1.0618, + "step": 1790 + }, + { + "epoch": 0.25610016361954896, + "grad_norm": 17.946869566933312, + "learning_rate": 4.35989077590363e-06, + "loss": 0.9339, + "step": 1800 + }, + { + "epoch": 0.2575229423063243, + "grad_norm": 39.112615826338605, + "learning_rate": 4.352172281596999e-06, + "loss": 0.887, + "step": 1810 + }, + { + "epoch": 0.25894572099309954, + "grad_norm": 10.173767469816399, + "learning_rate": 4.344414450966652e-06, + "loss": 0.9607, + "step": 1820 + }, + { + "epoch": 0.2603684996798748, + "grad_norm": 20.08966991211306, + "learning_rate": 4.336617448772929e-06, + "loss": 1.0094, + "step": 1830 + }, + { + "epoch": 0.26179127836665006, + "grad_norm": 23.382294735645083, + "learning_rate": 4.3287814406080955e-06, + "loss": 0.9473, + "step": 1840 + }, + { + "epoch": 0.2632140570534253, + "grad_norm": 13.416514028844402, + "learning_rate": 4.320906592892821e-06, + "loss": 0.8759, + "step": 1850 + }, + { + "epoch": 0.26463683574020064, + "grad_norm": 18.21997944762085, + "learning_rate": 4.3129930728726475e-06, + "loss": 0.9567, + "step": 1860 + }, + { + "epoch": 0.2660596144269759, + "grad_norm": 26.833449827216317, + "learning_rate": 4.305041048614441e-06, + "loss": 0.8975, + "step": 1870 + }, + { + "epoch": 0.26748239311375116, + "grad_norm": 17.479289371987786, + "learning_rate": 4.297050689002814e-06, + "loss": 0.8648, + "step": 1880 + }, + { + "epoch": 0.2689051718005264, + "grad_norm": 43.85707088350821, + "learning_rate": 4.289022163736544e-06, + "loss": 0.9487, + "step": 1890 + }, + { + "epoch": 0.2703279504873017, + "grad_norm": 59.69136365583277, + "learning_rate": 4.280955643324969e-06, + "loss": 0.8577, + "step": 1900 + }, + { + "epoch": 0.271750729174077, + "grad_norm": 24.95179608305563, + "learning_rate": 4.272851299084365e-06, + "loss": 0.9763, + "step": 1910 + }, + { + "epoch": 0.27317350786085226, + "grad_norm": 25.905615353023105, + "learning_rate": 4.264709303134308e-06, + "loss": 0.8112, + "step": 1920 + }, + { + "epoch": 0.2745962865476275, + "grad_norm": 16.44416792587216, + "learning_rate": 4.256529828394019e-06, + "loss": 0.9131, + "step": 1930 + }, + { + "epoch": 0.2760190652344028, + "grad_norm": 21.581395452993693, + "learning_rate": 4.2483130485786924e-06, + "loss": 0.8873, + "step": 1940 + }, + { + "epoch": 0.27744184392117804, + "grad_norm": 19.647619107063836, + "learning_rate": 4.2400591381958044e-06, + "loss": 0.8884, + "step": 1950 + }, + { + "epoch": 0.2788646226079533, + "grad_norm": 23.162495970616806, + "learning_rate": 4.231768272541408e-06, + "loss": 0.8656, + "step": 1960 + }, + { + "epoch": 0.2802874012947286, + "grad_norm": 24.785879598129082, + "learning_rate": 4.22344062769641e-06, + "loss": 0.8738, + "step": 1970 + }, + { + "epoch": 0.2817101799815039, + "grad_norm": 41.50134900928378, + "learning_rate": 4.215076380522832e-06, + "loss": 0.9892, + "step": 1980 + }, + { + "epoch": 0.28313295866827914, + "grad_norm": 26.038557737763075, + "learning_rate": 4.206675708660053e-06, + "loss": 0.9443, + "step": 1990 + }, + { + "epoch": 0.2845557373550544, + "grad_norm": 22.726648702579652, + "learning_rate": 4.198238790521037e-06, + "loss": 0.9251, + "step": 2000 + }, + { + "epoch": 0.28597851604182967, + "grad_norm": 53.35902150926828, + "learning_rate": 4.189765805288547e-06, + "loss": 0.8751, + "step": 2010 + }, + { + "epoch": 0.287401294728605, + "grad_norm": 17.06572011162811, + "learning_rate": 4.181256932911333e-06, + "loss": 0.9443, + "step": 2020 + }, + { + "epoch": 0.28882407341538024, + "grad_norm": 11.916586599185443, + "learning_rate": 4.172712354100315e-06, + "loss": 0.9999, + "step": 2030 + }, + { + "epoch": 0.2902468521021555, + "grad_norm": 23.991331588369764, + "learning_rate": 4.164132250324747e-06, + "loss": 0.9772, + "step": 2040 + }, + { + "epoch": 0.29166963078893077, + "grad_norm": 12.585371928264612, + "learning_rate": 4.155516803808357e-06, + "loss": 0.91, + "step": 2050 + }, + { + "epoch": 0.29309240947570603, + "grad_norm": 28.322986319509504, + "learning_rate": 4.14686619752548e-06, + "loss": 1.0151, + "step": 2060 + }, + { + "epoch": 0.29451518816248135, + "grad_norm": 13.797656671652906, + "learning_rate": 4.138180615197173e-06, + "loss": 0.8534, + "step": 2070 + }, + { + "epoch": 0.2959379668492566, + "grad_norm": 12.278884499684143, + "learning_rate": 4.129460241287313e-06, + "loss": 0.92, + "step": 2080 + }, + { + "epoch": 0.29736074553603187, + "grad_norm": 21.325682869764425, + "learning_rate": 4.120705260998677e-06, + "loss": 0.9308, + "step": 2090 + }, + { + "epoch": 0.29878352422280713, + "grad_norm": 15.28691034387365, + "learning_rate": 4.111915860269011e-06, + "loss": 0.9071, + "step": 2100 + }, + { + "epoch": 0.3002063029095824, + "grad_norm": 17.27001412320014, + "learning_rate": 4.103092225767079e-06, + "loss": 1.0093, + "step": 2110 + }, + { + "epoch": 0.3016290815963577, + "grad_norm": 9.580279459737769, + "learning_rate": 4.094234544888702e-06, + "loss": 0.8731, + "step": 2120 + }, + { + "epoch": 0.30305186028313297, + "grad_norm": 39.44419290450929, + "learning_rate": 4.085343005752774e-06, + "loss": 1.0284, + "step": 2130 + }, + { + "epoch": 0.30447463896990823, + "grad_norm": 21.611840041612204, + "learning_rate": 4.07641779719727e-06, + "loss": 0.9289, + "step": 2140 + }, + { + "epoch": 0.3058974176566835, + "grad_norm": 18.649439161622322, + "learning_rate": 4.067459108775233e-06, + "loss": 0.8894, + "step": 2150 + }, + { + "epoch": 0.30732019634345875, + "grad_norm": 13.95416605872919, + "learning_rate": 4.058467130750751e-06, + "loss": 0.9318, + "step": 2160 + }, + { + "epoch": 0.30874297503023407, + "grad_norm": 20.547631527897213, + "learning_rate": 4.049442054094911e-06, + "loss": 0.9263, + "step": 2170 + }, + { + "epoch": 0.31016575371700933, + "grad_norm": 35.25360992586632, + "learning_rate": 4.040384070481751e-06, + "loss": 0.9331, + "step": 2180 + }, + { + "epoch": 0.3115885324037846, + "grad_norm": 19.707802730532986, + "learning_rate": 4.031293372284182e-06, + "loss": 0.9877, + "step": 2190 + }, + { + "epoch": 0.31301131109055985, + "grad_norm": 51.74288791532746, + "learning_rate": 4.022170152569908e-06, + "loss": 0.9195, + "step": 2200 + }, + { + "epoch": 0.3144340897773351, + "grad_norm": 12.795646156712829, + "learning_rate": 4.013014605097321e-06, + "loss": 0.9595, + "step": 2210 + }, + { + "epoch": 0.31585686846411043, + "grad_norm": 26.854021865817128, + "learning_rate": 4.0038269243113875e-06, + "loss": 0.8867, + "step": 2220 + }, + { + "epoch": 0.3172796471508857, + "grad_norm": 17.896171542975722, + "learning_rate": 3.994607305339522e-06, + "loss": 0.9779, + "step": 2230 + }, + { + "epoch": 0.31870242583766095, + "grad_norm": 55.11927308390862, + "learning_rate": 3.985355943987437e-06, + "loss": 0.8624, + "step": 2240 + }, + { + "epoch": 0.3201252045244362, + "grad_norm": 17.36025025212368, + "learning_rate": 3.976073036734992e-06, + "loss": 0.9457, + "step": 2250 + }, + { + "epoch": 0.3215479832112115, + "grad_norm": 23.497001094477394, + "learning_rate": 3.966758780732012e-06, + "loss": 0.8444, + "step": 2260 + }, + { + "epoch": 0.3229707618979868, + "grad_norm": 24.99517493980936, + "learning_rate": 3.957413373794106e-06, + "loss": 0.8614, + "step": 2270 + }, + { + "epoch": 0.32439354058476205, + "grad_norm": 13.13877674656499, + "learning_rate": 3.948037014398467e-06, + "loss": 0.8804, + "step": 2280 + }, + { + "epoch": 0.3258163192715373, + "grad_norm": 57.07057351332041, + "learning_rate": 3.938629901679652e-06, + "loss": 0.8595, + "step": 2290 + }, + { + "epoch": 0.3272390979583126, + "grad_norm": 23.765236030390856, + "learning_rate": 3.929192235425358e-06, + "loss": 0.9584, + "step": 2300 + }, + { + "epoch": 0.32866187664508784, + "grad_norm": 28.279346440606176, + "learning_rate": 3.919724216072173e-06, + "loss": 0.9113, + "step": 2310 + }, + { + "epoch": 0.33008465533186315, + "grad_norm": 11.409844719072586, + "learning_rate": 3.910226044701323e-06, + "loss": 0.8503, + "step": 2320 + }, + { + "epoch": 0.3315074340186384, + "grad_norm": 22.617931436775116, + "learning_rate": 3.900697923034401e-06, + "loss": 0.9235, + "step": 2330 + }, + { + "epoch": 0.3329302127054137, + "grad_norm": 15.776516228855538, + "learning_rate": 3.891140053429085e-06, + "loss": 0.8564, + "step": 2340 + }, + { + "epoch": 0.33435299139218894, + "grad_norm": 15.955596620116578, + "learning_rate": 3.881552638874835e-06, + "loss": 0.8281, + "step": 2350 + }, + { + "epoch": 0.3357757700789642, + "grad_norm": 17.290067596352323, + "learning_rate": 3.8719358829885845e-06, + "loss": 0.8976, + "step": 2360 + }, + { + "epoch": 0.3371985487657395, + "grad_norm": 11.323070820416119, + "learning_rate": 3.862289990010418e-06, + "loss": 0.7097, + "step": 2370 + }, + { + "epoch": 0.3386213274525148, + "grad_norm": 13.479043421646962, + "learning_rate": 3.852615164799233e-06, + "loss": 0.9754, + "step": 2380 + }, + { + "epoch": 0.34004410613929004, + "grad_norm": 30.713987560244302, + "learning_rate": 3.842911612828385e-06, + "loss": 0.7999, + "step": 2390 + }, + { + "epoch": 0.3414668848260653, + "grad_norm": 13.737577446750928, + "learning_rate": 3.8331795401813285e-06, + "loss": 0.7721, + "step": 2400 + }, + { + "epoch": 0.34288966351284056, + "grad_norm": 22.24678775851343, + "learning_rate": 3.823419153547238e-06, + "loss": 0.8526, + "step": 2410 + }, + { + "epoch": 0.3443124421996159, + "grad_norm": 17.04844934249826, + "learning_rate": 3.8136306602166194e-06, + "loss": 0.8595, + "step": 2420 + }, + { + "epoch": 0.34573522088639114, + "grad_norm": 25.308993688836463, + "learning_rate": 3.8038142680769063e-06, + "loss": 1.0052, + "step": 2430 + }, + { + "epoch": 0.3471579995731664, + "grad_norm": 14.393502318885858, + "learning_rate": 3.7939701856080457e-06, + "loss": 0.8819, + "step": 2440 + }, + { + "epoch": 0.34858077825994166, + "grad_norm": 10.4265150253465, + "learning_rate": 3.7840986218780685e-06, + "loss": 0.8679, + "step": 2450 + }, + { + "epoch": 0.3500035569467169, + "grad_norm": 17.551190684773992, + "learning_rate": 3.774199786538654e-06, + "loss": 0.8846, + "step": 2460 + }, + { + "epoch": 0.35142633563349224, + "grad_norm": 18.0417618263592, + "learning_rate": 3.764273889820673e-06, + "loss": 0.8088, + "step": 2470 + }, + { + "epoch": 0.3528491143202675, + "grad_norm": 46.10956168861658, + "learning_rate": 3.754321142529724e-06, + "loss": 0.8241, + "step": 2480 + }, + { + "epoch": 0.35427189300704276, + "grad_norm": 25.26864515730438, + "learning_rate": 3.744341756041655e-06, + "loss": 0.8299, + "step": 2490 + }, + { + "epoch": 0.355694671693818, + "grad_norm": 21.076814874729575, + "learning_rate": 3.7343359422980795e-06, + "loss": 0.8453, + "step": 2500 + }, + { + "epoch": 0.3571174503805933, + "grad_norm": 14.730127479771735, + "learning_rate": 3.724303913801867e-06, + "loss": 0.8901, + "step": 2510 + }, + { + "epoch": 0.35854022906736854, + "grad_norm": 19.02164497852309, + "learning_rate": 3.714245883612637e-06, + "loss": 0.7881, + "step": 2520 + }, + { + "epoch": 0.35996300775414386, + "grad_norm": 25.87504804339562, + "learning_rate": 3.7041620653422318e-06, + "loss": 1.0057, + "step": 2530 + }, + { + "epoch": 0.3613857864409191, + "grad_norm": 18.900367608478, + "learning_rate": 3.6940526731501765e-06, + "loss": 0.8415, + "step": 2540 + }, + { + "epoch": 0.3628085651276944, + "grad_norm": 13.818205544374647, + "learning_rate": 3.6839179217391376e-06, + "loss": 0.8001, + "step": 2550 + }, + { + "epoch": 0.36423134381446964, + "grad_norm": 15.922467404725849, + "learning_rate": 3.673758026350355e-06, + "loss": 0.9267, + "step": 2560 + }, + { + "epoch": 0.3656541225012449, + "grad_norm": 14.467916102674197, + "learning_rate": 3.6635732027590787e-06, + "loss": 0.8614, + "step": 2570 + }, + { + "epoch": 0.3670769011880202, + "grad_norm": 11.21170529268701, + "learning_rate": 3.6533636672699795e-06, + "loss": 0.9035, + "step": 2580 + }, + { + "epoch": 0.3684996798747955, + "grad_norm": 27.606309272195567, + "learning_rate": 3.6431296367125612e-06, + "loss": 0.7962, + "step": 2590 + }, + { + "epoch": 0.36992245856157074, + "grad_norm": 19.853262159190642, + "learning_rate": 3.632871328436548e-06, + "loss": 0.8038, + "step": 2600 + }, + { + "epoch": 0.371345237248346, + "grad_norm": 19.731745048336382, + "learning_rate": 3.622588960307275e-06, + "loss": 0.9304, + "step": 2610 + }, + { + "epoch": 0.37276801593512127, + "grad_norm": 21.55654616341265, + "learning_rate": 3.612282750701062e-06, + "loss": 0.8366, + "step": 2620 + }, + { + "epoch": 0.3741907946218966, + "grad_norm": 26.397576545609592, + "learning_rate": 3.601952918500567e-06, + "loss": 0.9263, + "step": 2630 + }, + { + "epoch": 0.37561357330867184, + "grad_norm": 25.926948460471703, + "learning_rate": 3.591599683090148e-06, + "loss": 0.9104, + "step": 2640 + }, + { + "epoch": 0.3770363519954471, + "grad_norm": 15.998065211902164, + "learning_rate": 3.5812232643511958e-06, + "loss": 0.7771, + "step": 2650 + }, + { + "epoch": 0.37845913068222237, + "grad_norm": 26.96727158385917, + "learning_rate": 3.5708238826574693e-06, + "loss": 0.9005, + "step": 2660 + }, + { + "epoch": 0.37988190936899763, + "grad_norm": 20.10325840992243, + "learning_rate": 3.560401758870412e-06, + "loss": 0.7469, + "step": 2670 + }, + { + "epoch": 0.38130468805577294, + "grad_norm": 50.12252765157174, + "learning_rate": 3.549957114334464e-06, + "loss": 0.844, + "step": 2680 + }, + { + "epoch": 0.3827274667425482, + "grad_norm": 35.46308598534144, + "learning_rate": 3.5394901708723583e-06, + "loss": 0.713, + "step": 2690 + }, + { + "epoch": 0.38415024542932347, + "grad_norm": 13.37358811877764, + "learning_rate": 3.529001150780412e-06, + "loss": 0.8835, + "step": 2700 + }, + { + "epoch": 0.38557302411609873, + "grad_norm": 67.31815559112255, + "learning_rate": 3.5184902768238036e-06, + "loss": 0.8744, + "step": 2710 + }, + { + "epoch": 0.386995802802874, + "grad_norm": 20.83885896533027, + "learning_rate": 3.5079577722318438e-06, + "loss": 0.8298, + "step": 2720 + }, + { + "epoch": 0.3884185814896493, + "grad_norm": 35.929755336977074, + "learning_rate": 3.497403860693232e-06, + "loss": 0.824, + "step": 2730 + }, + { + "epoch": 0.38984136017642457, + "grad_norm": 22.769671014939643, + "learning_rate": 3.4868287663513088e-06, + "loss": 0.8447, + "step": 2740 + }, + { + "epoch": 0.39126413886319983, + "grad_norm": 33.4314261483403, + "learning_rate": 3.4762327137992915e-06, + "loss": 0.7872, + "step": 2750 + }, + { + "epoch": 0.3926869175499751, + "grad_norm": 30.356120631372097, + "learning_rate": 3.46561592807551e-06, + "loss": 0.8708, + "step": 2760 + }, + { + "epoch": 0.39410969623675035, + "grad_norm": 13.978203051247352, + "learning_rate": 3.4549786346586212e-06, + "loss": 0.7915, + "step": 2770 + }, + { + "epoch": 0.39553247492352567, + "grad_norm": 29.198172693027075, + "learning_rate": 3.444321059462823e-06, + "loss": 0.9637, + "step": 2780 + }, + { + "epoch": 0.39695525361030093, + "grad_norm": 38.94328556809456, + "learning_rate": 3.4336434288330595e-06, + "loss": 0.7995, + "step": 2790 + }, + { + "epoch": 0.3983780322970762, + "grad_norm": 12.890373512474305, + "learning_rate": 3.4229459695402068e-06, + "loss": 0.9066, + "step": 2800 + }, + { + "epoch": 0.39980081098385145, + "grad_norm": 29.611565622555542, + "learning_rate": 3.4122289087762646e-06, + "loss": 0.8502, + "step": 2810 + }, + { + "epoch": 0.4012235896706267, + "grad_norm": 19.90868187031473, + "learning_rate": 3.4014924741495263e-06, + "loss": 0.8319, + "step": 2820 + }, + { + "epoch": 0.40264636835740203, + "grad_norm": 19.810385554932147, + "learning_rate": 3.3907368936797443e-06, + "loss": 0.8489, + "step": 2830 + }, + { + "epoch": 0.4040691470441773, + "grad_norm": 33.35123940706816, + "learning_rate": 3.3799623957932924e-06, + "loss": 0.8622, + "step": 2840 + }, + { + "epoch": 0.40549192573095255, + "grad_norm": 24.606830290642474, + "learning_rate": 3.3691692093183093e-06, + "loss": 0.8794, + "step": 2850 + }, + { + "epoch": 0.4069147044177278, + "grad_norm": 24.341434672257584, + "learning_rate": 3.358357563479841e-06, + "loss": 0.7623, + "step": 2860 + }, + { + "epoch": 0.4083374831045031, + "grad_norm": 20.93359541770851, + "learning_rate": 3.3475276878949724e-06, + "loss": 0.7811, + "step": 2870 + }, + { + "epoch": 0.4097602617912784, + "grad_norm": 11.64588748669064, + "learning_rate": 3.336679812567953e-06, + "loss": 0.8174, + "step": 2880 + }, + { + "epoch": 0.41118304047805365, + "grad_norm": 29.809570461704386, + "learning_rate": 3.3258141678853064e-06, + "loss": 0.7583, + "step": 2890 + }, + { + "epoch": 0.4126058191648289, + "grad_norm": 19.944685397539168, + "learning_rate": 3.3149309846109403e-06, + "loss": 0.8549, + "step": 2900 + }, + { + "epoch": 0.4140285978516042, + "grad_norm": 13.417267929086359, + "learning_rate": 3.3040304938812506e-06, + "loss": 0.7833, + "step": 2910 + }, + { + "epoch": 0.41545137653837944, + "grad_norm": 11.077322871687823, + "learning_rate": 3.2931129272002033e-06, + "loss": 0.7773, + "step": 2920 + }, + { + "epoch": 0.41687415522515475, + "grad_norm": 12.517933094558126, + "learning_rate": 3.2821785164344242e-06, + "loss": 0.8651, + "step": 2930 + }, + { + "epoch": 0.41829693391193, + "grad_norm": 28.71996579019108, + "learning_rate": 3.2712274938082746e-06, + "loss": 0.7801, + "step": 2940 + }, + { + "epoch": 0.4197197125987053, + "grad_norm": 16.51415564607136, + "learning_rate": 3.2602600918989126e-06, + "loss": 0.7822, + "step": 2950 + }, + { + "epoch": 0.42114249128548054, + "grad_norm": 24.564509764438558, + "learning_rate": 3.249276543631363e-06, + "loss": 0.8514, + "step": 2960 + }, + { + "epoch": 0.4225652699722558, + "grad_norm": 15.847385700324564, + "learning_rate": 3.238277082273564e-06, + "loss": 0.8609, + "step": 2970 + }, + { + "epoch": 0.4239880486590311, + "grad_norm": 38.683130093540626, + "learning_rate": 3.2272619414314167e-06, + "loss": 0.777, + "step": 2980 + }, + { + "epoch": 0.4254108273458064, + "grad_norm": 22.996043924193113, + "learning_rate": 3.2162313550438197e-06, + "loss": 0.7298, + "step": 2990 + }, + { + "epoch": 0.42683360603258164, + "grad_norm": 33.18898104375557, + "learning_rate": 3.2051855573777045e-06, + "loss": 0.846, + "step": 3000 + }, + { + "epoch": 0.4282563847193569, + "grad_norm": 30.006547705606142, + "learning_rate": 3.19412478302306e-06, + "loss": 0.8202, + "step": 3010 + }, + { + "epoch": 0.42967916340613216, + "grad_norm": 21.366134925657423, + "learning_rate": 3.1830492668879464e-06, + "loss": 0.805, + "step": 3020 + }, + { + "epoch": 0.4311019420929075, + "grad_norm": 15.471273661530745, + "learning_rate": 3.171959244193512e-06, + "loss": 0.906, + "step": 3030 + }, + { + "epoch": 0.43252472077968274, + "grad_norm": 62.7358588675245, + "learning_rate": 3.160854950468991e-06, + "loss": 0.9361, + "step": 3040 + }, + { + "epoch": 0.433947499466458, + "grad_norm": 87.71714813403881, + "learning_rate": 3.149736621546709e-06, + "loss": 0.8383, + "step": 3050 + }, + { + "epoch": 0.43537027815323326, + "grad_norm": 27.520759090414302, + "learning_rate": 3.1386044935570665e-06, + "loss": 0.876, + "step": 3060 + }, + { + "epoch": 0.4367930568400085, + "grad_norm": 13.9736259962008, + "learning_rate": 3.1274588029235297e-06, + "loss": 0.6913, + "step": 3070 + }, + { + "epoch": 0.4382158355267838, + "grad_norm": 15.111893897955833, + "learning_rate": 3.1162997863576076e-06, + "loss": 0.8171, + "step": 3080 + }, + { + "epoch": 0.4396386142135591, + "grad_norm": 25.631492477295634, + "learning_rate": 3.1051276808538238e-06, + "loss": 0.8492, + "step": 3090 + }, + { + "epoch": 0.44106139290033436, + "grad_norm": 43.326724052088984, + "learning_rate": 3.093942723684684e-06, + "loss": 0.754, + "step": 3100 + }, + { + "epoch": 0.4424841715871096, + "grad_norm": 31.65852965127963, + "learning_rate": 3.0827451523956385e-06, + "loss": 0.781, + "step": 3110 + }, + { + "epoch": 0.4439069502738849, + "grad_norm": 10.557234288549818, + "learning_rate": 3.071535204800032e-06, + "loss": 0.8802, + "step": 3120 + }, + { + "epoch": 0.44532972896066014, + "grad_norm": 7.741616062639269, + "learning_rate": 3.060313118974059e-06, + "loss": 0.9144, + "step": 3130 + }, + { + "epoch": 0.44675250764743546, + "grad_norm": 17.118385663085036, + "learning_rate": 3.049079133251706e-06, + "loss": 0.8421, + "step": 3140 + }, + { + "epoch": 0.4481752863342107, + "grad_norm": 15.048257306198911, + "learning_rate": 3.0378334862196857e-06, + "loss": 0.8138, + "step": 3150 + }, + { + "epoch": 0.449598065020986, + "grad_norm": 31.256968405353184, + "learning_rate": 3.026576416712376e-06, + "loss": 0.9385, + "step": 3160 + }, + { + "epoch": 0.45102084370776124, + "grad_norm": 11.182338189217198, + "learning_rate": 3.015308163806741e-06, + "loss": 0.791, + "step": 3170 + }, + { + "epoch": 0.4524436223945365, + "grad_norm": 17.60192918913859, + "learning_rate": 3.0040289668172616e-06, + "loss": 0.7683, + "step": 3180 + }, + { + "epoch": 0.4538664010813118, + "grad_norm": 30.05089385535281, + "learning_rate": 2.992739065290845e-06, + "loss": 0.803, + "step": 3190 + }, + { + "epoch": 0.4552891797680871, + "grad_norm": 13.204065583525782, + "learning_rate": 2.9814386990017434e-06, + "loss": 0.9288, + "step": 3200 + }, + { + "epoch": 0.45671195845486234, + "grad_norm": 13.859245888833582, + "learning_rate": 2.970128107946456e-06, + "loss": 0.8033, + "step": 3210 + }, + { + "epoch": 0.4581347371416376, + "grad_norm": 21.38437791515857, + "learning_rate": 2.958807532338637e-06, + "loss": 0.7454, + "step": 3220 + }, + { + "epoch": 0.45955751582841287, + "grad_norm": 17.25273898261663, + "learning_rate": 2.947477212603993e-06, + "loss": 0.8496, + "step": 3230 + }, + { + "epoch": 0.4609802945151882, + "grad_norm": 17.365610931311284, + "learning_rate": 2.9361373893751714e-06, + "loss": 0.7646, + "step": 3240 + }, + { + "epoch": 0.46240307320196344, + "grad_norm": 22.98057561120779, + "learning_rate": 2.92478830348666e-06, + "loss": 0.7607, + "step": 3250 + }, + { + "epoch": 0.4638258518887387, + "grad_norm": 65.67946753021502, + "learning_rate": 2.913430195969662e-06, + "loss": 0.7932, + "step": 3260 + }, + { + "epoch": 0.46524863057551397, + "grad_norm": 10.472509256267601, + "learning_rate": 2.9020633080469832e-06, + "loss": 0.6801, + "step": 3270 + }, + { + "epoch": 0.4666714092622892, + "grad_norm": 8.728102914060528, + "learning_rate": 2.890687881127907e-06, + "loss": 0.7957, + "step": 3280 + }, + { + "epoch": 0.46809418794906454, + "grad_norm": 36.73652541253815, + "learning_rate": 2.8793041568030676e-06, + "loss": 0.7548, + "step": 3290 + }, + { + "epoch": 0.4695169666358398, + "grad_norm": 15.554148968457284, + "learning_rate": 2.867912376839318e-06, + "loss": 0.74, + "step": 3300 + }, + { + "epoch": 0.47093974532261507, + "grad_norm": 22.587356388546713, + "learning_rate": 2.856512783174598e-06, + "loss": 0.8052, + "step": 3310 + }, + { + "epoch": 0.4723625240093903, + "grad_norm": 12.635529890330547, + "learning_rate": 2.845105617912793e-06, + "loss": 0.733, + "step": 3320 + }, + { + "epoch": 0.4737853026961656, + "grad_norm": 26.954609723737445, + "learning_rate": 2.8336911233185934e-06, + "loss": 0.8855, + "step": 3330 + }, + { + "epoch": 0.4752080813829409, + "grad_norm": 86.32637501223262, + "learning_rate": 2.8222695418123508e-06, + "loss": 0.8312, + "step": 3340 + }, + { + "epoch": 0.47663086006971617, + "grad_norm": 14.733465114880971, + "learning_rate": 2.810841115964928e-06, + "loss": 0.7234, + "step": 3350 + }, + { + "epoch": 0.47805363875649143, + "grad_norm": 19.34741466004809, + "learning_rate": 2.7994060884925457e-06, + "loss": 0.8293, + "step": 3360 + }, + { + "epoch": 0.4794764174432667, + "grad_norm": 7.637471653925083, + "learning_rate": 2.787964702251632e-06, + "loss": 0.803, + "step": 3370 + }, + { + "epoch": 0.48089919613004195, + "grad_norm": 29.511379406914546, + "learning_rate": 2.7765172002336605e-06, + "loss": 0.7792, + "step": 3380 + }, + { + "epoch": 0.48232197481681727, + "grad_norm": 34.4635844809523, + "learning_rate": 2.765063825559991e-06, + "loss": 0.8358, + "step": 3390 + }, + { + "epoch": 0.48374475350359253, + "grad_norm": 13.354157874294469, + "learning_rate": 2.7536048214767095e-06, + "loss": 0.8608, + "step": 3400 + }, + { + "epoch": 0.4851675321903678, + "grad_norm": 27.02117147205228, + "learning_rate": 2.742140431349454e-06, + "loss": 0.9341, + "step": 3410 + }, + { + "epoch": 0.48659031087714305, + "grad_norm": 25.28820023979218, + "learning_rate": 2.730670898658255e-06, + "loss": 0.7388, + "step": 3420 + }, + { + "epoch": 0.4880130895639183, + "grad_norm": 18.21003423824356, + "learning_rate": 2.71919646699236e-06, + "loss": 0.8631, + "step": 3430 + }, + { + "epoch": 0.48943586825069363, + "grad_norm": 21.784602372887232, + "learning_rate": 2.7077173800450586e-06, + "loss": 0.7476, + "step": 3440 + }, + { + "epoch": 0.4908586469374689, + "grad_norm": 10.68299097369915, + "learning_rate": 2.6962338816085103e-06, + "loss": 0.8056, + "step": 3450 + }, + { + "epoch": 0.49228142562424415, + "grad_norm": 28.685154064471742, + "learning_rate": 2.684746215568566e-06, + "loss": 0.782, + "step": 3460 + }, + { + "epoch": 0.4937042043110194, + "grad_norm": 8.053005322889824, + "learning_rate": 2.6732546258995875e-06, + "loss": 0.7254, + "step": 3470 + }, + { + "epoch": 0.4951269829977947, + "grad_norm": 13.318128655622132, + "learning_rate": 2.661759356659265e-06, + "loss": 0.8543, + "step": 3480 + }, + { + "epoch": 0.49654976168457, + "grad_norm": 8.05788198458803, + "learning_rate": 2.6502606519834382e-06, + "loss": 0.7091, + "step": 3490 + }, + { + "epoch": 0.49797254037134525, + "grad_norm": 12.644292212569455, + "learning_rate": 2.638758756080906e-06, + "loss": 0.8854, + "step": 3500 + }, + { + "epoch": 0.4993953190581205, + "grad_norm": 20.711604720983626, + "learning_rate": 2.6272539132282413e-06, + "loss": 0.7534, + "step": 3510 + }, + { + "epoch": 0.5008180977448958, + "grad_norm": 11.915903750704732, + "learning_rate": 2.6157463677646094e-06, + "loss": 0.799, + "step": 3520 + }, + { + "epoch": 0.502240876431671, + "grad_norm": 12.013523847402071, + "learning_rate": 2.604236364086567e-06, + "loss": 0.7497, + "step": 3530 + }, + { + "epoch": 0.5036636551184464, + "grad_norm": 29.372864262754756, + "learning_rate": 2.5927241466428822e-06, + "loss": 0.7534, + "step": 3540 + }, + { + "epoch": 0.5050864338052216, + "grad_norm": 11.132378268449663, + "learning_rate": 2.581209959929339e-06, + "loss": 0.8214, + "step": 3550 + }, + { + "epoch": 0.5065092124919969, + "grad_norm": 60.09517726214323, + "learning_rate": 2.5696940484835408e-06, + "loss": 0.663, + "step": 3560 + }, + { + "epoch": 0.5079319911787722, + "grad_norm": 27.6512864848014, + "learning_rate": 2.5581766568797254e-06, + "loss": 0.6626, + "step": 3570 + }, + { + "epoch": 0.5093547698655474, + "grad_norm": 63.10018193684602, + "learning_rate": 2.546658029723565e-06, + "loss": 0.8257, + "step": 3580 + }, + { + "epoch": 0.5107775485523227, + "grad_norm": 27.616273418999626, + "learning_rate": 2.5351384116469703e-06, + "loss": 0.7819, + "step": 3590 + }, + { + "epoch": 0.5122003272390979, + "grad_norm": 13.470993574295683, + "learning_rate": 2.5236180473028983e-06, + "loss": 0.7722, + "step": 3600 + }, + { + "epoch": 0.5136231059258732, + "grad_norm": 14.99732405273878, + "learning_rate": 2.5120971813601553e-06, + "loss": 0.7715, + "step": 3610 + }, + { + "epoch": 0.5150458846126486, + "grad_norm": 9.656051019345648, + "learning_rate": 2.5005760584982015e-06, + "loss": 0.6556, + "step": 3620 + }, + { + "epoch": 0.5164686632994238, + "grad_norm": 13.74072835052684, + "learning_rate": 2.489054923401952e-06, + "loss": 0.8078, + "step": 3630 + }, + { + "epoch": 0.5178914419861991, + "grad_norm": 25.080939371100442, + "learning_rate": 2.477534020756581e-06, + "loss": 0.7669, + "step": 3640 + }, + { + "epoch": 0.5193142206729743, + "grad_norm": 12.0826969622057, + "learning_rate": 2.46601359524233e-06, + "loss": 0.7708, + "step": 3650 + }, + { + "epoch": 0.5207369993597496, + "grad_norm": 19.16440576327706, + "learning_rate": 2.454493891529302e-06, + "loss": 0.7616, + "step": 3660 + }, + { + "epoch": 0.5221597780465249, + "grad_norm": 7.519923215474658, + "learning_rate": 2.442975154272273e-06, + "loss": 0.8015, + "step": 3670 + }, + { + "epoch": 0.5235825567333001, + "grad_norm": 12.175173720340789, + "learning_rate": 2.4314576281054954e-06, + "loss": 0.7262, + "step": 3680 + }, + { + "epoch": 0.5250053354200754, + "grad_norm": 22.52963523965808, + "learning_rate": 2.419941557637496e-06, + "loss": 0.7193, + "step": 3690 + }, + { + "epoch": 0.5264281141068506, + "grad_norm": 13.643765438313824, + "learning_rate": 2.4084271874458885e-06, + "loss": 0.7809, + "step": 3700 + }, + { + "epoch": 0.527850892793626, + "grad_norm": 20.518020446748704, + "learning_rate": 2.396914762072177e-06, + "loss": 0.7998, + "step": 3710 + }, + { + "epoch": 0.5292736714804013, + "grad_norm": 15.048773713943275, + "learning_rate": 2.3854045260165605e-06, + "loss": 0.776, + "step": 3720 + }, + { + "epoch": 0.5306964501671765, + "grad_norm": 16.141296496771535, + "learning_rate": 2.373896723732739e-06, + "loss": 0.7137, + "step": 3730 + }, + { + "epoch": 0.5321192288539518, + "grad_norm": 14.4660866817992, + "learning_rate": 2.3623915996227278e-06, + "loss": 0.7506, + "step": 3740 + }, + { + "epoch": 0.533542007540727, + "grad_norm": 17.017665454072137, + "learning_rate": 2.350889398031664e-06, + "loss": 0.7292, + "step": 3750 + }, + { + "epoch": 0.5349647862275023, + "grad_norm": 15.28793249418399, + "learning_rate": 2.339390363242611e-06, + "loss": 0.6674, + "step": 3760 + }, + { + "epoch": 0.5363875649142776, + "grad_norm": 12.014033119525148, + "learning_rate": 2.327894739471381e-06, + "loss": 0.6896, + "step": 3770 + }, + { + "epoch": 0.5378103436010528, + "grad_norm": 13.815575430114484, + "learning_rate": 2.3164027708613424e-06, + "loss": 0.7688, + "step": 3780 + }, + { + "epoch": 0.5392331222878282, + "grad_norm": 18.681441273504802, + "learning_rate": 2.3049147014782323e-06, + "loss": 0.6952, + "step": 3790 + }, + { + "epoch": 0.5406559009746034, + "grad_norm": 9.652036109574787, + "learning_rate": 2.2934307753049796e-06, + "loss": 0.8163, + "step": 3800 + }, + { + "epoch": 0.5420786796613787, + "grad_norm": 21.08779764453933, + "learning_rate": 2.28195123623652e-06, + "loss": 0.8639, + "step": 3810 + }, + { + "epoch": 0.543501458348154, + "grad_norm": 14.423681782810991, + "learning_rate": 2.270476328074615e-06, + "loss": 0.8514, + "step": 3820 + }, + { + "epoch": 0.5449242370349292, + "grad_norm": 13.108333913672809, + "learning_rate": 2.2590062945226728e-06, + "loss": 0.8109, + "step": 3830 + }, + { + "epoch": 0.5463470157217045, + "grad_norm": 24.62140487582271, + "learning_rate": 2.2475413791805793e-06, + "loss": 0.7165, + "step": 3840 + }, + { + "epoch": 0.5477697944084797, + "grad_norm": 14.631735750619352, + "learning_rate": 2.236081825539518e-06, + "loss": 0.841, + "step": 3850 + }, + { + "epoch": 0.549192573095255, + "grad_norm": 28.771238476821182, + "learning_rate": 2.2246278769768007e-06, + "loss": 0.8079, + "step": 3860 + }, + { + "epoch": 0.5506153517820304, + "grad_norm": 11.04153065779966, + "learning_rate": 2.2131797767507e-06, + "loss": 0.6889, + "step": 3870 + }, + { + "epoch": 0.5520381304688056, + "grad_norm": 15.667877055627352, + "learning_rate": 2.2017377679952797e-06, + "loss": 0.8782, + "step": 3880 + }, + { + "epoch": 0.5534609091555809, + "grad_norm": 17.915285579619628, + "learning_rate": 2.190302093715237e-06, + "loss": 0.6891, + "step": 3890 + }, + { + "epoch": 0.5548836878423561, + "grad_norm": 8.177091476378212, + "learning_rate": 2.1788729967807343e-06, + "loss": 0.6736, + "step": 3900 + }, + { + "epoch": 0.5563064665291314, + "grad_norm": 14.742743308283796, + "learning_rate": 2.1674507199222457e-06, + "loss": 0.7508, + "step": 3910 + }, + { + "epoch": 0.5577292452159066, + "grad_norm": 11.775787837102852, + "learning_rate": 2.156035505725403e-06, + "loss": 0.7598, + "step": 3920 + }, + { + "epoch": 0.5591520239026819, + "grad_norm": 13.551003247447502, + "learning_rate": 2.1446275966258387e-06, + "loss": 0.7771, + "step": 3930 + }, + { + "epoch": 0.5605748025894572, + "grad_norm": 18.121404299267205, + "learning_rate": 2.133227234904041e-06, + "loss": 0.7978, + "step": 3940 + }, + { + "epoch": 0.5619975812762324, + "grad_norm": 15.58681745499423, + "learning_rate": 2.1218346626802093e-06, + "loss": 0.685, + "step": 3950 + }, + { + "epoch": 0.5634203599630078, + "grad_norm": 22.175071520706346, + "learning_rate": 2.110450121909107e-06, + "loss": 0.752, + "step": 3960 + }, + { + "epoch": 0.564843138649783, + "grad_norm": 15.293504173125388, + "learning_rate": 2.0990738543749275e-06, + "loss": 0.7029, + "step": 3970 + }, + { + "epoch": 0.5662659173365583, + "grad_norm": 8.048785808139954, + "learning_rate": 2.087706101686159e-06, + "loss": 0.816, + "step": 3980 + }, + { + "epoch": 0.5676886960233336, + "grad_norm": 23.32897951386727, + "learning_rate": 2.07634710527045e-06, + "loss": 0.7524, + "step": 3990 + }, + { + "epoch": 0.5691114747101088, + "grad_norm": 11.411010901933299, + "learning_rate": 2.0649971063694834e-06, + "loss": 0.7571, + "step": 4000 + }, + { + "epoch": 0.5705342533968841, + "grad_norm": 23.62461099105402, + "learning_rate": 2.053656346033856e-06, + "loss": 0.7853, + "step": 4010 + }, + { + "epoch": 0.5719570320836593, + "grad_norm": 14.060482033029814, + "learning_rate": 2.0423250651179545e-06, + "loss": 0.7517, + "step": 4020 + }, + { + "epoch": 0.5733798107704347, + "grad_norm": 20.428898918513838, + "learning_rate": 2.0310035042748416e-06, + "loss": 0.7691, + "step": 4030 + }, + { + "epoch": 0.57480258945721, + "grad_norm": 15.764443220749541, + "learning_rate": 2.019691903951148e-06, + "loss": 0.7349, + "step": 4040 + }, + { + "epoch": 0.5762253681439852, + "grad_norm": 17.47815964928467, + "learning_rate": 2.0083905043819623e-06, + "loss": 0.7858, + "step": 4050 + }, + { + "epoch": 0.5776481468307605, + "grad_norm": 66.34722872295633, + "learning_rate": 1.997099545585729e-06, + "loss": 0.8182, + "step": 4060 + }, + { + "epoch": 0.5790709255175357, + "grad_norm": 24.5569265136684, + "learning_rate": 1.9858192673591536e-06, + "loss": 0.7509, + "step": 4070 + }, + { + "epoch": 0.580493704204311, + "grad_norm": 20.773532557907654, + "learning_rate": 1.974549909272108e-06, + "loss": 0.71, + "step": 4080 + }, + { + "epoch": 0.5819164828910863, + "grad_norm": 22.26196471143872, + "learning_rate": 1.9632917106625434e-06, + "loss": 0.646, + "step": 4090 + }, + { + "epoch": 0.5833392615778615, + "grad_norm": 24.53608217109384, + "learning_rate": 1.952044910631405e-06, + "loss": 0.799, + "step": 4100 + }, + { + "epoch": 0.5847620402646369, + "grad_norm": 39.53398159892062, + "learning_rate": 1.9408097480375557e-06, + "loss": 0.7533, + "step": 4110 + }, + { + "epoch": 0.5861848189514121, + "grad_norm": 18.337674444590398, + "learning_rate": 1.929586461492707e-06, + "loss": 0.6658, + "step": 4120 + }, + { + "epoch": 0.5876075976381874, + "grad_norm": 39.29566731972662, + "learning_rate": 1.918375289356342e-06, + "loss": 0.8095, + "step": 4130 + }, + { + "epoch": 0.5890303763249627, + "grad_norm": 21.6126934252539, + "learning_rate": 1.9071764697306619e-06, + "loss": 0.6963, + "step": 4140 + }, + { + "epoch": 0.5904531550117379, + "grad_norm": 17.062659972993746, + "learning_rate": 1.895990240455527e-06, + "loss": 0.7375, + "step": 4150 + }, + { + "epoch": 0.5918759336985132, + "grad_norm": 25.470882444505943, + "learning_rate": 1.8848168391034021e-06, + "loss": 0.7067, + "step": 4160 + }, + { + "epoch": 0.5932987123852884, + "grad_norm": 14.034661319660938, + "learning_rate": 1.8736565029743138e-06, + "loss": 0.6564, + "step": 4170 + }, + { + "epoch": 0.5947214910720637, + "grad_norm": 21.67807376393738, + "learning_rate": 1.8625094690908118e-06, + "loss": 0.8221, + "step": 4180 + }, + { + "epoch": 0.596144269758839, + "grad_norm": 27.935245493154525, + "learning_rate": 1.8513759741929321e-06, + "loss": 0.7042, + "step": 4190 + }, + { + "epoch": 0.5975670484456143, + "grad_norm": 17.1910089435646, + "learning_rate": 1.8402562547331704e-06, + "loss": 0.7366, + "step": 4200 + }, + { + "epoch": 0.5989898271323896, + "grad_norm": 84.46253693971576, + "learning_rate": 1.8291505468714615e-06, + "loss": 0.7689, + "step": 4210 + }, + { + "epoch": 0.6004126058191648, + "grad_norm": 15.71473456214827, + "learning_rate": 1.8180590864701628e-06, + "loss": 0.8016, + "step": 4220 + }, + { + "epoch": 0.6018353845059401, + "grad_norm": 10.447588979119375, + "learning_rate": 1.8069821090890427e-06, + "loss": 0.7882, + "step": 4230 + }, + { + "epoch": 0.6032581631927154, + "grad_norm": 56.25796085812837, + "learning_rate": 1.7959198499802832e-06, + "loss": 0.6945, + "step": 4240 + }, + { + "epoch": 0.6046809418794906, + "grad_norm": 6.285689054380614, + "learning_rate": 1.7848725440834783e-06, + "loss": 0.8445, + "step": 4250 + }, + { + "epoch": 0.6061037205662659, + "grad_norm": 58.716542467523524, + "learning_rate": 1.7738404260206487e-06, + "loss": 0.7795, + "step": 4260 + }, + { + "epoch": 0.6075264992530411, + "grad_norm": 16.08617915947653, + "learning_rate": 1.7628237300912542e-06, + "loss": 0.6113, + "step": 4270 + }, + { + "epoch": 0.6089492779398165, + "grad_norm": 14.376354723346575, + "learning_rate": 1.7518226902672214e-06, + "loss": 0.827, + "step": 4280 + }, + { + "epoch": 0.6103720566265918, + "grad_norm": 17.953652516839444, + "learning_rate": 1.7408375401879748e-06, + "loss": 0.7564, + "step": 4290 + }, + { + "epoch": 0.611794835313367, + "grad_norm": 21.00900096740313, + "learning_rate": 1.7298685131554716e-06, + "loss": 0.7692, + "step": 4300 + }, + { + "epoch": 0.6132176140001423, + "grad_norm": 18.309612056889513, + "learning_rate": 1.7189158421292484e-06, + "loss": 0.6907, + "step": 4310 + }, + { + "epoch": 0.6146403926869175, + "grad_norm": 16.619415766247535, + "learning_rate": 1.707979759721476e-06, + "loss": 0.6678, + "step": 4320 + }, + { + "epoch": 0.6160631713736928, + "grad_norm": 19.779361226835274, + "learning_rate": 1.6970604981920174e-06, + "loss": 0.7107, + "step": 4330 + }, + { + "epoch": 0.6174859500604681, + "grad_norm": 19.91886633881263, + "learning_rate": 1.6861582894434902e-06, + "loss": 0.713, + "step": 4340 + }, + { + "epoch": 0.6189087287472433, + "grad_norm": 130.5283349642206, + "learning_rate": 1.675273365016351e-06, + "loss": 0.6687, + "step": 4350 + }, + { + "epoch": 0.6203315074340187, + "grad_norm": 12.030263554177433, + "learning_rate": 1.6644059560839726e-06, + "loss": 0.717, + "step": 4360 + }, + { + "epoch": 0.6217542861207939, + "grad_norm": 13.146734464615218, + "learning_rate": 1.6535562934477307e-06, + "loss": 0.721, + "step": 4370 + }, + { + "epoch": 0.6231770648075692, + "grad_norm": 13.276782675467445, + "learning_rate": 1.6427246075321099e-06, + "loss": 0.8076, + "step": 4380 + }, + { + "epoch": 0.6245998434943445, + "grad_norm": 40.769061214515816, + "learning_rate": 1.6319111283798078e-06, + "loss": 0.6972, + "step": 4390 + }, + { + "epoch": 0.6260226221811197, + "grad_norm": 12.294066061763774, + "learning_rate": 1.6211160856468416e-06, + "loss": 0.6645, + "step": 4400 + }, + { + "epoch": 0.627445400867895, + "grad_norm": 14.365616243961242, + "learning_rate": 1.6103397085976824e-06, + "loss": 0.7766, + "step": 4410 + }, + { + "epoch": 0.6288681795546702, + "grad_norm": 25.766483277820093, + "learning_rate": 1.59958222610038e-06, + "loss": 0.7166, + "step": 4420 + }, + { + "epoch": 0.6302909582414455, + "grad_norm": 11.836958911544688, + "learning_rate": 1.5888438666217012e-06, + "loss": 0.7651, + "step": 4430 + }, + { + "epoch": 0.6317137369282209, + "grad_norm": 13.9380916036791, + "learning_rate": 1.5781248582222787e-06, + "loss": 0.68, + "step": 4440 + }, + { + "epoch": 0.6331365156149961, + "grad_norm": 18.812370433142473, + "learning_rate": 1.5674254285517712e-06, + "loss": 0.7157, + "step": 4450 + }, + { + "epoch": 0.6345592943017714, + "grad_norm": 29.237875217787078, + "learning_rate": 1.5567458048440232e-06, + "loss": 0.6739, + "step": 4460 + }, + { + "epoch": 0.6359820729885466, + "grad_norm": 10.488421538243566, + "learning_rate": 1.54608621391224e-06, + "loss": 0.7193, + "step": 4470 + }, + { + "epoch": 0.6374048516753219, + "grad_norm": 11.137577579014978, + "learning_rate": 1.5354468821441762e-06, + "loss": 0.7398, + "step": 4480 + }, + { + "epoch": 0.6388276303620972, + "grad_norm": 14.916947670390048, + "learning_rate": 1.5248280354973195e-06, + "loss": 0.8009, + "step": 4490 + }, + { + "epoch": 0.6402504090488724, + "grad_norm": 9.492480685750538, + "learning_rate": 1.514229899494099e-06, + "loss": 0.6761, + "step": 4500 + }, + { + "epoch": 0.6416731877356477, + "grad_norm": 17.176525722076306, + "learning_rate": 1.50365269921709e-06, + "loss": 0.6576, + "step": 4510 + }, + { + "epoch": 0.643095966422423, + "grad_norm": 9.346985316225243, + "learning_rate": 1.4930966593042362e-06, + "loss": 0.8006, + "step": 4520 + }, + { + "epoch": 0.6445187451091983, + "grad_norm": 14.91630735957912, + "learning_rate": 1.4825620039440812e-06, + "loss": 0.8409, + "step": 4530 + }, + { + "epoch": 0.6459415237959736, + "grad_norm": 11.473298576346068, + "learning_rate": 1.4720489568710016e-06, + "loss": 0.7263, + "step": 4540 + }, + { + "epoch": 0.6473643024827488, + "grad_norm": 15.939839366948728, + "learning_rate": 1.4615577413604604e-06, + "loss": 0.6488, + "step": 4550 + }, + { + "epoch": 0.6487870811695241, + "grad_norm": 16.191145799664508, + "learning_rate": 1.4510885802242637e-06, + "loss": 0.7685, + "step": 4560 + }, + { + "epoch": 0.6502098598562993, + "grad_norm": 18.698577322891694, + "learning_rate": 1.440641695805825e-06, + "loss": 0.668, + "step": 4570 + }, + { + "epoch": 0.6516326385430746, + "grad_norm": 13.016791806898487, + "learning_rate": 1.4302173099754506e-06, + "loss": 0.7125, + "step": 4580 + }, + { + "epoch": 0.6530554172298499, + "grad_norm": 40.70752668906216, + "learning_rate": 1.4198156441256194e-06, + "loss": 0.7818, + "step": 4590 + }, + { + "epoch": 0.6544781959166251, + "grad_norm": 7.281754729573183, + "learning_rate": 1.4094369191662876e-06, + "loss": 0.7283, + "step": 4600 + }, + { + "epoch": 0.6559009746034005, + "grad_norm": 21.2940004124432, + "learning_rate": 1.3990813555201916e-06, + "loss": 0.7082, + "step": 4610 + }, + { + "epoch": 0.6573237532901757, + "grad_norm": 27.176553938994964, + "learning_rate": 1.38874917311817e-06, + "loss": 0.6501, + "step": 4620 + }, + { + "epoch": 0.658746531976951, + "grad_norm": 13.520934938840828, + "learning_rate": 1.3784405913944948e-06, + "loss": 0.7522, + "step": 4630 + }, + { + "epoch": 0.6601693106637263, + "grad_norm": 27.795368348387175, + "learning_rate": 1.368155829282204e-06, + "loss": 0.6796, + "step": 4640 + }, + { + "epoch": 0.6615920893505015, + "grad_norm": 13.139909160482343, + "learning_rate": 1.3578951052084566e-06, + "loss": 0.6848, + "step": 4650 + }, + { + "epoch": 0.6630148680372768, + "grad_norm": 20.60306083940586, + "learning_rate": 1.347658637089896e-06, + "loss": 0.7656, + "step": 4660 + }, + { + "epoch": 0.664437646724052, + "grad_norm": 14.25958816780152, + "learning_rate": 1.3374466423280185e-06, + "loss": 0.6432, + "step": 4670 + }, + { + "epoch": 0.6658604254108274, + "grad_norm": 19.038734308507603, + "learning_rate": 1.3272593378045534e-06, + "loss": 0.8042, + "step": 4680 + }, + { + "epoch": 0.6672832040976027, + "grad_norm": 29.21367724067394, + "learning_rate": 1.3170969398768649e-06, + "loss": 0.6578, + "step": 4690 + }, + { + "epoch": 0.6687059827843779, + "grad_norm": 533.8160870471642, + "learning_rate": 1.306959664373349e-06, + "loss": 0.7387, + "step": 4700 + }, + { + "epoch": 0.6701287614711532, + "grad_norm": 14.771200737749313, + "learning_rate": 1.296847726588853e-06, + "loss": 0.6526, + "step": 4710 + }, + { + "epoch": 0.6715515401579284, + "grad_norm": 16.43963547785031, + "learning_rate": 1.2867613412801055e-06, + "loss": 0.7479, + "step": 4720 + }, + { + "epoch": 0.6729743188447037, + "grad_norm": 12.308142199707067, + "learning_rate": 1.2767007226611533e-06, + "loss": 0.7722, + "step": 4730 + }, + { + "epoch": 0.674397097531479, + "grad_norm": 21.747552443599744, + "learning_rate": 1.2666660843988088e-06, + "loss": 0.7633, + "step": 4740 + }, + { + "epoch": 0.6758198762182542, + "grad_norm": 9.250080675840156, + "learning_rate": 1.2566576396081188e-06, + "loss": 0.6876, + "step": 4750 + }, + { + "epoch": 0.6772426549050296, + "grad_norm": 17.884494630302683, + "learning_rate": 1.2466756008478303e-06, + "loss": 0.6994, + "step": 4760 + }, + { + "epoch": 0.6786654335918048, + "grad_norm": 9.122887241170309, + "learning_rate": 1.236720180115886e-06, + "loss": 0.6575, + "step": 4770 + }, + { + "epoch": 0.6800882122785801, + "grad_norm": 7.956747557348674, + "learning_rate": 1.2267915888449114e-06, + "loss": 0.7269, + "step": 4780 + }, + { + "epoch": 0.6815109909653554, + "grad_norm": 20.631525051921074, + "learning_rate": 1.216890037897733e-06, + "loss": 0.75, + "step": 4790 + }, + { + "epoch": 0.6829337696521306, + "grad_norm": 35.036205257844856, + "learning_rate": 1.207015737562896e-06, + "loss": 0.7037, + "step": 4800 + }, + { + "epoch": 0.6843565483389059, + "grad_norm": 19.008308176713022, + "learning_rate": 1.1971688975501971e-06, + "loss": 0.7218, + "step": 4810 + }, + { + "epoch": 0.6857793270256811, + "grad_norm": 7.128767920750745, + "learning_rate": 1.1873497269862327e-06, + "loss": 0.6508, + "step": 4820 + }, + { + "epoch": 0.6872021057124564, + "grad_norm": 9.501177883754682, + "learning_rate": 1.1775584344099586e-06, + "loss": 0.7482, + "step": 4830 + }, + { + "epoch": 0.6886248843992318, + "grad_norm": 14.480692474454747, + "learning_rate": 1.1677952277682597e-06, + "loss": 0.6636, + "step": 4840 + }, + { + "epoch": 0.690047663086007, + "grad_norm": 8.260427118275636, + "learning_rate": 1.1580603144115301e-06, + "loss": 0.7237, + "step": 4850 + }, + { + "epoch": 0.6914704417727823, + "grad_norm": 17.5013994162, + "learning_rate": 1.1483539010892777e-06, + "loss": 0.6309, + "step": 4860 + }, + { + "epoch": 0.6928932204595575, + "grad_norm": 13.22124517933893, + "learning_rate": 1.1386761939457247e-06, + "loss": 0.6407, + "step": 4870 + }, + { + "epoch": 0.6943159991463328, + "grad_norm": 25.784036778389154, + "learning_rate": 1.1290273985154333e-06, + "loss": 0.7658, + "step": 4880 + }, + { + "epoch": 0.6957387778331081, + "grad_norm": 31.750375019656428, + "learning_rate": 1.1194077197189427e-06, + "loss": 0.6312, + "step": 4890 + }, + { + "epoch": 0.6971615565198833, + "grad_norm": 13.27769247274, + "learning_rate": 1.1098173618584147e-06, + "loss": 0.656, + "step": 4900 + }, + { + "epoch": 0.6985843352066586, + "grad_norm": 24.745992727476718, + "learning_rate": 1.1002565286132914e-06, + "loss": 0.6578, + "step": 4910 + }, + { + "epoch": 0.7000071138934338, + "grad_norm": 6.572338606115534, + "learning_rate": 1.0907254230359777e-06, + "loss": 0.6173, + "step": 4920 + }, + { + "epoch": 0.7014298925802092, + "grad_norm": 21.375498414332217, + "learning_rate": 1.0812242475475192e-06, + "loss": 0.6443, + "step": 4930 + }, + { + "epoch": 0.7028526712669845, + "grad_norm": 18.135153621580898, + "learning_rate": 1.071753203933313e-06, + "loss": 0.733, + "step": 4940 + }, + { + "epoch": 0.7042754499537597, + "grad_norm": 14.042627759924816, + "learning_rate": 1.0623124933388126e-06, + "loss": 0.6542, + "step": 4950 + }, + { + "epoch": 0.705698228640535, + "grad_norm": 30.16915992497821, + "learning_rate": 1.0529023162652635e-06, + "loss": 0.7769, + "step": 4960 + }, + { + "epoch": 0.7071210073273102, + "grad_norm": 14.388444772922258, + "learning_rate": 1.0435228725654426e-06, + "loss": 0.6462, + "step": 4970 + }, + { + "epoch": 0.7085437860140855, + "grad_norm": 18.03414525268103, + "learning_rate": 1.0341743614394105e-06, + "loss": 0.729, + "step": 4980 + }, + { + "epoch": 0.7099665647008608, + "grad_norm": 13.85227892598916, + "learning_rate": 1.0248569814302864e-06, + "loss": 0.6415, + "step": 4990 + }, + { + "epoch": 0.711389343387636, + "grad_norm": 15.215365219685497, + "learning_rate": 1.015570930420027e-06, + "loss": 0.6695, + "step": 5000 + }, + { + "epoch": 0.7128121220744114, + "grad_norm": 97.40345906478952, + "learning_rate": 1.0063164056252254e-06, + "loss": 0.6863, + "step": 5010 + }, + { + "epoch": 0.7142349007611866, + "grad_norm": 16.79920759950203, + "learning_rate": 9.97093603592924e-07, + "loss": 0.6956, + "step": 5020 + }, + { + "epoch": 0.7156576794479619, + "grad_norm": 14.27314471502695, + "learning_rate": 9.879027201964392e-07, + "loss": 0.7643, + "step": 5030 + }, + { + "epoch": 0.7170804581347371, + "grad_norm": 34.06257052430994, + "learning_rate": 9.787439506312016e-07, + "loss": 0.6817, + "step": 5040 + }, + { + "epoch": 0.7185032368215124, + "grad_norm": 44.832955767998286, + "learning_rate": 9.696174894106081e-07, + "loss": 0.7809, + "step": 5050 + }, + { + "epoch": 0.7199260155082877, + "grad_norm": 16.245080415300535, + "learning_rate": 9.605235303618935e-07, + "loss": 0.727, + "step": 5060 + }, + { + "epoch": 0.7213487941950629, + "grad_norm": 14.111285705311632, + "learning_rate": 9.514622666220158e-07, + "loss": 0.6188, + "step": 5070 + }, + { + "epoch": 0.7227715728818382, + "grad_norm": 15.162572043299535, + "learning_rate": 9.424338906335486e-07, + "loss": 0.7122, + "step": 5080 + }, + { + "epoch": 0.7241943515686134, + "grad_norm": 27.668155036946754, + "learning_rate": 9.334385941406002e-07, + "loss": 0.7043, + "step": 5090 + }, + { + "epoch": 0.7256171302553888, + "grad_norm": 11.64701587775779, + "learning_rate": 9.244765681847379e-07, + "loss": 0.6523, + "step": 5100 + }, + { + "epoch": 0.7270399089421641, + "grad_norm": 17.376356821899527, + "learning_rate": 9.155480031009304e-07, + "loss": 0.6972, + "step": 5110 + }, + { + "epoch": 0.7284626876289393, + "grad_norm": 8.690171383763024, + "learning_rate": 9.066530885135055e-07, + "loss": 0.7344, + "step": 5120 + }, + { + "epoch": 0.7298854663157146, + "grad_norm": 22.08471670413809, + "learning_rate": 8.977920133321266e-07, + "loss": 0.7304, + "step": 5130 + }, + { + "epoch": 0.7313082450024898, + "grad_norm": 51.96286186100988, + "learning_rate": 8.889649657477773e-07, + "loss": 0.6957, + "step": 5140 + }, + { + "epoch": 0.7327310236892651, + "grad_norm": 14.379879755090343, + "learning_rate": 8.801721332287619e-07, + "loss": 0.7775, + "step": 5150 + }, + { + "epoch": 0.7341538023760404, + "grad_norm": 26.855334316137547, + "learning_rate": 8.714137025167319e-07, + "loss": 0.714, + "step": 5160 + }, + { + "epoch": 0.7355765810628156, + "grad_norm": 16.00017490250412, + "learning_rate": 8.626898596227118e-07, + "loss": 0.652, + "step": 5170 + }, + { + "epoch": 0.736999359749591, + "grad_norm": 8.432034143264096, + "learning_rate": 8.540007898231525e-07, + "loss": 0.8362, + "step": 5180 + }, + { + "epoch": 0.7384221384363662, + "grad_norm": 22.25669162158987, + "learning_rate": 8.453466776559974e-07, + "loss": 0.6658, + "step": 5190 + }, + { + "epoch": 0.7398449171231415, + "grad_norm": 8.791913538046265, + "learning_rate": 8.367277069167612e-07, + "loss": 0.7508, + "step": 5200 + }, + { + "epoch": 0.7412676958099168, + "grad_norm": 19.57210800521164, + "learning_rate": 8.281440606546278e-07, + "loss": 0.7802, + "step": 5210 + }, + { + "epoch": 0.742690474496692, + "grad_norm": 11.354804669570045, + "learning_rate": 8.195959211685606e-07, + "loss": 0.6344, + "step": 5220 + }, + { + "epoch": 0.7441132531834673, + "grad_norm": 15.17145858767825, + "learning_rate": 8.110834700034315e-07, + "loss": 0.7363, + "step": 5230 + }, + { + "epoch": 0.7455360318702425, + "grad_norm": 29.579246308700657, + "learning_rate": 8.026068879461693e-07, + "loss": 0.6516, + "step": 5240 + }, + { + "epoch": 0.7469588105570178, + "grad_norm": 18.39668533555811, + "learning_rate": 7.941663550219134e-07, + "loss": 0.6841, + "step": 5250 + }, + { + "epoch": 0.7483815892437932, + "grad_norm": 22.82729737786706, + "learning_rate": 7.857620504901953e-07, + "loss": 0.68, + "step": 5260 + }, + { + "epoch": 0.7498043679305684, + "grad_norm": 6.6003652423985155, + "learning_rate": 7.773941528411316e-07, + "loss": 0.655, + "step": 5270 + }, + { + "epoch": 0.7512271466173437, + "grad_norm": 22.211334230492756, + "learning_rate": 7.690628397916291e-07, + "loss": 0.7384, + "step": 5280 + }, + { + "epoch": 0.7526499253041189, + "grad_norm": 12.074461866500236, + "learning_rate": 7.607682882816136e-07, + "loss": 0.6816, + "step": 5290 + }, + { + "epoch": 0.7540727039908942, + "grad_norm": 16.231173521043722, + "learning_rate": 7.525106744702725e-07, + "loss": 0.6975, + "step": 5300 + }, + { + "epoch": 0.7554954826776695, + "grad_norm": 31.721613985668142, + "learning_rate": 7.442901737323133e-07, + "loss": 0.681, + "step": 5310 + }, + { + "epoch": 0.7569182613644447, + "grad_norm": 17.681404615301002, + "learning_rate": 7.361069606542354e-07, + "loss": 0.6685, + "step": 5320 + }, + { + "epoch": 0.75834104005122, + "grad_norm": 9.124774910267016, + "learning_rate": 7.279612090306285e-07, + "loss": 0.5928, + "step": 5330 + }, + { + "epoch": 0.7597638187379953, + "grad_norm": 30.734994130943175, + "learning_rate": 7.198530918604754e-07, + "loss": 0.709, + "step": 5340 + }, + { + "epoch": 0.7611865974247706, + "grad_norm": 11.112472348569415, + "learning_rate": 7.117827813434813e-07, + "loss": 0.6674, + "step": 5350 + }, + { + "epoch": 0.7626093761115459, + "grad_norm": 16.480693812459887, + "learning_rate": 7.037504488764174e-07, + "loss": 0.6491, + "step": 5360 + }, + { + "epoch": 0.7640321547983211, + "grad_norm": 12.34032417959078, + "learning_rate": 6.957562650494781e-07, + "loss": 0.616, + "step": 5370 + }, + { + "epoch": 0.7654549334850964, + "grad_norm": 12.791383068851031, + "learning_rate": 6.878003996426605e-07, + "loss": 0.7438, + "step": 5380 + }, + { + "epoch": 0.7668777121718716, + "grad_norm": 37.40607230940538, + "learning_rate": 6.798830216221552e-07, + "loss": 0.6666, + "step": 5390 + }, + { + "epoch": 0.7683004908586469, + "grad_norm": 23.140755286118655, + "learning_rate": 6.720042991367614e-07, + "loss": 0.7311, + "step": 5400 + }, + { + "epoch": 0.7697232695454223, + "grad_norm": 20.51227690551173, + "learning_rate": 6.641643995143152e-07, + "loss": 0.7196, + "step": 5410 + }, + { + "epoch": 0.7711460482321975, + "grad_norm": 11.565002812640744, + "learning_rate": 6.563634892581324e-07, + "loss": 0.6755, + "step": 5420 + }, + { + "epoch": 0.7725688269189728, + "grad_norm": 20.44574240987166, + "learning_rate": 6.486017340434777e-07, + "loss": 0.6708, + "step": 5430 + }, + { + "epoch": 0.773991605605748, + "grad_norm": 23.82537630461409, + "learning_rate": 6.408792987140433e-07, + "loss": 0.6907, + "step": 5440 + }, + { + "epoch": 0.7754143842925233, + "grad_norm": 24.844791674029324, + "learning_rate": 6.331963472784458e-07, + "loss": 0.6266, + "step": 5450 + }, + { + "epoch": 0.7768371629792986, + "grad_norm": 34.51982306880169, + "learning_rate": 6.255530429067463e-07, + "loss": 0.68, + "step": 5460 + }, + { + "epoch": 0.7782599416660738, + "grad_norm": 22.6817440592934, + "learning_rate": 6.179495479269848e-07, + "loss": 0.5493, + "step": 5470 + }, + { + "epoch": 0.7796827203528491, + "grad_norm": 19.193324207599662, + "learning_rate": 6.103860238217316e-07, + "loss": 0.7456, + "step": 5480 + }, + { + "epoch": 0.7811054990396243, + "grad_norm": 31.92374383346149, + "learning_rate": 6.028626312246563e-07, + "loss": 0.6894, + "step": 5490 + }, + { + "epoch": 0.7825282777263997, + "grad_norm": 14.402098783610006, + "learning_rate": 5.953795299171208e-07, + "loss": 0.6572, + "step": 5500 + }, + { + "epoch": 0.783951056413175, + "grad_norm": 16.58225414656111, + "learning_rate": 5.879368788247805e-07, + "loss": 0.6419, + "step": 5510 + }, + { + "epoch": 0.7853738350999502, + "grad_norm": 15.51842210754784, + "learning_rate": 5.805348360142119e-07, + "loss": 0.6338, + "step": 5520 + }, + { + "epoch": 0.7867966137867255, + "grad_norm": 18.454262994429182, + "learning_rate": 5.731735586895562e-07, + "loss": 0.6657, + "step": 5530 + }, + { + "epoch": 0.7882193924735007, + "grad_norm": 19.16725689823867, + "learning_rate": 5.658532031891798e-07, + "loss": 0.5431, + "step": 5540 + }, + { + "epoch": 0.789642171160276, + "grad_norm": 14.314115828548886, + "learning_rate": 5.585739249823532e-07, + "loss": 0.7796, + "step": 5550 + }, + { + "epoch": 0.7910649498470513, + "grad_norm": 18.83332788207389, + "learning_rate": 5.5133587866595e-07, + "loss": 0.7011, + "step": 5560 + }, + { + "epoch": 0.7924877285338265, + "grad_norm": 6.571592737822653, + "learning_rate": 5.441392179611623e-07, + "loss": 0.6035, + "step": 5570 + }, + { + "epoch": 0.7939105072206019, + "grad_norm": 23.07747991496131, + "learning_rate": 5.369840957102399e-07, + "loss": 0.6608, + "step": 5580 + }, + { + "epoch": 0.7953332859073771, + "grad_norm": 17.394620247817357, + "learning_rate": 5.298706638732373e-07, + "loss": 0.5769, + "step": 5590 + }, + { + "epoch": 0.7967560645941524, + "grad_norm": 37.99269008869083, + "learning_rate": 5.227990735247942e-07, + "loss": 0.7183, + "step": 5600 + }, + { + "epoch": 0.7981788432809277, + "grad_norm": 14.27327658796223, + "learning_rate": 5.157694748509218e-07, + "loss": 0.6888, + "step": 5610 + }, + { + "epoch": 0.7996016219677029, + "grad_norm": 25.80673651224746, + "learning_rate": 5.087820171458149e-07, + "loss": 0.6342, + "step": 5620 + }, + { + "epoch": 0.8010244006544782, + "grad_norm": 314.11857800515276, + "learning_rate": 5.018368488086808e-07, + "loss": 0.7554, + "step": 5630 + }, + { + "epoch": 0.8024471793412534, + "grad_norm": 14.451508569333676, + "learning_rate": 4.949341173405863e-07, + "loss": 0.6255, + "step": 5640 + }, + { + "epoch": 0.8038699580280287, + "grad_norm": 27.119138215469455, + "learning_rate": 4.880739693413294e-07, + "loss": 0.7462, + "step": 5650 + }, + { + "epoch": 0.8052927367148041, + "grad_norm": 19.870097640286513, + "learning_rate": 4.812565505063207e-07, + "loss": 0.6811, + "step": 5660 + }, + { + "epoch": 0.8067155154015793, + "grad_norm": 10.389938860504465, + "learning_rate": 4.7448200562349197e-07, + "loss": 0.6477, + "step": 5670 + }, + { + "epoch": 0.8081382940883546, + "grad_norm": 13.92659812488069, + "learning_rate": 4.677504785702219e-07, + "loss": 0.6891, + "step": 5680 + }, + { + "epoch": 0.8095610727751298, + "grad_norm": 32.85051628694543, + "learning_rate": 4.610621123102768e-07, + "loss": 0.6472, + "step": 5690 + }, + { + "epoch": 0.8109838514619051, + "grad_norm": 7.863872935498879, + "learning_rate": 4.544170488907776e-07, + "loss": 0.6665, + "step": 5700 + }, + { + "epoch": 0.8124066301486804, + "grad_norm": 22.860753676803004, + "learning_rate": 4.478154294391826e-07, + "loss": 0.6891, + "step": 5710 + }, + { + "epoch": 0.8138294088354556, + "grad_norm": 11.732849180082756, + "learning_rate": 4.412573941602902e-07, + "loss": 0.6427, + "step": 5720 + }, + { + "epoch": 0.8152521875222309, + "grad_norm": 17.869770041617677, + "learning_rate": 4.3474308233325823e-07, + "loss": 0.7251, + "step": 5730 + }, + { + "epoch": 0.8166749662090061, + "grad_norm": 19.42876313677897, + "learning_rate": 4.2827263230865195e-07, + "loss": 0.6646, + "step": 5740 + }, + { + "epoch": 0.8180977448957815, + "grad_norm": 8.590975505290642, + "learning_rate": 4.218461815054994e-07, + "loss": 0.7297, + "step": 5750 + }, + { + "epoch": 0.8195205235825568, + "grad_norm": 19.349716561967885, + "learning_rate": 4.154638664083768e-07, + "loss": 0.6522, + "step": 5760 + }, + { + "epoch": 0.820943302269332, + "grad_norm": 9.483732611169318, + "learning_rate": 4.091258225645095e-07, + "loss": 0.5925, + "step": 5770 + }, + { + "epoch": 0.8223660809561073, + "grad_norm": 17.221316669496034, + "learning_rate": 4.0283218458089164e-07, + "loss": 0.6834, + "step": 5780 + }, + { + "epoch": 0.8237888596428825, + "grad_norm": 16.143634622370726, + "learning_rate": 3.965830861214301e-07, + "loss": 0.7427, + "step": 5790 + }, + { + "epoch": 0.8252116383296578, + "grad_norm": 15.675226060931111, + "learning_rate": 3.903786599041018e-07, + "loss": 0.7599, + "step": 5800 + }, + { + "epoch": 0.8266344170164331, + "grad_norm": 25.996033801532082, + "learning_rate": 3.8421903769813777e-07, + "loss": 0.7081, + "step": 5810 + }, + { + "epoch": 0.8280571957032083, + "grad_norm": 11.960706714057824, + "learning_rate": 3.781043503212259e-07, + "loss": 0.6384, + "step": 5820 + }, + { + "epoch": 0.8294799743899837, + "grad_norm": 26.473024863638063, + "learning_rate": 3.720347276367281e-07, + "loss": 0.6261, + "step": 5830 + }, + { + "epoch": 0.8309027530767589, + "grad_norm": 33.97170669671055, + "learning_rate": 3.6601029855092754e-07, + "loss": 0.6457, + "step": 5840 + }, + { + "epoch": 0.8323255317635342, + "grad_norm": 13.60176717003026, + "learning_rate": 3.600311910102877e-07, + "loss": 0.6363, + "step": 5850 + }, + { + "epoch": 0.8337483104503095, + "grad_norm": 15.646223876804923, + "learning_rate": 3.5409753199873524e-07, + "loss": 0.6772, + "step": 5860 + }, + { + "epoch": 0.8351710891370847, + "grad_norm": 14.887721160003261, + "learning_rate": 3.4820944753496316e-07, + "loss": 0.6259, + "step": 5870 + }, + { + "epoch": 0.83659386782386, + "grad_norm": 8.180457549268645, + "learning_rate": 3.4236706266975673e-07, + "loss": 0.7065, + "step": 5880 + }, + { + "epoch": 0.8380166465106352, + "grad_norm": 20.131585730116342, + "learning_rate": 3.3657050148333507e-07, + "loss": 0.7261, + "step": 5890 + }, + { + "epoch": 0.8394394251974105, + "grad_norm": 15.01194640883317, + "learning_rate": 3.308198870827159e-07, + "loss": 0.7131, + "step": 5900 + }, + { + "epoch": 0.8408622038841859, + "grad_norm": 25.50169407499349, + "learning_rate": 3.2511534159910347e-07, + "loss": 0.7371, + "step": 5910 + }, + { + "epoch": 0.8422849825709611, + "grad_norm": 8.932934403329037, + "learning_rate": 3.1945698618529195e-07, + "loss": 0.604, + "step": 5920 + }, + { + "epoch": 0.8437077612577364, + "grad_norm": 21.804238920752727, + "learning_rate": 3.138449410130931e-07, + "loss": 0.661, + "step": 5930 + }, + { + "epoch": 0.8451305399445116, + "grad_norm": 31.003829882682634, + "learning_rate": 3.082793252707861e-07, + "loss": 0.6599, + "step": 5940 + }, + { + "epoch": 0.8465533186312869, + "grad_norm": 13.798244734908524, + "learning_rate": 3.027602571605845e-07, + "loss": 0.7096, + "step": 5950 + }, + { + "epoch": 0.8479760973180622, + "grad_norm": 21.048536525617802, + "learning_rate": 2.972878538961241e-07, + "loss": 0.6568, + "step": 5960 + }, + { + "epoch": 0.8493988760048374, + "grad_norm": 35.14605252696549, + "learning_rate": 2.9186223169997835e-07, + "loss": 0.6926, + "step": 5970 + }, + { + "epoch": 0.8508216546916127, + "grad_norm": 6.287182276066726, + "learning_rate": 2.8648350580118445e-07, + "loss": 0.677, + "step": 5980 + }, + { + "epoch": 0.852244433378388, + "grad_norm": 21.352684241357913, + "learning_rate": 2.811517904328012e-07, + "loss": 0.6748, + "step": 5990 + }, + { + "epoch": 0.8536672120651633, + "grad_norm": 8.739882575649565, + "learning_rate": 2.758671988294784e-07, + "loss": 0.6458, + "step": 6000 + }, + { + "epoch": 0.8550899907519386, + "grad_norm": 49.48961762778536, + "learning_rate": 2.7062984322505564e-07, + "loss": 0.5956, + "step": 6010 + }, + { + "epoch": 0.8565127694387138, + "grad_norm": 9.765359265138134, + "learning_rate": 2.654398348501777e-07, + "loss": 0.6783, + "step": 6020 + }, + { + "epoch": 0.8579355481254891, + "grad_norm": 10.353183038698942, + "learning_rate": 2.6029728392993e-07, + "loss": 0.7489, + "step": 6030 + }, + { + "epoch": 0.8593583268122643, + "grad_norm": 9.492744746937786, + "learning_rate": 2.552022996814998e-07, + "loss": 0.6283, + "step": 6040 + }, + { + "epoch": 0.8607811054990396, + "grad_norm": 15.573884263724349, + "learning_rate": 2.5015499031185746e-07, + "loss": 0.7418, + "step": 6050 + }, + { + "epoch": 0.862203884185815, + "grad_norm": 55.47118233050117, + "learning_rate": 2.451554630154565e-07, + "loss": 0.6575, + "step": 6060 + }, + { + "epoch": 0.8636266628725902, + "grad_norm": 11.171500931865225, + "learning_rate": 2.4020382397195675e-07, + "loss": 0.6171, + "step": 6070 + }, + { + "epoch": 0.8650494415593655, + "grad_norm": 10.465530747854013, + "learning_rate": 2.353001783439715e-07, + "loss": 0.7969, + "step": 6080 + }, + { + "epoch": 0.8664722202461407, + "grad_norm": 17.91155449474328, + "learning_rate": 2.30444630274832e-07, + "loss": 0.6153, + "step": 6090 + }, + { + "epoch": 0.867894998932916, + "grad_norm": 14.885236298278624, + "learning_rate": 2.2563728288637648e-07, + "loss": 0.7035, + "step": 6100 + }, + { + "epoch": 0.8693177776196913, + "grad_norm": 18.276684017399496, + "learning_rate": 2.208782382767599e-07, + "loss": 0.6747, + "step": 6110 + }, + { + "epoch": 0.8707405563064665, + "grad_norm": 10.293023049649177, + "learning_rate": 2.1616759751828697e-07, + "loss": 0.7219, + "step": 6120 + }, + { + "epoch": 0.8721633349932418, + "grad_norm": 14.965682612306166, + "learning_rate": 2.1150546065526245e-07, + "loss": 0.7804, + "step": 6130 + }, + { + "epoch": 0.873586113680017, + "grad_norm": 29.038048950021082, + "learning_rate": 2.0689192670186957e-07, + "loss": 0.6408, + "step": 6140 + }, + { + "epoch": 0.8750088923667924, + "grad_norm": 15.060073917261208, + "learning_rate": 2.0232709364006482e-07, + "loss": 0.5891, + "step": 6150 + }, + { + "epoch": 0.8764316710535676, + "grad_norm": 21.41557776637373, + "learning_rate": 1.978110584174994e-07, + "loss": 0.6116, + "step": 6160 + }, + { + "epoch": 0.8778544497403429, + "grad_norm": 7.518014180571353, + "learning_rate": 1.933439169454579e-07, + "loss": 0.5941, + "step": 6170 + }, + { + "epoch": 0.8792772284271182, + "grad_norm": 11.884159235216817, + "learning_rate": 1.8892576409682256e-07, + "loss": 0.7353, + "step": 6180 + }, + { + "epoch": 0.8807000071138934, + "grad_norm": 21.83174021803315, + "learning_rate": 1.8455669370405882e-07, + "loss": 0.6475, + "step": 6190 + }, + { + "epoch": 0.8821227858006687, + "grad_norm": 14.057248171008613, + "learning_rate": 1.8023679855722093e-07, + "loss": 0.7441, + "step": 6200 + }, + { + "epoch": 0.8835455644874439, + "grad_norm": 21.79237041631171, + "learning_rate": 1.7596617040198327e-07, + "loss": 0.7151, + "step": 6210 + }, + { + "epoch": 0.8849683431742192, + "grad_norm": 10.609700883252497, + "learning_rate": 1.7174489993768917e-07, + "loss": 0.6349, + "step": 6220 + }, + { + "epoch": 0.8863911218609946, + "grad_norm": 13.311107416979928, + "learning_rate": 1.6757307681542821e-07, + "loss": 0.6425, + "step": 6230 + }, + { + "epoch": 0.8878139005477698, + "grad_norm": 34.676737788107566, + "learning_rate": 1.6345078963612897e-07, + "loss": 0.6191, + "step": 6240 + }, + { + "epoch": 0.8892366792345451, + "grad_norm": 10.877524165504067, + "learning_rate": 1.59378125948679e-07, + "loss": 0.6359, + "step": 6250 + }, + { + "epoch": 0.8906594579213203, + "grad_norm": 169.81539374163754, + "learning_rate": 1.553551722480659e-07, + "loss": 0.6926, + "step": 6260 + }, + { + "epoch": 0.8920822366080956, + "grad_norm": 33.283503418457634, + "learning_rate": 1.5138201397353834e-07, + "loss": 0.7244, + "step": 6270 + }, + { + "epoch": 0.8935050152948709, + "grad_norm": 16.66512043307651, + "learning_rate": 1.4745873550679274e-07, + "loss": 0.7099, + "step": 6280 + }, + { + "epoch": 0.8949277939816461, + "grad_norm": 39.05716141566033, + "learning_rate": 1.4358542017018222e-07, + "loss": 0.6016, + "step": 6290 + }, + { + "epoch": 0.8963505726684214, + "grad_norm": 11.399154977290856, + "learning_rate": 1.397621502249452e-07, + "loss": 0.7192, + "step": 6300 + }, + { + "epoch": 0.8977733513551966, + "grad_norm": 15.074253686543178, + "learning_rate": 1.3598900686945916e-07, + "loss": 0.6877, + "step": 6310 + }, + { + "epoch": 0.899196130041972, + "grad_norm": 42.56082012662866, + "learning_rate": 1.3226607023751642e-07, + "loss": 0.5992, + "step": 6320 + }, + { + "epoch": 0.9006189087287473, + "grad_norm": 26.049955980737074, + "learning_rate": 1.2859341939662178e-07, + "loss": 0.7737, + "step": 6330 + }, + { + "epoch": 0.9020416874155225, + "grad_norm": 30.559789796724967, + "learning_rate": 1.2497113234631238e-07, + "loss": 0.6726, + "step": 6340 + }, + { + "epoch": 0.9034644661022978, + "grad_norm": 11.508415485790165, + "learning_rate": 1.2139928601650393e-07, + "loss": 0.5407, + "step": 6350 + }, + { + "epoch": 0.904887244789073, + "grad_norm": 22.16281781432751, + "learning_rate": 1.1787795626585441e-07, + "loss": 0.6747, + "step": 6360 + }, + { + "epoch": 0.9063100234758483, + "grad_norm": 8.512128854917334, + "learning_rate": 1.1440721788015308e-07, + "loss": 0.5865, + "step": 6370 + }, + { + "epoch": 0.9077328021626236, + "grad_norm": 9.064303945494332, + "learning_rate": 1.1098714457073417e-07, + "loss": 0.7327, + "step": 6380 + }, + { + "epoch": 0.9091555808493988, + "grad_norm": 18.62534412436195, + "learning_rate": 1.0761780897290852e-07, + "loss": 0.683, + "step": 6390 + }, + { + "epoch": 0.9105783595361742, + "grad_norm": 15.04597150265621, + "learning_rate": 1.0429928264442362e-07, + "loss": 0.6818, + "step": 6400 + }, + { + "epoch": 0.9120011382229494, + "grad_norm": 17.9224272604521, + "learning_rate": 1.0103163606394239e-07, + "loss": 0.6399, + "step": 6410 + }, + { + "epoch": 0.9134239169097247, + "grad_norm": 15.200205393349849, + "learning_rate": 9.781493862954655e-08, + "loss": 0.6882, + "step": 6420 + }, + { + "epoch": 0.9148466955965, + "grad_norm": 61.61576418334018, + "learning_rate": 9.464925865726366e-08, + "loss": 0.7736, + "step": 6430 + }, + { + "epoch": 0.9162694742832752, + "grad_norm": 23.93288153019585, + "learning_rate": 9.153466337961436e-08, + "loss": 0.7299, + "step": 6440 + }, + { + "epoch": 0.9176922529700505, + "grad_norm": 16.01184842044598, + "learning_rate": 8.847121894418609e-08, + "loss": 0.6777, + "step": 6450 + }, + { + "epoch": 0.9191150316568257, + "grad_norm": 11.579637767092105, + "learning_rate": 8.545899041222827e-08, + "loss": 0.6941, + "step": 6460 + }, + { + "epoch": 0.920537810343601, + "grad_norm": 17.965375169975463, + "learning_rate": 8.249804175726934e-08, + "loss": 0.6288, + "step": 6470 + }, + { + "epoch": 0.9219605890303764, + "grad_norm": 15.134319011043464, + "learning_rate": 7.958843586375886e-08, + "loss": 0.6137, + "step": 6480 + }, + { + "epoch": 0.9233833677171516, + "grad_norm": 17.336267124248604, + "learning_rate": 7.673023452573286e-08, + "loss": 0.6739, + "step": 6490 + }, + { + "epoch": 0.9248061464039269, + "grad_norm": 17.494109579354692, + "learning_rate": 7.392349844549923e-08, + "loss": 0.7235, + "step": 6500 + }, + { + "epoch": 0.9262289250907021, + "grad_norm": 25.849714031596356, + "learning_rate": 7.116828723234991e-08, + "loss": 0.7266, + "step": 6510 + }, + { + "epoch": 0.9276517037774774, + "grad_norm": 11.328018184834953, + "learning_rate": 6.846465940129609e-08, + "loss": 0.5282, + "step": 6520 + }, + { + "epoch": 0.9290744824642527, + "grad_norm": 8.210571934863204, + "learning_rate": 6.581267237182304e-08, + "loss": 0.7225, + "step": 6530 + }, + { + "epoch": 0.9304972611510279, + "grad_norm": 26.615240326619148, + "learning_rate": 6.321238246667166e-08, + "loss": 0.6524, + "step": 6540 + }, + { + "epoch": 0.9319200398378032, + "grad_norm": 10.506543994481495, + "learning_rate": 6.066384491064337e-08, + "loss": 0.6497, + "step": 6550 + }, + { + "epoch": 0.9333428185245785, + "grad_norm": 32.578044982132084, + "learning_rate": 5.8167113829425416e-08, + "loss": 0.7905, + "step": 6560 + }, + { + "epoch": 0.9347655972113538, + "grad_norm": 26.139082460876924, + "learning_rate": 5.572224224844297e-08, + "loss": 0.6009, + "step": 6570 + }, + { + "epoch": 0.9361883758981291, + "grad_norm": 25.70715498977883, + "learning_rate": 5.332928209173138e-08, + "loss": 0.7175, + "step": 6580 + }, + { + "epoch": 0.9376111545849043, + "grad_norm": 21.18820776002297, + "learning_rate": 5.0988284180835404e-08, + "loss": 0.6472, + "step": 6590 + }, + { + "epoch": 0.9390339332716796, + "grad_norm": 30.441715942004116, + "learning_rate": 4.8699298233727546e-08, + "loss": 0.7279, + "step": 6600 + }, + { + "epoch": 0.9404567119584548, + "grad_norm": 21.34748999867495, + "learning_rate": 4.646237286375449e-08, + "loss": 0.701, + "step": 6610 + }, + { + "epoch": 0.9418794906452301, + "grad_norm": 13.676207981782268, + "learning_rate": 4.427755557860236e-08, + "loss": 0.5594, + "step": 6620 + }, + { + "epoch": 0.9433022693320054, + "grad_norm": 13.877508873616051, + "learning_rate": 4.2144892779290015e-08, + "loss": 0.7165, + "step": 6630 + }, + { + "epoch": 0.9447250480187807, + "grad_norm": 34.9111327739568, + "learning_rate": 4.006442975918123e-08, + "loss": 0.7097, + "step": 6640 + }, + { + "epoch": 0.946147826705556, + "grad_norm": 8.860238134882508, + "learning_rate": 3.8036210703025486e-08, + "loss": 0.6653, + "step": 6650 + }, + { + "epoch": 0.9475706053923312, + "grad_norm": 12.97838301460571, + "learning_rate": 3.606027868601702e-08, + "loss": 0.6635, + "step": 6660 + }, + { + "epoch": 0.9489933840791065, + "grad_norm": 13.245086828871894, + "learning_rate": 3.413667567288143e-08, + "loss": 0.5832, + "step": 6670 + }, + { + "epoch": 0.9504161627658818, + "grad_norm": 14.890537012064636, + "learning_rate": 3.2265442516983826e-08, + "loss": 0.6808, + "step": 6680 + }, + { + "epoch": 0.951838941452657, + "grad_norm": 30.709204073798546, + "learning_rate": 3.0446618959461826e-08, + "loss": 0.6955, + "step": 6690 + }, + { + "epoch": 0.9532617201394323, + "grad_norm": 9.62781816402037, + "learning_rate": 2.8680243628381177e-08, + "loss": 0.6631, + "step": 6700 + }, + { + "epoch": 0.9546844988262075, + "grad_norm": 10.056987346093363, + "learning_rate": 2.6966354037914477e-08, + "loss": 0.6459, + "step": 6710 + }, + { + "epoch": 0.9561072775129829, + "grad_norm": 9.393608339470743, + "learning_rate": 2.5304986587546544e-08, + "loss": 0.5365, + "step": 6720 + }, + { + "epoch": 0.9575300561997582, + "grad_norm": 9.350612385267924, + "learning_rate": 2.369617656129919e-08, + "loss": 0.7408, + "step": 6730 + }, + { + "epoch": 0.9589528348865334, + "grad_norm": 17.57921704990656, + "learning_rate": 2.2139958126983217e-08, + "loss": 0.6266, + "step": 6740 + }, + { + "epoch": 0.9603756135733087, + "grad_norm": 9.372225143512935, + "learning_rate": 2.0636364335472614e-08, + "loss": 0.7491, + "step": 6750 + }, + { + "epoch": 0.9617983922600839, + "grad_norm": 21.57591391004256, + "learning_rate": 1.9185427120001487e-08, + "loss": 0.6907, + "step": 6760 + }, + { + "epoch": 0.9632211709468592, + "grad_norm": 17.18911624015644, + "learning_rate": 1.7787177295487956e-08, + "loss": 0.5657, + "step": 6770 + }, + { + "epoch": 0.9646439496336345, + "grad_norm": 36.75906035966361, + "learning_rate": 1.6441644557877444e-08, + "loss": 0.7114, + "step": 6780 + }, + { + "epoch": 0.9660667283204097, + "grad_norm": 21.24728321301748, + "learning_rate": 1.514885748351319e-08, + "loss": 0.6113, + "step": 6790 + }, + { + "epoch": 0.9674895070071851, + "grad_norm": 7.101447822320762, + "learning_rate": 1.3908843528530337e-08, + "loss": 0.6532, + "step": 6800 + }, + { + "epoch": 0.9689122856939603, + "grad_norm": 18.090559454068472, + "learning_rate": 1.2721629028269733e-08, + "loss": 0.6345, + "step": 6810 + }, + { + "epoch": 0.9703350643807356, + "grad_norm": 16.802326235518294, + "learning_rate": 1.1587239196722555e-08, + "loss": 0.6792, + "step": 6820 + }, + { + "epoch": 0.9717578430675109, + "grad_norm": 10.580873804156397, + "learning_rate": 1.050569812599156e-08, + "loss": 0.728, + "step": 6830 + }, + { + "epoch": 0.9731806217542861, + "grad_norm": 23.751550135831035, + "learning_rate": 9.47702878578094e-09, + "loss": 0.6499, + "step": 6840 + }, + { + "epoch": 0.9746034004410614, + "grad_norm": 15.286902696444267, + "learning_rate": 8.501253022908662e-09, + "loss": 0.7415, + "step": 6850 + }, + { + "epoch": 0.9760261791278366, + "grad_norm": 23.750942999798756, + "learning_rate": 7.578391560841836e-09, + "loss": 0.6233, + "step": 6860 + }, + { + "epoch": 0.9774489578146119, + "grad_norm": 13.098266803006062, + "learning_rate": 6.70846399925651e-09, + "loss": 0.7737, + "step": 6870 + }, + { + "epoch": 0.9788717365013873, + "grad_norm": 11.660221339653082, + "learning_rate": 5.891488813621893e-09, + "loss": 0.777, + "step": 6880 + }, + { + "epoch": 0.9802945151881625, + "grad_norm": 44.79723668339218, + "learning_rate": 5.127483354808449e-09, + "loss": 0.6923, + "step": 6890 + }, + { + "epoch": 0.9817172938749378, + "grad_norm": 13.137774107102898, + "learning_rate": 4.416463848717911e-09, + "loss": 0.6114, + "step": 6900 + }, + { + "epoch": 0.983140072561713, + "grad_norm": 13.86061937649414, + "learning_rate": 3.758445395939669e-09, + "loss": 0.6833, + "step": 6910 + }, + { + "epoch": 0.9845628512484883, + "grad_norm": 13.909081899756915, + "learning_rate": 3.1534419714307484e-09, + "loss": 0.6655, + "step": 6920 + }, + { + "epoch": 0.9859856299352636, + "grad_norm": 9.192505102390527, + "learning_rate": 2.6014664242168806e-09, + "loss": 0.6961, + "step": 6930 + }, + { + "epoch": 0.9874084086220388, + "grad_norm": 14.296169677609232, + "learning_rate": 2.102530477121889e-09, + "loss": 0.6308, + "step": 6940 + }, + { + "epoch": 0.9888311873088141, + "grad_norm": 20.607220237529024, + "learning_rate": 1.6566447265176089e-09, + "loss": 0.8115, + "step": 6950 + }, + { + "epoch": 0.9902539659955893, + "grad_norm": 18.63472394489305, + "learning_rate": 1.2638186420985132e-09, + "loss": 0.6796, + "step": 6960 + }, + { + "epoch": 0.9916767446823647, + "grad_norm": 21.10612820302392, + "learning_rate": 9.240605666818725e-10, + "loss": 0.6622, + "step": 6970 + }, + { + "epoch": 0.99309952336914, + "grad_norm": 31.484150350563915, + "learning_rate": 6.373777160287309e-10, + "loss": 0.6983, + "step": 6980 + }, + { + "epoch": 0.9945223020559152, + "grad_norm": 20.57209304211363, + "learning_rate": 4.0377617869263863e-10, + "loss": 0.6418, + "step": 6990 + }, + { + "epoch": 0.9959450807426905, + "grad_norm": 129.38234765145998, + "learning_rate": 2.232609158889232e-10, + "loss": 0.7882, + "step": 7000 + }, + { + "epoch": 0.9973678594294657, + "grad_norm": 11.228945900702065, + "learning_rate": 9.583576138977313e-11, + "loss": 0.6529, + "step": 7010 + }, + { + "epoch": 0.998790638116241, + "grad_norm": 31.57256099590383, + "learning_rate": 2.150342144319195e-11, + "loss": 0.5964, + "step": 7020 + }, + { + "epoch": 0.9999288610656613, + "step": 7028, + "total_flos": 1.4353626173538304e+16, + "train_loss": 1.0448219544112716, + "train_runtime": 120927.7644, + "train_samples_per_second": 3.72, + "train_steps_per_second": 0.058 + } + ], + "logging_steps": 10, + "max_steps": 7028, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4353626173538304e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}